In [1]:
import pandas as pd
import numpy as np

import fasttext
import sklearn

In [2]:
from sklearn.preprocessing import normalize
from scipy.linalg import orthogonal_procrustes
from scipy.spatial.distance import cosine

In [3]:
data = pd.read_csv('./data/utf8-lemma/dems_sents_p23.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,author,out_sent
0,0,[deleted],"['republican', 'house', 'oversight', 'committe..."
1,1,DownToeartgh,"['cbs', 'news', 'asked', 'kevin', 'mccarthy', ..."
2,2,shallah,"['news', 'sander', 'warren', 'colleague', 'int..."
3,3,CommanderKiddie148,"['republican', 'party', 'went', 'crazy', 'pgcc..."
4,4,CQU617,"['governor', 'jacket']"


In [4]:

#write data into correct format
with open('leml.txt', 'w', encoding='utf-8') as f:
    for sentence in data['out_sent']:
        f.write(f"{sentence}\n")

In [5]:
modell = fasttext.train_unsupervised("leml.txt", model='skipgram', dim=50, ws=5, neg=20, epoch=5, minCount=5)
#i would like to make function to track error per epoch


Read 0M words
Number of words:  3412
Number of labels: 0
Progress: 100.0% words/sec/thread:   51585 lr:  0.000000 avg.loss:  3.966599 ETA:   0h 0m 0s


In [7]:
print(modell.get_word_vector('jacket'))

[ 0.06967245 -0.01120306 -0.20773761 -0.12701818  0.00392189  0.17753544
  0.12958306  0.16838895  0.00508853 -0.01979848  0.04173386  0.07022496
  0.00145878 -0.1511321  -0.10871922  0.09608122  0.13124736 -0.08932023
 -0.04405525 -0.02272656  0.07963687  0.13164955  0.11932258 -0.00270827
  0.06483239 -0.03084597  0.14532977  0.09247891 -0.01860495 -0.03120666
 -0.02829671  0.07977466 -0.02912022  0.07839311 -0.02221119  0.0219174
  0.00256984  0.01226886 -0.00843994 -0.02666415 -0.03062296  0.03859283
 -0.00418874  0.09013144 -0.03268562  0.06482032 -0.03967051 -0.08090437
 -0.01468461  0.05332207]


In [9]:
data = pd.read_csv('./data/utf8-lemma/dems_sents_p23.csv')

with open('lemr.txt', 'w', encoding='utf-8') as f:
    for sentence in data['out_sent']:
        f.write(f"{sentence}\n")

modelr = fasttext.train_unsupervised("lemr.txt", model='skipgram', dim=50, ws=5, neg=20, epoch=5, minCount=5)

Read 0M words
Number of words:  3412
Number of labels: 0
Progress: 100.0% words/sec/thread:   60699 lr:  0.000000 avg.loss:  3.940741 ETA:   0h 0m 0s


In [13]:
#find shared vocabulary
vocab = list(set(modell.get_words()) & set(modelr.get_words()))

# Extract vectors for each word in the shared vocabulary
vecd = np.array([modell.get_word_vector(word) for word in vocab])
vecr = np.array([modelr.get_word_vector(word) for word in vocab])

vecd = normalize(vecd, norm='l2')
vecr = normalize(vecr, norm='l2')

In [14]:
#we use the orthogonal procrustes method to align our initial model for r_dems with r_rep
R, _ = orthogonal_procrustes(vecd, vecr)
vecl_aligned = vecd @ R

# compute word cosine similarities
cosine_similarities = np.array([1 - cosine(vec1, vec2) for vec1, vec2 in zip(vecl_aligned, vecr)])

In [15]:
summary_stats = {
    "min": np.min(cosine_similarities),
    "1st_quartile": np.percentile(cosine_similarities, 25),
    "median": np.median(cosine_similarities),
    "mean": np.mean(cosine_similarities),
    "3rd_quartile": np.percentile(cosine_similarities, 75),
    "max": np.max(cosine_similarities)
}

print("Summary Statistics:")
for k, v in summary_stats.items():
    print(f"{k.capitalize()}: {v:.4f}")


Summary Statistics:
Min: 0.9781
1st_quartile: 0.9958
Median: 0.9977
Mean: 0.9969
3rd_quartile: 0.9988
Max: 0.9999


In [16]:
#find most disimilar words
unsim_indices = np.argsort(cosine_similarities)[:10]
unsim_words = [vocab[i] for i in unsim_indices]


for word, score in zip(unsim_words, cosine_similarities[unsim_indices]):
    print(f"Word: {word}, Cosine Similarity: {score}")

Word: 'jean',, Cosine Similarity: 0.9781414866447449
Word: 'bill',, Cosine Similarity: 0.978484034538269
Word: 'jan',, Cosine Similarity: 0.9787474274635315
Word: 'jack',, Cosine Similarity: 0.9788922071456909
Word: 'sen',, Cosine Similarity: 0.9788948893547058
Word: 'carlson'], Cosine Similarity: 0.9789441823959351
Word: 'care'], Cosine Similarity: 0.9794390201568604
Word: 'carroll',, Cosine Similarity: 0.9800364375114441
Word: 'count'], Cosine Similarity: 0.9800666570663452
Word: 'medici',, Cosine Similarity: 0.9804770946502686
