In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [35]:
# Load the saved Word2Vec model from disk and keep only its `.wv` attribute;
# every later cell works with `word_vectors`, not with the full model object.
word_vectors = (
    Word2Vec
    .load("../preprocessing_and_embeddings/word2vec4.model")
    .wv
)

In [36]:
# Fit a two-cluster KMeans on every row of the embedding matrix.
# The seed is an explicit integer: a boolean in this slot would only work
# because `bool` is a subclass of `int` (True == 1), which hides the fact that
# a seed is being set at all.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors)

In [37]:
# Show the ten vocabulary entries most similar to the first cluster centre,
# as a quick sanity check of what that cluster represents.
# NOTE(review): the recorded output for this cell lists only single characters
# (digits, '_', 'x'); this may mean the vocabulary is character-level rather
# than word-level -- confirm against the training notebook.
word_vectors.similar_by_vector(
    model.cluster_centers_[0],
    topn=10,
    restrict_vocab=None,
)

[('1', 0.9726927876472473),
 ('_', 0.9710683226585388),
 ('5', 0.9659795761108398),
 ('6', 0.9639382362365723),
 ('0', 0.9605770111083984),
 ('8', 0.9574055671691895),
 ('2', 0.9568417072296143),
 ('7', 0.9274511337280273),
 ('9', 0.8446725606918335),
 ('x', 0.7778298258781433)]

In [38]:
# Name the two fitted centroids. Which cluster counts as "positive" is fixed
# here purely by index (0 -> positive, 1 -> negative).
# NOTE(review): confirm this labelling against the nearest-neighbour output of
# each centroid before relying on the sign of the scores.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

In [39]:
# Build a table with one row per vocabulary entry: the token ('words'), its
# embedding ('vectors') and the KMeans cluster that embedding is assigned to
# ('cluster').
vocab_tokens = list(word_vectors.vocab.keys())

# `word_vectors` is already the `.wv` object taken from the loaded model, so it
# is indexed directly; a further `.wv` hop is unnecessary (and presumably the
# source of the warning this cell used to print -- TODO confirm).
vocab_matrix = np.vstack([word_vectors[token] for token in vocab_tokens])

words = pd.DataFrame({'words': vocab_tokens})
words['vectors'] = list(vocab_matrix)  # one 1-D embedding array per row

# Assign clusters for all embeddings in a single batched call instead of
# calling `predict` once per row and unwrapping each one-element result.
words['cluster'] = model.predict(vocab_matrix)

  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
# Derive a signed score per token from its cluster and its distance to the
# nearest centroid.

# Sign: cluster 0 maps to +1, any other cluster to -1.
words['cluster_value'] = np.where(words.cluster == 0, 1, -1)

# `model.transform` returns the distance from each row to every centroid; the
# row-wise minimum is the distance to the nearest one. All rows are passed in
# one call rather than one `transform` call per DataFrame row.
embedding_matrix = np.vstack(words.vectors.tolist())
nearest_distance = model.transform(embedding_matrix).min(axis=1)

# Closeness is the reciprocal of that distance, so tokens nearer to a centroid
# get a larger magnitude. The cast keeps the column float64 regardless of the
# dtype `transform` returns. A distance of exactly 0 yields `inf`.
words['closeness_score'] = 1.0 / nearest_distance.astype(np.float64)

words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [41]:
# Preview the first ten rows to eyeball the derived columns.
words.head(n=10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,n,"[-0.09285517, 0.07088268, 0.03835271, 0.082714...",1,-1,1.805886,-1.805886
1,o,"[0.00068575446, 0.005786211, 0.010378235, 0.05...",1,-1,2.216748,-2.216748
2,t,"[-0.050399292, 0.06851965, -0.025531558, 0.069...",1,-1,1.988305,-1.988305
3,,"[0.028829122, -0.012237509, 0.042112265, 0.017...",1,-1,1.779262,-1.779262
4,m,"[0.013581241, 0.009512896, 0.026526261, 0.0737...",1,-1,1.657712,-1.657712
5,u,"[-0.120608084, 0.040273793, 0.008044605, 0.092...",1,-1,1.623003,-1.623003
6,c,"[-0.11271165, 0.15995622, 0.012432765, 0.13677...",1,-1,1.966052,-1.966052
7,h,"[0.04816323, 0.053908147, 0.023999557, 0.02569...",1,-1,1.958657,-1.958657
8,w,"[0.07316844, -0.06293852, -0.016908227, -0.064...",1,-1,1.740677,-1.740677
9,r,"[-0.016332058, 0.036741607, 0.04591001, 0.0658...",1,-1,1.962069,-1.962069


In [42]:
# Persist the token -> score mapping; only these two columns are written and
# the DataFrame index is left out of the file.
words.to_csv(
    'sentiment_dictionary4.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)