In [1]:
import numpy as np
from gensim.models.word2vec import Word2Vec

In [2]:
model_location = './model/'
word2vec = Word2Vec.load(model_location + 'tweet_word2vec.model')

In [3]:
test_words = ['boy', 'girl', 'crack', 'fuck', 'piss', 'finna' ]
for w in test_words:
    print(w,word2vec.wv.similar_by_word(w))
    print()


boy [('girl', 0.5874745845794678), ('kid', 0.49333545565605164), ('boyfriend', 0.4181321859359741), ('baby', 0.4168396294116974), ('broth', 0.4145268201828003), ('nephew', 0.40194082260131836), ('guy', 0.39981508255004883), ('son', 0.3760760426521301), ('cousin', 0.35988542437553406), ('daddy', 0.3350118398666382)]

girl [('boy', 0.5874745845794678), ('lady', 0.4863211512565613), ('wom', 0.43476662039756775), ('guy', 0.4310373067855835), ('kid', 0.4268190860748291), ('sis', 0.41363584995269775), ('gir', 0.39068442583084106), ('sist', 0.3805554509162903), ('baby', 0.36754703521728516), ('besty', 0.35503000020980835)]

crack [('buttcrack', 0.36184078454971313), ('wok', 0.34357941150665283), ('hook', 0.3193165361881256), ('pick', 0.3191421329975128), ('che', 0.31411662697792053), ('wak', 0.30854135751724243), ('shut', 0.2938516139984131), ('chok', 0.2927859425544739), ('fcked', 0.2917778491973877), ('fess', 0.2899385690689087)]

fuck [('fuckin', 0.5401198863983154), ('shit', 0.47455620765

In [4]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter,Scatter3d, Figure, Layout
init_notebook_mode(connected=True)

In [16]:
np.random.seed(6969)
sample_size = 10000
word_vec_dim = 512
words = np.array(list(word2vec.wv.vocab.keys()) ).flatten()
words_to_viz = np.random.choice(words,sample_size)

In [17]:
data_matrix = np.array([word2vec.wv[i].reshape(1,-1) for i in words_to_viz])

In [18]:
data_matrix = data_matrix.reshape(sample_size, word_vec_dim)

In [19]:
data_matrix.shape

(10000, 512)

In [40]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from collections import Counter

In [55]:
pca = PCA(n_components=3, svd_solver='auto', whiten=True)
X_vis = pca.fit_transform(data_matrix, )
pca_clusters = KMeans(n_clusters=20, random_state=100, init='k-means++').fit_predict(X_vis)
print("DONE")

DONE


In [60]:
trace = Scatter3d(
    x=X_vis[:,0],
    y=X_vis[:,1],
    z=X_vis[:,2],
    text=words_to_viz ,
    mode='markers',
    marker=dict(
        size=7,
        line=dict(
            color=pca_clusters,
            colorscale=  'Viridis',
            width=1
        ),
        opacity=0.8
    )
)


data = [trace]
layout = Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = Figure(data=data, layout=layout)
iplot(fig, filename='simple-3d-scatter')

In [22]:
sum(pca.explained_variance_ratio_)

0.9952563745900989

In [23]:
X_embedded = TSNE(n_components=3, random_state=6969, verbose=100, n_iter=250).fit_transform(X_vis)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.247s...
[t-SNE] Computed neighbors for 10000 samples in 97.914s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 3.756468
[t-SNE] Computed conditional probabilities in 0.474s
[t-SNE] Iteration 50: error = 169.6651611, gradient norm = 0.0465217 (50 iterations in 121.341s)
[t-SNE

In [29]:
import pandas as pd

In [26]:

clusters = KMeans(n_clusters=15, random_state=100).fit_predict(X_embedded)
print("DONE")

DONE


In [30]:
df = pd.DataFrame(X_embedded, columns=['comp1', 'comp2', 'comp3'])
df.head()

Unnamed: 0,comp1,comp2,comp3
0,-0.023919,0.052284,-0.023698
1,-0.017734,0.058412,-0.039554
2,-0.019105,0.05815,-0.026698
3,-0.019636,0.059746,-0.028716
4,-0.019245,0.059997,-0.028354


In [61]:
df['clusters'] = clusters
df['words'] = words_to_viz

In [62]:
small_df = df.loc[ np.abs(df['comp1']) <5]
small_df.shape

(9947, 5)

In [63]:
trace = Scatter3d(
    #x=X_embedded[:,0],
    #y=X_embedded[:,1],
    #z=X_embedded[:,2],
    #text=words_to_viz ,
    x=small_df.comp1,
    y = small_df.comp2,
    z = small_df.comp3,
    text = small_df.words,
    mode='markers',
    marker=dict(
        size=4,
        line=dict(
            color=clusters,
            colorscale='Viridis',
            width=0.5
        ),
        opacity=0.8
    )
)


data = [trace]
layout = Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = Figure(data=data, layout=layout)
iplot(fig, filename='simple-3d-scatter')