##Carico il corpus wikitext2 (preprocessing già effettuato)

In [41]:
import pandas as pd
import os 
import sys
import nltk
sys.path.append("../")

df = pd.read_csv("../corpus/WikiText2_36718.txt", header=None)
df = df.rename(columns={0: 'token'})
df

wpt = nltk.WordPunctTokenizer()
tokenized = [wpt.tokenize(document) for document in df['token']]


##Training

In [42]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(vector_size=300, sg=1, min_count=50)
w2v_model.build_vocab(tokenized)
w2v_model.train(tokenized, total_examples=w2v_model.corpus_count, epochs=5)


(3585523, 5190265)

##Analysis

In [43]:
w2v_model.wv.most_similar(positive=["rock"])

[('roll', 0.7728258967399597),
 ('pop', 0.744388997554779),
 ('metal', 0.7031517624855042),
 ('electric', 0.671383261680603),
 ('instruments', 0.670779287815094),
 ('hop', 0.6691061854362488),
 ('blues', 0.6637970805168152),
 ('mix', 0.6623297333717346),
 ('jazz', 0.6530014276504517),
 ('hip', 0.6504666209220886)]

##TSNE

In [44]:
# embedding from first model layer

import numpy as np

keyed = w2v_model.wv
print(len(keyed))
embeddings = []
for val, key in w2v_model.wv.key_to_index.items():
    embeddings.append(keyed.get_vector(key))


3942


In [39]:
from sklearn.manifold import TSNE

# get embeddings
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_trans = tsne.fit_transform(embeddings_df)
embeddings_df_trans = pd.DataFrame(embeddings_df_trans)

# get token order
embeddings_df_trans.index = w2v_model.wv.index_to_key

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [40]:
import numpy as np
import plotly.graph_objs as go

color = np.where(is_numeric, "green", "black")
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=embeddings_df_trans[0],
        y=embeddings_df_trans[1],
        mode="text",
        text=embeddings_df_trans.index,
        textposition="middle center",
        textfont=dict(color=color),
    )
)
fig.write_html("../word2vec_visualization.html")