In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

In [41]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 

In [45]:
# Read the pre-cleaned tweet table, then discard every row that has no value
# in the 'clean' column so downstream text processing never sees a missing tweet.
data = (
    pd.read_csv('cleaned_tweets.csv')
    .dropna(subset=['clean'])
)

In [46]:
from sklearn.model_selection import train_test_split
# Features are the cleaned tweet text; labels are the 'target' column.
X, y = data['clean'], data['target']

# Stratify on the labels so both partitions keep the same class balance, and
# fix the seed so the partition is repeatable. No test_size is passed, so
# scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=112221,
)

In [53]:
from gensim.models import Word2Vec
from nltk import word_tokenize

# Tokenise each training tweet: every row becomes the list of tokens that
# nltk's word_tokenize produces for that tweet's text.
X_train_token = X_train.apply(word_tokenize)

In [54]:
# Fit Word2Vec embeddings on the tokenised training tweets.
# Passing the corpus straight to Word2Vec(...) would already build the
# vocabulary and train during construction, so a following train() call would
# run the epochs a second time. The model is therefore created empty, the
# vocabulary is built explicitly, and training happens exactly once.
word2vec_model = Word2Vec(window=5, min_count=1, workers=4)
word2vec_model.build_vocab(X_train_token)

# Last expression of the cell: train() returns a pair of word counts, which the
# notebook displays as the cell output.
word2vec_model.train(
    X_train_token,
    total_examples=word2vec_model.corpus_count,
    epochs=word2vec_model.epochs,
)

(213832, 247340)

In [56]:
# w2v = Word2Vec(X_train_token, size=150, window=10, min_count=10, workers=10)            

# w2v.train(X_train_token, total_examples=len(X_train_token), epochs=10)

(278615, 494680)

In [60]:
import warnings
# NOTE(review): this silences every warning for the rest of the session; it is
# kept so later cells behave as before, but consider narrowing it.
warnings.filterwarnings('ignore')

# Read vectors through the KeyedVectors object (`.wv`) rather than indexing the
# model itself: model[word] is deprecated and no longer works in gensim 4.
keyed_vectors = word2vec_model.wv

# The vocabulary mapping is called `key_to_index` in gensim >= 4 and `vocab`
# in gensim 3.x; use whichever this installation provides.
if hasattr(keyed_vectors, 'key_to_index'):
    vocabulary = keyed_vectors.key_to_index
else:
    vocabulary = keyed_vectors.vocab

# labels[i] is a vocabulary word and tokens[i] is its embedding vector; the two
# lists stay aligned because tokens is derived from labels.
labels = list(vocabulary)
tokens = [keyed_vectors[word] for word in labels]

In [62]:
from sklearn.manifold import TSNE
# Project the word vectors down to two dimensions with t-SNE for plotting.
# random_state pins the stochastic parts of the embedding so re-running the
# notebook reproduces the same layout (same seed as the train/test split).
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before upgrading.
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=112221)
new_values = tsne_model.fit_transform(tokens)

# Split the two-column result into the coordinate lists the plotting cell reads.
# NOTE(review): `y` here replaces the `y` target labels assigned in the
# train/test split cell; re-run that cell before using `y` as labels again.
x = new_values[:, 0].tolist()
y = new_values[:, 1].tolist()

In [64]:
# Enable plotly's notebook rendering before any figure is built.
init_notebook_mode(connected=True)

# Shared axis styling: auto-ranged, with grid, zero line, axis line, ticks and
# tick labels all hidden, since t-SNE coordinates carry no meaningful units.
axis_dict = dict(
    autorange=True,
    showgrid=False,
    zeroline=False,
    showline=False,
    ticks='',
    showticklabels=False,
)

# One small marker per word, with the word itself drawn above the point.
marker_style = dict(size=4, color='#2abdbd')
label_font = dict(family='calibri', size=12, color='#4f6978')
plot_data = go.Scatter(
    x=x,
    y=y,
    mode='markers+text',
    marker=marker_style,
    text=labels,
    textposition='top center',
    textfont=label_font,
)

# Assemble the figure with the same styling on both axes and render it inline.
layout = go.Layout(xaxis=axis_dict, yaxis=axis_dict)
fig = go.Figure(data=[plot_data], layout=layout)
iplot(fig)