In [None]:
import gensim.downloader as api
import spacy
import pandas as pd
import texthero as hero
from collections import Counter
from settings import AMBIGUITY_PATH
from src.analysis.embedded import find_embedding
import seaborn as sns
import numpy as np
from sklearn.manifold import TSNE

# Disable pandas' chained-assignment warning for the whole notebook
# (setting the option to None turns the check off).
pd.set_option('mode.chained_assignment', None)

In [None]:
# Load the pretrained 'glove-twitter-200' word vectors through gensim's
# downloader API (`api` is gensim.downloader).
word_embeddings = api.load('glove-twitter-200')

In [None]:
# Flatten the pretrained vocabulary into a DataFrame with one row per word:
#   word      - the vocabulary entry
#   embedding - the vector returned by word_embeddings.get_vector(word)
#   space     - constant tag "twitter" (broadcast by pandas to every row)
try:
    # gensim >= 4.0 exposes the vocabulary as the `key_to_index` dict;
    # the old `.vocab` attribute raises AttributeError there.
    vocab_words = list(word_embeddings.key_to_index)
except AttributeError:
    # gensim < 4.0 only has the `.vocab` dict.
    vocab_words = list(word_embeddings.vocab)

twitter_space = pd.DataFrame({
    "word": vocab_words,
    "embedding": [word_embeddings.get_vector(word) for word in vocab_words],
    "space": "twitter",
})
twitter_space.head()

In [None]:
# Build the "emoji" sub-space: one embedding per distinct word found in the
# `word` column of the ambiguity dataset.
emojis = pd.read_csv(AMBIGUITY_PATH, encoding='utf-8')
print("Choosing subset of words...")
# One Counter of words per emoji, indexed by emoji.
vocabularies = emojis[["emoji", "word"]] \
    .groupby("emoji").word \
    .apply(list).apply(Counter) \
    .reset_index() \
    .rename({"word": "vocabulary"}, axis=1).set_index("emoji")
# Union of the words over all emojis. Non-string entries are skipped:
# read_csv parses empty cells (and tokens such as "null" / "NA") as float NaN,
# which the spaCy tokenizer below cannot process.
ambiguity_vocab = {
    st
    for row in vocabularies.vocabulary
    for st in row
    if isinstance(st, str)
}
print("Computing embeddings...")
tokenizer = spacy.load("en_core_web_sm")
emoji_subspace = {"word": [], "embedding": [], "space": "emoji"}
# Iterate in sorted order so the resulting rows (and .head()) are the same on
# every run; plain set iteration order over strings varies between processes.
for emoji_description in sorted(ambiguity_vocab):
    # make_doc() runs only the tokenizer, not the rest of the pipeline;
    # only token.text is needed here.
    tokens = {token.text for token in tokenizer.make_doc(emoji_description)}
    # NOTE(review): find_embedding is defined in src.analysis.embedded;
    # presumably it returns None when no embedding can be built - confirm.
    vec = find_embedding(tokens, word_embeddings)
    if vec is not None:
        emoji_subspace["word"].append(emoji_description)
        emoji_subspace["embedding"].append(vec)
emoji_subspace = pd.DataFrame(emoji_subspace)
emoji_subspace.head()

In [None]:
# Stack both spaces into one frame. twitter_space and emoji_subspace each have
# their own 0-based index, so without ignore_index=True the result would carry
# duplicate index labels; pandas cannot reindex on duplicate labels, which makes
# later label-aligned operations (column assignment, plotting) fragile.
whole_space = pd.concat((twitter_space, emoji_subspace), ignore_index=True)

In [None]:
# Drop the reference to the loaded vectors so their memory can be reclaimed.
# After this cell, the cells above that use `word_embeddings` cannot be
# re-run without loading the model again.
del word_embeddings

In [None]:
# Keep a single row per word. keep="last" retains the row that appears last
# in the frame, i.e. the emoji-space row when a word occurs in both spaces
# (emoji_subspace is the second frame in the concat).
whole_space = whole_space.drop_duplicates(subset="word", keep="last")

In [None]:
# Reduce the embeddings with texthero's PCA, store the result in a new
# "pca" column, and draw a scatter plot coloured by the "space" tag.
whole_space["pca"] = hero.pca(whole_space["embedding"])
hero.scatterplot(whole_space, col="pca", color="space", title="")

In [None]:
# Unpack each row's PCA coordinates into two scalar columns so they can be
# used directly as plot axes. The helper frame reuses whole_space's index so
# the assignment aligns row by row.
pca_xy = pd.DataFrame(whole_space["pca"].tolist(), index=whole_space.index)
whole_space[["pca_x", "pca_y"]] = pca_xy

In [None]:
# Joint 2-D histogram of the PCA coordinates, split by space.
# The trailing semicolon suppresses the returned object's repr.
sns.jointplot(
    data=whole_space,
    x="pca_x",
    y="pca_y",
    hue="space",
    kind="hist",
    height=10,
);

In [None]:
# Pull the embedding column out of the frame as a plain Python list
# (one entry per row).
X_ls = whole_space["embedding"].tolist()

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Assemble the per-row embeddings into one 2-D array in a single call.
# The previous loop turned every embedding into a Python list first, creating
# one boxed Python object per float before the data went back into NumPy.
# np.array accepts the sequence of vectors directly and yields the same matrix.
X_np = np.array(X_ls)

In [None]:
# The list form of the embeddings is no longer needed once X_np exists;
# drop the name so the list can be garbage-collected.
del X_ls

In [None]:
# Make sure X_np is a NumPy array before handing it to scikit-learn.
X_np = np.asarray(X_np)

In [None]:
from sklearn.decomposition import PCA

# Compress the embeddings to 50 principal components before running t-SNE.
# random_state is pinned because, depending on the scikit-learn version and
# the data shape, svd_solver='auto' may select the randomized solver, whose
# output otherwise changes from run to run.
pca = PCA(n_components=50, random_state=0)
reduced = pca.fit_transform(X_np)

In [None]:
# Sanity check: display the dimensions of the PCA-reduced matrix.
reduced.shape

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

In [None]:
# Embed the PCA-reduced vectors in 2-D with t-SNE using 4 worker threads.
# `TSNE` here is MulticoreTSNE: the import in the preceding cell shadows the
# scikit-learn TSNE imported at the top of the notebook.
tsne_coords = TSNE(n_components=2, n_jobs=4).fit_transform(reduced)
# fit_transform yields a 2-D (n_rows, 2) result, and pandas refuses to assign
# a 2-D array to a single column. Store one [x, y] list per row instead, the
# same layout the 'pca' column uses.
whole_space['tsne'] = np.asarray(tsne_coords).tolist()

In [None]:
# The PCA-reduced matrix was only an input to t-SNE; drop the name so its
# memory can be reclaimed.
del reduced

In [None]:
# Preview the combined frame, now including the columns added above.
whole_space.head()