In [1]:
import gensim.downloader as api
import spacy
import pandas as pd
import texthero as hero
from collections import Counter
import seaborn as sns
import numpy as np
import os
import matplotlib.pyplot as plt
from p_tqdm import p_map
from tqdm.notebook import tqdm
from itertools import chain
import pickle

# Notebook-wide display configuration.
# Silence pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None
# Use a larger default font in every matplotlib figure.
plt.rcParams['font.size'] = 15

In [2]:
import sys
# Put the parent directory on the import path so the project-local modules
# imported below can be resolved (presumably the notebook lives one level
# below the repository root - verify).
sys.path.append("..")
# Alternative that was tried: pointing PYTHONPATH at an absolute checkout.
# os.environ["PYTHONPATH"] = "/home/czestoch/workspace/emoji-ambiguity/src"
from src.analysis.embedded import find_embedding
from settings import AMBIGUITY_CLUSTER, AMBIGUITY_PATH, EMOJI_CATEGORIZED

In [7]:
# api.info()['models'].keys()

In [6]:
# api.info()['models']['glove-twitter-200']

In [3]:
### Creating the Google News space
# Alternative (Twitter) embeddings: api.load('glove-twitter-200')
word_embeddings = api.load('word2vec-google-news-300')

# gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x exposed it
# as `vocab` (accessing `vocab` on gensim 4 raises AttributeError).
vocabulary = getattr(word_embeddings, "key_to_index", None)
if vocabulary is None:
    vocabulary = word_embeddings.vocab

# One row per vocabulary word: the word, its vector, and a constant "news" tag
# so the rows can be told apart after concatenation with the emoji space.
news_words = list(vocabulary)
news_space = pd.DataFrame({
    "word": news_words,
    "embedding": [word_embeddings.get_vector(word) for word in news_words],
    "space": "news",
})
news_space.head()

Unnamed: 0,word,embedding,space
0,</s>,"[0.0011291504, -0.00089645386, 0.00031852722, ...",news
1,in,"[0.0703125, 0.08691406, 0.087890625, 0.0625, 0...",news
2,for,"[-0.011779785, -0.04736328, 0.044677734, 0.063...",news
3,that,"[-0.01574707, -0.028320312, 0.083496094, 0.050...",news
4,is,"[0.0070495605, -0.07324219, 0.171875, 0.022583...",news


In [4]:
### Creating emoji space
emojis = pd.read_csv(AMBIGUITY_PATH, encoding='utf-8')
print("Choosing subset of words...")
# For each emoji, a Counter of the words paired with it in the dataset.
vocabularies = (
    emojis[["emoji", "word"]]
    .groupby("emoji")
    .word
    .apply(list)
    .apply(Counter)
    .reset_index()
    .rename(columns={"word": "vocabulary"})
    .set_index("emoji")
)
# Union of all words that occur for any emoji.
ambiguity_vocab = set(chain.from_iterable(vocabularies["vocabulary"]))
print("Computing embeddings...")
tokenizer = spacy.load("en_core_web_sm")
emoji_subspace = {"word": [], "embedding": [], "space": "emoji"}
for emoji_description in ambiguity_vocab:
    # `tokens` and `vec` intentionally stay module-level: a later cell deletes them.
    tokens = set(token.text for token in tokenizer(emoji_description))
    vec = find_embedding(tokens, word_embeddings)
    if vec is None:
        # No embedding was found for this description; leave it out.
        continue
    emoji_subspace["word"].append(emoji_description)
    emoji_subspace["embedding"].append(vec)
emoji_subspace = pd.DataFrame(emoji_subspace)
emoji_subspace.head()

Choosing subset of words...
Computing embeddings...


Unnamed: 0,word,embedding,space
0,crazy,"[0.030639648, -0.019165039, 0.03881836, 0.2226...",emoji
1,cornfield,"[0.41992188, 0.047851562, 0.053955078, 0.02868...",emoji
2,notebook,"[-0.017089844, 0.15429688, -0.07910156, -0.012...",emoji
3,buds,"[0.008972168, 0.36328125, -0.09716797, 0.05029...",emoji
4,microscope,"[-0.087890625, 0.24707031, 0.12011719, -0.1669...",emoji


In [5]:
# Drop the large intermediates (and the leaked loop variables) that are no
# longer needed once both embedding tables exist.
del word_embeddings, emojis, ambiguity_vocab, vocabularies, tokens, vec

### Look at the projection of the emoji space only

In [None]:
# Reduce every embedding with texthero's PCA and store the result per row
# (a later cell splits it into `pca_x` / `pca_y`).
emoji_subspace['pca'] = hero.pca(
    emoji_subspace['embedding'],
)

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(EMOJI_CATEGORIZED, "rb") as f:
    emojis_categorized = pickle.load(f)

# category name -> set of emojis, for constant-time membership checks.
categorized = {c: set(e) for c, e in emojis_categorized.items()}

# `emojis` was deleted earlier to free memory and `ambiguity` is not defined
# at this point, so reload the (emoji, word) pairs from the dataset itself.
emojis = pd.read_csv(AMBIGUITY_PATH, encoding='utf-8')[["emoji", "word"]]
emojis["word"] = emojis["word"].astype(str)
# Keep a single emoji per word so the left merge below cannot multiply rows.
emojis = emojis.drop_duplicates("word")

emoji_subspace["word"] = emoji_subspace["word"].astype(str)
emoji_subspace1 = pd.merge(emoji_subspace, emojis, how='left', on='word')
# Default label; rows whose emoji belongs to a known category are relabelled
# by the categorisation step that follows.
emoji_subspace1["category"] = "twitter"


def find_categories(df):
    """Collect ``(index label, category)`` pairs for the rows of ``df``.

    Each row's ``emoji`` value is tested for membership in every set of the
    module-level ``categorized`` mapping. A row whose emoji appears in several
    categories yields one pair per category; rows matching none yield nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``emoji`` column.

    Returns
    -------
    list of tuple
        ``(index label, category name)`` pairs in row order.
    """
    matches = []
    # Walk index/emoji pairs directly instead of `iterrows`, which would build
    # a full Series for every row; `total` lets tqdm show real progress.
    for idx, emoji in tqdm(zip(df.index, df["emoji"]), total=len(df)):
        for category, members in categorized.items():
            if emoji in members:
                matches.append((idx, category))
    return matches

# Categorise the rows in parallel: split the frame into one chunk per worker
# and flatten the per-chunk lists of (index label, category) pairs.
n_cores = 8
df_split = np.array_split(emoji_subspace1, n_cores)
# The original line was missing its final closing parenthesis (SyntaxError).
out = list(chain.from_iterable(p_map(find_categories, df_split, num_cpus=n_cores)))

# Guard against the no-match case, where unzipping an empty list would leave
# nothing to index into.
if out:
    indices, categories = (list(column) for column in zip(*out))
    emoji_subspace1.loc[indices, "category"] = categories

# Split the per-row PCA result into separate plotting columns.
emoji_subspace1[['pca_x', 'pca_y']] = pd.DataFrame(
    emoji_subspace1.pca.tolist(), index=emoji_subspace1.index
)

In [None]:
# Joint histogram of the emoji-only PCA projection, one hue per category,
# with the 'flags' category left out.
not_flags = emoji_subspace1["category"] != 'flags'
df = emoji_subspace1[not_flags]
sns.jointplot(
    x='pca_x',
    y='pca_y',
    hue='category',
    data=df,
    kind='hist',
    height=10,
);

### Find the projection of the whole space (news + emojis)

In [6]:
# Stack the news and emoji tables. Both carry their own 0..n index, so
# renumber the rows to keep index labels unique in the combined frame.
whole_space = pd.concat((news_space, emoji_subspace), ignore_index=True)

In [7]:
# The two source tables now live on inside `whole_space`; release them.
del news_space, emoji_subspace

In [None]:
# whole_space = whole_space[~whole_space.word.isin(['u', 'b', 'r', 'n', 'm'])]

In [None]:
# PCA over every embedding in the combined space, stored per row ...
whole_space['pca'] = hero.pca(whole_space['embedding'])
# ... and then unpacked into two plain columns for plotting.
whole_space[['pca_x', 'pca_y']] = pd.DataFrame(
    list(whole_space['pca']),
    index=whole_space.index,
)

In [None]:
# Joint histogram of the PCA projection, coloured by source space.
sns.jointplot(
    x='pca_x',
    y='pca_y',
    hue='space',
    data=whole_space,
    kind='hist',
    height=10,
);

In [None]:
# Interactive scatter of the PCA projection; hovering a point shows its word.
scatter_options = dict(
    col='pca',
    color='space',
    hover_data=["word"],
    title="",
)
hero.scatterplot(whole_space, **scatter_options)

### Look at the whole space with category division

In [None]:
from settings import EMOJI_CATEGORIZED

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(EMOJI_CATEGORIZED, "rb") as f:
    emojis_categorized = pickle.load(f)

# NOTE(review): absolute, machine-specific path; consider a settings constant.
ambiguity = pd.read_csv("/scratch/czestoch/ambiguity_dataset.csv.gz")
ambiguity = ambiguity[["emoji", "word"]]

# category name -> set of emojis, for constant-time membership checks.
categorized = {c: set(e) for c, e in emojis_categorized.items()}

# Build the word -> emoji lookup from `ambiguity` itself. The original read
# `emojis`, which was deleted in an earlier cell, and took the word column
# from a different frame by index alignment.
emojis = ambiguity.copy()
emojis["word"] = emojis["word"].astype(str)
# Keep a single emoji per word so the left merge below cannot multiply rows.
emojis = emojis.drop_duplicates("word")

whole_space["word"] = whole_space["word"].astype(str)
whole_space1 = pd.merge(whole_space, emojis, how='left', on='word')
# Default label; emoji rows with a known category are relabelled afterwards.
whole_space1["category"] = "twitter"

# Only rows from the emoji space need to be categorised.
emoji_space = whole_space1[whole_space1.space == 'emoji']

def find_categories(df):
    """Collect ``(index label, category)`` pairs for the rows of ``df``.

    Each row's ``emoji`` value is tested for membership in every set of the
    module-level ``categorized`` mapping. A row whose emoji appears in several
    categories yields one pair per category; rows matching none yield nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``emoji`` column.

    Returns
    -------
    list of tuple
        ``(index label, category name)`` pairs in row order.
    """
    matches = []
    # Walk index/emoji pairs directly instead of `iterrows`, which would build
    # a full Series for every row; `total` lets tqdm show real progress.
    for idx, emoji in tqdm(zip(df.index, df["emoji"]), total=len(df)):
        for category, members in categorized.items():
            if emoji in members:
                matches.append((idx, category))
    return matches

# Categorise the emoji rows in parallel: one chunk per worker process.
n_cores = 8
df_split = np.array_split(emoji_space, n_cores)
out = p_map(find_categories, df_split, num_cpus=n_cores)

# Flatten the per-chunk results into one list of (index label, category).
# `chain` is already imported in the notebook's first cell.
to_set = list(chain.from_iterable(out))

# Guard against the no-match case, where unzipping an empty list would leave
# nothing to index into.
if to_set:
    indices, categories = (list(column) for column in zip(*to_set))
    # `emoji_space` is a slice of `whole_space1`, so its index labels address
    # the same rows in the full frame.
    whole_space1.loc[indices, "category"] = categories

In [None]:
# Sanity check: list the categories assigned to non-emoji rows. The spaces
# built in this notebook are 'news' and 'emoji', so the original filter on
# 'twitter' selected no rows; select everything that is not an emoji row.
whole_space1[whole_space1.space != 'emoji'].category.unique()

In [None]:
# Number of rows in the combined space.
whole_space.shape[0]

In [None]:
# Preview the first five rows of the merged frame.
whole_space1.iloc[:5]

In [None]:
# Distinct category labels present after categorisation.
whole_space1["category"].unique()

In [None]:
# Category names, listed explicitly.
categories = [
    'people',
    'symbols',
    'travel-places',
    'objects',
    'activity',
    'nature',
    'food-drink',
]

In [None]:
import matplotlib.pyplot as plt

In [None]:
def hexbin(x, y, color, gridsize=15, **kwargs):
    """Draw a hexagonal-bin plot of ``x`` against ``y`` on the current axes.

    Written to be passed to ``FacetGrid.map``: the colormap is a light
    sequential palette derived from ``color``.

    Parameters
    ----------
    x, y
        Coordinates forwarded to ``plt.hexbin``.
    color
        Base colour from which the light palette colormap is built.
    gridsize : int, optional
        Forwarded to ``plt.hexbin``; defaults to 15, the value that was
        previously hard-coded.
    **kwargs
        Any further keyword arguments are forwarded to ``plt.hexbin``.
    """
    cmap = sns.light_palette(color, as_cmap=True)
    plt.hexbin(x, y, gridsize=gridsize, cmap=cmap, **kwargs)

# One hexbin panel per category for the emoji rows, skipping 'flags'.
# (Optionally wrap in `with sns.axes_style("dark"):` for a dark background.)
is_emoji = whole_space1["space"] == 'emoji'
not_flags = whole_space1["category"] != 'flags'
df = whole_space1[is_emoji & not_flags]
g = sns.FacetGrid(df, col="category", hue="category", col_wrap=3, height=4)
g.map(hexbin, "pca_x", "pca_y");

In [None]:
# Joint histogram of the emoji rows only (without 'flags'), coloured by category.
keep_rows = whole_space1["space"].eq('emoji') & whole_space1["category"].ne('flags')
df = whole_space1[keep_rows]
sns.jointplot(
    x='pca_x',
    y='pca_y',
    hue='category',
    data=df,
    kind='hist',
    height=10,
);

In [None]:
# Same joint histogram over every row, including the default-labelled ones.
sns.jointplot(
    x='pca_x',
    y='pca_y',
    hue='category',
    data=whole_space1,
    kind='hist',
    height=10,
);

### Analyze the PCA output

## t-SNE

In [None]:
import pickle
import seaborn as sns

In [None]:
# Load the precomputed t-SNE result; this replaces the PCA-based `whole_space`.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("/scratch/czestoch/tsne.pkl", "rb") as tsne_file:
    whole_space = pickle.load(tsne_file)

In [None]:
# Preview the first five rows of the loaded t-SNE table.
whole_space.iloc[:5]

In [None]:
# Number of rows in the loaded t-SNE table.
whole_space.shape[0]

In [None]:
# Joint histogram of the t-SNE projection, coloured by source space.
sns.jointplot(
    x='tsne_x',
    y='tsne_y',
    hue='space',
    data=whole_space,
    kind='hist',
    height=10,
);

In [None]:
# Keep only the columns needed for the category plots.
tsne_columns = ['word', 'space', 'tsne_x', 'tsne_y']
whole_space = whole_space[tsne_columns]

In [None]:
from settings import EMOJI_CATEGORIZED

# Read the pickled emoji categorisation from the path configured in settings.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(EMOJI_CATEGORIZED, "rb") as categorized_file:
    emojis_categorized = pickle.load(categorized_file)

In [None]:
# category name -> set of its emojis, for constant-time membership checks.
categorized = {
    category: set(members)
    for category, members in emojis_categorized.items()
}

In [None]:
import pandas as pd

In [None]:
# Load the ambiguity dataset (gzip-compressed CSV at a machine-specific path).
ambiguity_csv = "/scratch/czestoch/ambiguity_dataset.csv.gz"
ambiguity = pd.read_csv(ambiguity_csv)

In [None]:
# Only the (emoji, word) pairs are needed from here on.
pair_columns = ["emoji", "word"]
ambiguity = ambiguity[pair_columns]
ambiguity.iloc[:5]

In [None]:
from settings import EMOJI_CATEGORIZED

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(EMOJI_CATEGORIZED, "rb") as f:
    emojis_categorized = pickle.load(f)

# NOTE(review): absolute, machine-specific path; consider a settings constant.
ambiguity = pd.read_csv("/scratch/czestoch/ambiguity_dataset.csv.gz")
ambiguity = ambiguity[["emoji", "word"]]

# category name -> set of emojis, for constant-time membership checks.
categorized = {c: set(e) for c, e in emojis_categorized.items()}

# Build the word -> emoji lookup from `ambiguity` itself. The original read
# `emojis`, which is not guaranteed to exist at this point, and took the word
# column from a different frame by index alignment.
emojis = ambiguity.copy()
emojis["word"] = emojis["word"].astype(str)
# Keep a single emoji per word so the left merge below cannot multiply rows.
emojis = emojis.drop_duplicates("word")

whole_space["word"] = whole_space["word"].astype(str)
whole_space1 = pd.merge(whole_space, emojis, how='left', on='word')
# Default label; emoji rows with a known category are relabelled afterwards.
whole_space1["category"] = "twitter"

# Only rows from the emoji space need to be categorised.
emoji_space = whole_space1[whole_space1.space == 'emoji']

def find_categories(df):
    """Collect ``(index label, category)`` pairs for the rows of ``df``.

    Each row's ``emoji`` value is tested for membership in every set of the
    module-level ``categorized`` mapping. A row whose emoji appears in several
    categories yields one pair per category; rows matching none yield nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``emoji`` column.

    Returns
    -------
    list of tuple
        ``(index label, category name)`` pairs in row order.
    """
    matches = []
    # Walk index/emoji pairs directly instead of `iterrows`, which would build
    # a full Series for every row; `total` lets tqdm show real progress.
    for idx, emoji in tqdm(zip(df.index, df["emoji"]), total=len(df)):
        for category, members in categorized.items():
            if emoji in members:
                matches.append((idx, category))
    return matches

# Categorise the emoji rows in parallel: one chunk per worker process.
n_cores = 8
df_split = np.array_split(emoji_space, n_cores)
out = p_map(find_categories, df_split, num_cpus=n_cores)

# Flatten the per-chunk results into one list of (index label, category).
# `chain` is already imported in the notebook's first cell.
to_set = list(chain.from_iterable(out))

# Guard against the no-match case, where unzipping an empty list would leave
# nothing to index into.
if to_set:
    indices, categories = (list(column) for column in zip(*to_set))
    # `emoji_space` is a slice of `whole_space1`, so its index labels address
    # the same rows in the full frame.
    whole_space1.loc[indices, "category"] = categories

In [None]:
# Joint histogram of the t-SNE projection, coloured by category. The
# `category` column exists only on the merged frame `whole_space1`;
# `whole_space` was reduced to word/space/tsne_x/tsne_y above, so plotting it
# with hue='category' fails.
sns.jointplot(data=whole_space1, x='tsne_x', y='tsne_y', hue='category', kind='hist', height=10);