In [1]:
import pandas as pd
import numpy as np
import emoji
from pandarallel import pandarallel
from src.visualization.emoji_plotting import plot_emoji_barplot

pandarallel.initialize(nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
path = "/scratch/czestoch/bert_emojis.csv.gz"
bert = pd.read_csv(path)
len(bert)

  interactivity=interactivity, compiler=compiler, result=result)


106446133

In [8]:
bert = bert.dropna()
len(bert)

182151

In [None]:
# bert.embedding = bert.embedding.parallel_apply(eval)

In [34]:
bert.groupby("emoji").embedding.filter(lambda x: x.count() < 30)

Series([], Name: embedding, dtype: object)

In [35]:
bert.emoji.value_counts(ascending=True)[:10]

:smiling_face_with_3_hearts:    30
:person_getting_haircut:        50
:pouting_cat_face:              52
:person_frowning:               52
:person_shrugging:              56
:slightly_frowning_face:        60
:person_getting_massage:        63
:crying_cat_face:               63
:vulcan_salute:                 64
:anguished_face:                68
Name: emoji, dtype: int64

In [19]:
def calculate_variance(group):
    matrix = np.array(group.embedding.tolist())
    return np.sum(matrix.var(1))

In [76]:
variances = bert.groupby("emoji").parallel_apply(calculate_variance).reset_index().rename({0:"variance"}, axis=1)
variances.emoji = variances.emoji.parallel_apply(emoji.emojize)

In [77]:
variances.head()

Unnamed: 0,emoji,variance
0,🆒,17.716984
1,🎄,42.04727
2,🆓,27.894447
3,🆗,25.468911
4,👌,81.439773


In [79]:
variances.sort_values("variance")[:10]

Unnamed: 0,emoji,variance
355,🥰,3.70524
290,💇,7.020612
312,😾,7.787295
350,🙁,8.144859
287,🙍,8.259674
91,😿,8.407997
17,😧,9.147752
264,👃,9.292639
295,🤷,9.304649
291,💆,9.526702


In [80]:
# from src.data.utils import save_to_csv

In [81]:
# save_to_csv(variances, "/scratch/czestoch/bert_variances.csv")

In [None]:
###################

In [3]:
path = "/scratch/czestoch/bert_variances.csv.gz"
variances = pd.read_csv(path)

In [4]:
variances.head()

Unnamed: 0,emoji,variance
0,🆒,17.716984
1,🎄,42.04727
2,🆓,27.894447
3,🆗,25.468911
4,👌,81.439773


In [5]:
from tqdm.notebook import tqdm

In [9]:
baselines = []
for i in tqdm(range(1000)):
    ls = bert.sample(n=30, random_state=42+i).embedding.tolist()
    matrix = np.array([eval(l) for l in ls])
    variance = np.sum(matrix.var(1))
    baselines.append(variance)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [10]:
alpha = 5    
baseline = np.nanmedian(np.array(baselines))
baselines_CIs = [np.nanpercentile(baselines, alpha / 2),\
                         np.nanpercentile(baselines, 100 - (alpha / 2))] 

In [12]:
import matplotlib.pyplot as plt

In [15]:
%load_ext autoreload
%autoreload 2

In [18]:
top_ten = variances.nlargest(10, columns='variance').reset_index()

fig, ax = plt.subplots()
ax.plot(range(-2, 12), 14*[baseline], '-.', alpha=0.8, color='black')
ax.fill_between(range(-2, 12), baselines_CIs[0],
                 baselines_CIs[1], alpha = 0.3, color='gray')
plot_emoji_barplot(top_ten, ax, col='variance')