In [1]:
%load_ext autoreload
%autoreload 2

import ast
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.utils import shuffle
from tqdm.notebook import tqdm

from src.visualization.emoji_plotting import plot_emoji_barplot, emoji_scatter, offset_image
from src.analysis.variation import calculate_vocabulary_variation, embedded_CIs, read_embeddings
from settings import AMBIGUITY_PATH, AMBIGUITY_VARIATION, EMBEDDINGS_PATH

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Plot defaults: 15x7 figures, enlarged fonts, white style with the axes grid off.
figure_defaults = {"figure.figsize": (15, 7)}
sns.set(rc=figure_defaults, font_scale=1.7)
sns.set_style("white", {"axes.grid": False})

In [2]:
# Load the per-emoji variation table and drop its "mode_embedding" column.
variations = pd.read_csv(AMBIGUITY_VARIATION).drop(columns="mode_embedding")

# read_csv returns the "CIs" column as text (e.g. "[0.358, 0.627]"), so each cell
# is parsed back into a Python list. ast.literal_eval only accepts Python
# literals, whereas eval would execute any expression stored in the file.
# NOTE(review): literal_eval rejects bare tokens such as nan -- confirm that no
# CIs cell contains one.
variations["CIs"] = variations["CIs"].apply(ast.literal_eval)

In [3]:
# Groups of emojis that are removed from the variation table.
flags = ['🏳️', "🏳️‍🌈", "🏴‍☠️", "🏴󠁧󠁢󠁥󠁮󠁧󠁿", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "🏴󠁧󠁢󠁷󠁬󠁳󠁿"]
families = [
    "👨‍👦","👨‍👦‍👦","👨‍👧","👨‍👧‍👦","👨‍👧‍👧","👨‍👨‍👦","👨‍👨‍👦‍👦","👨‍👨‍👧","👨‍👨‍👧‍👦",
    "👨‍👨‍👧‍👧","👨‍👩‍👦","👨‍👩‍👦‍👦","👨‍👩‍👧","👨‍👩‍👧‍👦","👨‍👩‍👧‍👧","👩‍👦","👩‍👦‍👦","👩‍👧",
    "👩‍👧‍👦","👩‍👧‍👧","👩‍👩‍👦","👩‍👩‍👦‍👦","👩‍👩‍👧","👩‍👩‍👧‍👦","👩‍👩‍👧‍👧", "👨‍👩‍👧", "👨‍👩‍👧‍👦", "👨‍👩‍👧‍👧", "👩‍👩‍👧‍👦",
]
other_excluded = ["🅰️", "🅱️", "🅾️", "🅿️", "🔟"]

# A row is dropped when its emoji appears in any of the groups, so a single
# membership test over the concatenated lists replaces three successive filters.
excluded_emojis = flags + other_excluded + families
variations = variations[~variations["emoji"].isin(excluded_emojis)]

In [4]:
# Preview the first five rows of the filtered table.
variations.head(n=5)

Unnamed: 0,emoji,vocabulary,variation,CIs
0,#️⃣,"Counter({'pound': 10, 'number': 9, 'hashtag': ...",0.496355,"[0.35837585716197884, 0.6267261994754275]"
1,*️⃣,"Counter({'asterisk': 11, 'star': 9, 'pound': 2...",0.584016,"[0.41200086324362406, 0.7542600946038873]"
2,©️,"Counter({'copyright': 17, 'copywrite': 3, 'c':...",0.276658,"[0.1401009645106064, 0.4201737316739228]"
3,®️,"Counter({'r': 7, 'registered': 3, 'copyright':...",0.576165,"[0.4139707032711831, 0.7217792813700658]"
4,‼️,"Counter({'exclamation': 11, 'important': 3, 's...",0.494922,"[0.3517648566963858, 0.6258342967978838]"


In [5]:
# Show full cell contents so the long vocabulary strings are not truncated.
pd.set_option("display.max_colwidth", None)

# Emojis whose vocabularies are displayed; only those present in the table show up.
example_emojis = ['🛃', '❤️', '😂', '➿', '🈺', '👩‍👩‍👦', '👷', '💯', '😻']
is_example = variations["emoji"].isin(example_emojis)
df = variations.loc[is_example, ["emoji", "vocabulary"]]
df

Unnamed: 0,emoji,vocabulary
149,❤️,"Counter({'love': 16, 'heart': 10})"
155,➿,"Counter({'loops': 4, 'sign': 3, 'glasses': 2, 'astrology': 2, 'symbol': 1, 'libra': 1, 'wire': 1, 'curl': 1, 'curlicue': 1, 'coils': 1, 'loopy': 1, 'pig nose': 1, 'swirl': 1, 'forever': 1, 'curls': 1, 'zodiac': 1, 'wave': 1, 'cord': 1, 'spiral': 1, 'swirly': 1, 'loop': 1, 'twirling': 1, 'repeat': 1})"
198,🈺,"Counter({'symbol': 7, 'robot': 4, 'chinese': 4, 'connection': 1, 'logo': 1, 'google voice': 1, 'phone': 1, 'character': 1, 'computer': 1, 'television': 1, 'g': 1, 'korea': 1, 'unknown': 1, 'japanese': 1, 'dunno': 1, 'mahjong': 1, 'not sure': 1, 'to': 1})"
642,👷,"Counter({'construction': 11, 'worker': 11, 'plumber': 1, 'engineer': 1, 'man': 1, 'fire service': 1, 'work': 1})"
698,💯,"Counter({'hundred': 12, 'perfect': 4, 'agreed': 1, 'winner': 1, 'approval': 1, 'anniversary': 1, 'complete': 1, 'great': 1, 'percent': 1, 'agree': 1, 'hundred pere cent': 1, 'a': 1, 'great job': 1, 'one hundred': 1, 'sure': 1, 'numbers': 1})"
892,😂,"Counter({'laughing': 6, 'laugh': 5, 'crying': 5, 'funny': 3, 'lol': 2, 'cry': 2, 'laugher': 1, 'laughingsohardimcrying': 1, 'joy': 1, 'happy sad': 1, 'liao': 1})"
949,😻,"Counter({'love': 25, 'inlove': 2, 'happy': 1, 'attraction': 1, 'cat': 1})"
1036,🛃,"Counter({'mailman': 3, 'police': 3, 'conductor': 2, 'cop': 2, 'bus': 2, 'man': 1, 'signal': 1, 'trash man': 1, 'computer': 1, 'inspection': 1, 'police sign': 1, 'worker': 1, 'postal': 1, 'guard': 1, 'monitor': 1, 'symbol': 1, 'officer': 1, 'postman': 1, 'fare': 1, 'driver': 1, 'recycle': 1})"


In [6]:
# Index the table by emoji. set_index moves the "emoji" column into the index,
# so running this cell a second time without the guard would raise KeyError;
# the column check makes the cell safe to re-execute.
if "emoji" in variations.columns:
    variations = variations.set_index("emoji")

### Create randomized baseline

In [7]:
# Inputs for the baseline: the word embeddings and the "word" column of the
# table stored at AMBIGUITY_PATH.
word_embeddings = read_embeddings(EMBEDDINGS_PATH)
ambiguity = pd.read_csv(AMBIGUITY_PATH, encoding="utf-8")
words = ambiguity["word"]

In [8]:
# Random baseline: repeatedly draw a vocabulary of random words and record both
# the number of distinct words in it and the variation computed for it.
n_runs, sample_size, base_seed = 1000, 30, 42
baselines_variation, baselines_vocab_size = [], []
for run in tqdm(range(n_runs)):
    # Each run gets its own fixed seed, so the draws are reproducible.
    random_words = words.sample(n=sample_size, random_state=base_seed + run).tolist()
    random_voc = Counter(random_words)
    # A Counter holds one key per distinct word, so its length is the vocabulary size.
    baselines_vocab_size.append(len(random_voc))
    # Only the first element of the helper's result is kept.
    run_variation = calculate_vocabulary_variation(random_voc, word_embeddings)[0]
    baselines_variation.append(run_variation)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [9]:
# NaN-aware median of the baseline variations and a percentile interval around it.
alpha = 5  # percent left outside the interval, split evenly between the two tails
baseline_variation = np.nanmedian(baselines_variation)
lower_q, upper_q = alpha / 2, 100 - (alpha / 2)
baselines_var_CIs = [np.nanpercentile(baselines_variation, q) for q in (lower_q, upper_q)]

In [10]:
# NaN-aware median of the baseline vocabulary sizes and the matching percentile
# interval, using the same alpha as the variation interval.
baseline_vocab = np.nanmedian(baselines_vocab_size)
vocab_quantiles = (alpha / 2, 100 - (alpha / 2))
baselines_vocab_size_CIs = [np.nanpercentile(baselines_vocab_size, q) for q in vocab_quantiles]

In [11]:
# Display the baseline median variation together with its percentile interval.
(baseline_variation, baselines_var_CIs)

(0.6860900973280272, [0.5707602351453775, 0.8834659105717826])

In [12]:
# Display the baseline median vocabulary size together with its percentile interval.
(baseline_vocab, baselines_vocab_size_CIs)

(30.0, [27.0, 30.0])

In [13]:
# Number of emojis (rows) left in the table.
variations.shape[0]

1289

In [14]:
# Count the emojis whose variation is at or above the lower bound of the
# baseline variation interval.
at_or_above_lower = variations["variation"] >= baselines_var_CIs[0]
int(at_or_above_lower.sum())

55

#### Which emojis are more ambiguous than the baseline (variation more than 0.01 above the baseline median)?

In [16]:
# Variation of every emoji that lies more than 0.01 above the baseline median.
excess_over_baseline = variations["variation"] - baseline_variation
variations.loc[excess_over_baseline > 0.01, "variation"]

emoji
💁    0.713159
💌    0.700732
📵    0.702734
🛂    0.716157
🛃    0.745275
🧻    0.698550
Name: variation, dtype: float64

#### Which emojis are very close to the baseline (within 0.01 on either side, i.e. slightly lower or slightly higher)?

In [17]:
# Variation of every emoji that lies within 0.01 of the baseline median, on either side.
distance_to_baseline = (variations["variation"] - baseline_variation).abs()
variations.loc[distance_to_baseline < 0.01, "variation"]

emoji
⛎      0.692086
➿      0.692479
👩‍💼    0.689072
🧐      0.694345
Name: variation, dtype: float64