# Emojii embeddings

In [2]:
#!pip install emoji

In [2]:
import warnings
warnings.filterwarnings('ignore') # sorry for that, I hate this type of red
import pandas as pd
import emoji
from pathlib import Path

In [3]:
home = str(Path.home())
data = pd.read_csv(f"{home}/emoji_prediction/data/full_df_twi_tg.csv", dtype=str)

In [4]:
data_unique = data.drop_duplicates(subset=['texts', 'time', 'source'], keep='first')

In [5]:
# unique emojis unicode
emojis = list(emoji.UNICODE_EMOJI.keys())
#emojis = [str(e) for e in emojis]
unicodes = [e.encode('unicode-escape').decode("utf-8")  for e in emojis]

In [57]:
# preprocessing texts: stemming, Username|URL standartization, decoding emojis to utf
import re
from pymystem3 import Mystem
from nltk.tokenize import word_tokenize

def getCleanData(df):
    '''
    so, we get read of RT sign, 
    standardize URLs and usernames
    '''
    df['texts'] = df['texts']\
    .apply(lambda x: re.sub(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)", "@username", x))\
    .apply(lambda x: re.sub(r"([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]", 
                         "URL", x))\
    .str.replace("RT @username", "")
    return df

def tokenizeData(df):
    '''
    stemming, tokenization, 
    getting rid of unnecessary spaces
    '''
    m = Mystem()
    tokenized_texts = df['texts'].apply(lambda x: m.lemmatize(x))
    df['texts'] = df['texts'].apply(lambda x: " ".join(m.lemmatize(x)))
    df['texts'] = df['texts'].apply(lambda x: re.sub('\s+', ' ', x).strip())
    return df, tokenized_texts


In [58]:
%%time
data_clean = getCleanData(data_unique)
print(data_clean.shape)

(191764, 6)
CPU times: user 1.96 s, sys: 33.5 ms, total: 2 s
Wall time: 2.01 s


In [59]:
%%time
data_clean, tokenized_texts = tokenizeData(data_clean)

CPU times: user 34.6 s, sys: 5.89 s, total: 40.5 s
Wall time: 3min 24s


In [60]:
texts = data_clean['texts'].tolist()

In [61]:
texts[900:910]

['ахахах я сегодня гарь хотеть пересматривать весь 😁 ждать сториза в инстаграмма',
 'хорошо 😂 я хз что это но должно быть интересно )',
 'гари потер ну 😁',
 'да , я понимать ) пересматривать 😂 я давно не смотреть .. чуять , проигрывать мы 😂',
 '😂 😂 😂',
 'надо быть в скорый время сходить в барчик 😏',
 'ниховать весь шарить 😂',
 'у я с ты фотка в вк стоять , все я задавать вопрос это што твой паринь 🤣 🤣 🤣 🤣 🤣',
 '🤔 какой интересный интерпретация',
 '😉']

## Word2Vec: CBOW

In [18]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [72]:
model_CBOW = Word2Vec(texts, size=300, window=3, min_count=4, workers=4)

print(model_CBOW.corpus_total_words)
print(model_CBOW.wv.most_similar('😘', topn=10))

8335561
[('\U0001f970', 0.6670569777488708), ('😙', 0.643570601940155), ('😗', 0.6321011781692505), ('😍', 0.6316735148429871), ('🤗', 0.6129240393638611), ('😚', 0.5800038576126099), ('☻', 0.5588058233261108), ('👑', 0.5584381222724915), ('💖', 0.5571088194847107), ('💐', 0.5425295233726501)]


In [68]:
model_CBOW.save("./models/word2vec_CBOW_new.model")

In [19]:
# read existing models
model_CBOW = Word2Vec.load("./models/word2vec_CBOW_new.model")

In [75]:
similars = [{e: model_CBOW.wv.most_similar(e)} for e in emojis if e in model_CBOW.wv.vocab]

In [71]:
import json
with open('./similar_emojis_w2v_CBOW_new.json', 'w') as outfile:
    json.dump(similars, outfile)
    outfile.close()

## Word2Vec: Skip-gram

In [76]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

model_skip = Word2Vec(texts, size=300, window=5, min_count=4, workers=4, sg=1)
print(model_skip.corpus_total_words)
print(model_skip.wv.most_similar('😘'))

8335561
[('😗', 0.5187710523605347), ('😽', 0.5164458751678467), ('💘', 0.5115646123886108), ('😙', 0.5092437267303467), ('🤗', 0.5070638060569763), ('🌠', 0.49633532762527466), ('😚', 0.48668473958969116), ('💖', 0.4852866232395172), ('🙋', 0.4819706082344055), ('🐧', 0.47724226117134094)]


In [75]:
model_skip.save("./models/word2vec_SkipGram_new.model")

In [76]:
similars_skip = [{e: model_skip.wv.most_similar(e)} for e in emojis if e in model_skip.wv.vocab]

In [77]:
for e in range(0,len(similars_skip)):
    e_similar = []
    index = list(similars_skip[e].keys())[0]
    for i in similars_skip[e][index]:
        if i[0] in emojis:
            e_similar.append(i)

    similars_skip.append({index: e_similar})

In [78]:
import json
with open('./similar_emojis_w2v_SkipGram_new.json', 'w') as outfile:
    json.dump(similars_skip, outfile)
    outfile.close()

## Clustering emojis: KMeans, Affinity propagation, DBSCAN

### Spoiler: KMeans wins

In [100]:
emojis_found = [e for e in emojis if e in model_CBOW.wv.vocab]

In [101]:
X = [model_CBOW.wv[e] for e in emojis if e in model_CBOW.wv.vocab]

In [102]:
len(X)

813

In [106]:
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

for n_cluster in range(2, 19):
    kmeans = KMeans(n_clusters=n_cluster).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

For n_clusters=2, The Silhouette Coefficient is 0.4953552484512329
For n_clusters=3, The Silhouette Coefficient is 0.48306041955947876
For n_clusters=4, The Silhouette Coefficient is 0.4721744656562805
For n_clusters=5, The Silhouette Coefficient is 0.44254857301712036
For n_clusters=6, The Silhouette Coefficient is 0.44762417674064636
For n_clusters=7, The Silhouette Coefficient is 0.4482634663581848
For n_clusters=8, The Silhouette Coefficient is 0.34602969884872437
For n_clusters=9, The Silhouette Coefficient is 0.36182931065559387
For n_clusters=10, The Silhouette Coefficient is 0.4611460566520691
For n_clusters=11, The Silhouette Coefficient is 0.434974730014801
For n_clusters=12, The Silhouette Coefficient is 0.35358086228370667
For n_clusters=13, The Silhouette Coefficient is 0.3629339933395386
For n_clusters=14, The Silhouette Coefficient is 0.3535158038139343
For n_clusters=15, The Silhouette Coefficient is 0.3471609055995941
For n_clusters=16, The Silhouette Coefficient is 0.

In [107]:
from sklearn.cluster import KMeans
import numpy as np

k_means = KMeans(n_clusters=14, random_state=1)
k_means.fit(X)
k_means_labels = k_means.labels_ # array with cluster of emojis (concat to df)


k_means_labels_unique = np.unique(k_means_labels)
k_means_cluster_centers = k_means.cluster_centers_

In [108]:
emo_clusters = pd.DataFrame(k_means_labels, index=emojis_found, columns=['cluster_group']).reset_index()

In [40]:
emo_clusters.groupby('cluster_group').agg('count')

Unnamed: 0_level_0,index
cluster_group,Unnamed: 1_level_1
0,5
1,567
2,30
3,34
4,2
5,31
6,38
7,2
8,34
9,19


In [None]:
# saving clusters
emo_clusters.to_csv("./emo_clusters14.csv", index=False)

In [11]:
# read saved clusters 
emo_clusters = pd.read_csv("./emo_clusters14.csv")

In [12]:
tsne_emojis = emo_clusters[(emo_clusters.cluster_group != 1)]

In [71]:
tsne_emojis.to_csv("./tsne_emojis.csv", index=False)

In [67]:
emo_clusters[emo_clusters.cluster_group == 11]

Unnamed: 0,index,cluster_group
113,💔,11
166,😖,11
187,😿,11
188,😢,11
210,😞,11
225,😓,11
420,😭,11
514,😔,11
518,😣,11
614,😥,11


### Top emojis by frequencies in clusters

In [57]:
emo_freqs = data[['emoji', 'names']].groupby('emoji').agg('count')

emo_freqs

In [59]:
freqs_clusters = pd.merge(emo_freqs, emo_clusters, left_on='emoji', right_on='index')
top_freqs_clusters = freqs_clusters.sort_values(by='names', ascending=False).groupby('cluster_group').head(5)

In [61]:
freqs_clusters.to_csv("./freqs_clusters.csv", index=False)

In [64]:
top_freqs_clusters.to_csv("./top_freqs_clusters.csv", index=False)

In [70]:
top_freqs_clusters[top_freqs_clusters.cluster_group == 2]

Unnamed: 0,names,index,cluster_group
629,3591,😱,2
613,3484,😡,2
616,2412,😤,2
612,1811,😠,2
425,1755,👻,2


## Trying Affinity prop and Dbscan for noisy emojis (not succeeding)

### Affinity propagation

In [156]:
clust_1 = emo_clusters['index'][emo_clusters.cluster_group == 1].tolist()

clust_1_embed = {}
for e in clust_1:
    if e in model_CBOW.wv.vocab:
        clust_1_embed[e] = model_CBOW.wv[e]
        
X_clust_1 = list(clust_1_embed.values())

In [158]:
from sklearn.cluster import AffinityPropagation

aff = AffinityPropagation(damping=0.9, max_iter=200, convergence_iter=15, copy=True, 
                                    preference=None, affinity="euclidean", verbose=False)
aff_matr = aff.fit_predict(X_clust_1)
affinity_labels = aff.labels_

from sklearn.metrics import silhouette_score
mean_sil_aff = silhouette_score(X_clust_1, affinity_labels)

print("Mean silhouette score for Affinity propagation: " + str(mean_sil_aff)) # well, emm, hah

Mean silhouette score for Affinity propagation: 0.059873402


### DBSCAN

In [159]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs

db = DBSCAN(eps=0.3, min_samples=10).fit(X_clust_1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_) # father forgive me for i have sinned

Estimated number of clusters: 1
Estimated number of noise points: 442
