In [1]:
import sys
import os
# Make the parent of the current working directory importable (presumably
# where the project-local `utilities` module lives — confirm).
# Guarded so that re-running this cell does not stack duplicate entries.
_parent_dir = os.path.dirname(os.path.realpath(""))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
from os.path import join

from gensim.models import KeyedVectors
# import fasttext as ft
import pandas as pd
import numpy as np
# import umap
# import umap.plot
import hdbscan
from sklearn.cluster import DBSCAN

import utilities

# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the seed emotion lexicon through the project helper. The ".." argument
# is passed through unchanged (presumably the directory that contains the
# lexicon data, relative to this notebook — confirm against `utilities`).
lexicon_root = ".."
lexicon = utilities.get_arabic_lexicon_data(lexicon_root)

In [4]:
# The emotion labels are the lexicon's keys; the bare name on the last line
# renders the list as the cell output.
emotions = [label for label in lexicon.keys()]
emotions

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [5]:
def _load_vectors(filename):
    """Read a word2vec-format text file from the ``embeddings`` directory."""
    return KeyedVectors.load_word2vec_format(join("embeddings", filename))


# One embedding per .vec file; the variable names mirror the file stems.
glf_embedding = _load_vectors("glf.vec")
egy_embedding = _load_vectors("egy.vec")
irq_embedding = _load_vectors("irq.vec")
nor_embedding = _load_vectors("nor.vec")
lev_embedding = _load_vectors("lev.vec")

In [6]:
def generate_emotion_lexicon(embedding, emotion, min_cluster_size, num_words_to_generate=100, verbose=False, save=False, embedding_name=None):
    """Expand one emotion's seed words into per-cluster lists of similar words.

    The seed words listed for ``emotion`` in the lexicon are looked up in
    ``embedding``; the vectors that are found are clustered with HDBSCAN, and
    for every cluster label the words nearest to that cluster's element-wise
    median vector are collected.

    Parameters
    ----------
    embedding
        Word-vector store supporting ``embedding[word]`` (raising ``KeyError``
        for unknown words) and ``similar_by_vector(vector, topn=...)``, e.g. a
        gensim ``KeyedVectors``.
    emotion : str
        Key into the lexicon returned by
        ``utilities.get_arabic_lexicon_data("..")``.
    min_cluster_size : int
        Forwarded to ``hdbscan.HDBSCAN``.
    num_words_to_generate : int, default 100
        How many nearest words to retrieve per cluster.
    verbose : bool, default False
        Print the lexicon keys, the seed-word coverage and the cluster labels,
        and draw a UMAP projection of the seed vectors when ``umap`` (with its
        plotting extras) can be imported.
    save : bool, default False
        Also write the table to
        ``emotion_lexicon/{embedding_name}_{emotion}_minclustersize={min_cluster_size}.csv``.
    embedding_name : str or None
        Used only in the CSV file name; required when ``save`` is true.

    Returns
    -------
    pandas.io.formats.style.Styler
        One column per cluster label (HDBSCAN uses ``-1`` for points it treats
        as noise), one row per neighbour rank. Cells whose word is one of the
        seed words get a dark background. The plain frame is available as
        ``.data``.

    Raises
    ------
    AssertionError
        If ``save`` is true without ``embedding_name``, or ``emotion`` is not
        a lexicon key.
    ValueError
        If none of the seed words exists in ``embedding``.
    """
    if save:
        assert embedding_name is not None, "embedding_name must not be None when trying to save"

    lexicon = utilities.get_arabic_lexicon_data("..")
    if verbose:
        print(f"Emotions in lexicon {list(lexicon.keys())}")
    assert emotion in lexicon, f"The emotion {emotion} is not in the Lexicon"

    seed_words = list(lexicon[emotion])

    # Keep only the seed words this embedding actually knows.
    seed_vectors = []
    for word in seed_words:
        try:
            seed_vectors.append(embedding[word])
        except KeyError:
            # Out-of-vocabulary seed word for this embedding: skip it.
            continue
    if verbose:
        print(f"Number of words considered {len(seed_vectors)}/{len(seed_words)}")
    if not seed_vectors:
        # Fail with a clear message instead of a NaN warning followed by an
        # obscure error from the clusterer.
        raise ValueError(
            f"None of the {len(seed_words)} lexicon words for {emotion!r} are in the embedding"
        )
    seed_matrix = np.asarray(seed_vectors)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    cluster_labels = clusterer.fit_predict(seed_matrix)
    if verbose:
        print(cluster_labels)

    cluster_to_words = {}
    for focus_cluster in np.unique(cluster_labels):
        # The median is used as the cluster's representative vector.
        cluster_median = np.median(seed_matrix[cluster_labels == focus_cluster], axis=0)
        neighbours = embedding.similar_by_vector(cluster_median, topn=num_words_to_generate)
        # Drop the similarity scores; an empty neighbour list yields an empty column.
        cluster_to_words[focus_cluster] = [word for word, _score in neighbours]

    if verbose:
        # Imported lazily: the plot is optional and the module-level umap
        # imports are commented out, so a bare `umap` reference would raise
        # NameError.
        try:
            import umap
            import umap.plot
        except ImportError as exc:
            print(f"Skipping UMAP cluster plot: {exc}")
        else:
            mapper = umap.UMAP().fit(seed_matrix)
            umap.plot.points(mapper, labels=cluster_labels)

    df = pd.DataFrame(cluster_to_words)
    if save:
        # Create the output directory on first use so to_csv cannot fail on a
        # missing folder.
        os.makedirs("emotion_lexicon", exist_ok=True)
        df.to_csv(f"emotion_lexicon/{embedding_name}_{emotion}_minclustersize={min_cluster_size}.csv")

    # Build the membership set once instead of re-scanning the seed list for
    # every cell of the table.
    seed_word_set = set(seed_words)

    def _highlight_seed(word):
        return "background-color: rgba(20, 20, 20, 1)" if word in seed_word_set else ""

    styler = df.style
    # Newer pandas renamed Styler.applymap to Styler.map; use whichever this
    # installation provides.
    apply_elementwise = styler.map if hasattr(styler, "map") else styler.applymap
    style = apply_elementwise(_highlight_seed)
    return style

## Problems

### Problem: using all of the seed words at once to generate similar words
### Solution: cluster the vector representations of the words

In [7]:
# With min_cluster_size=100 the recorded output has a single column, labelled
# -1: the found seed vectors all ended up in one group.
# The function returns a Styler, so `.data` is used to reach the plain frame.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="anger",
    min_cluster_size=100,
    num_words_to_generate=500,
)
df.data.head(20)

Unnamed: 0,-1
0,لطخ
1,خبائث
2,طغا
3,بذيء
4,وماخفى
5,ذآ
6,اظمى
7,تلذذ
8,أشمط
9,جحود


### Problem with the centroid idea: a cluster's region may be non-convex, in which case its centroid can lie outside the cluster
### Solution(?): Maybe decrease the min_cluster_size?
![Convex vs. non-convex region](https://www.researchgate.net/profile/Fotios-Katsilieris-2/publication/239926467/figure/fig4/AS:669426328301595@1536615082633/An-example-of-the-difference-between-a-convex-and-a-non-convex-region.ppm)


# Notes

### Finding the centroid with the mean seems to give worse results than finding it with the median, probably because the mean is swayed by outlier vectors.

# Glf

In [8]:
# glf embedding, "sadness" seed words, 100 neighbours per cluster.
# The return value is a Styler; the bare name renders the highlighted table.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="sadness",
    min_cluster_size=8,
    num_words_to_generate=100,
)
df

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8
0,طغا,حزن,ندم,حزين,نادم,كئيب,محزن,أسى,بؤس,بائس
1,لطخ,نحزن,اندم,احزن,نادره,ثلج,نحزن,قسى,لدغ,ذقن
2,خبائث,أحزن,ندمان,مهموم,نادى,مزدحم,أحزن,أتناسى,ذآ,باض
3,أيقظ,احزن,أخسر,حز,ادم,مثلج,محزون,ابنسى,يغث,بالى
4,يؤلمك,لاتحزن,تتعثر,عآد,نادية,يغث,مح,أبنسى,غطا,كأنما
5,ذآ,أبكى,رآضي,حزن,صادم,كرهوني,احزن,يقسى,خبائث,أيقظ
6,ى,حزين,مكسور,أنين,ناصب,ماونتنز,حزن,تقسى,أنهب,أشمط
7,حينئذ,عزف,تيأسي,غمضة,ناصع,يتمشون,محك,أقسى,غث,باعث
8,أشمط,قلط,ومك,اغصان,ناجى,غثيث,تذلل,نسى,نكح,طوبى
9,تلذذ,محزن,وسوسة,رآح,هادم,اففففف,محبب,أضلعي,عابث,إكثار


In [None]:
# glf embedding, "joy" seed words (this cell has no execution count in the
# saved copy, so no output is recorded for it).
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="joy",
    min_cluster_size=7,
    num_words_to_generate=100,
)
df

# Egy

In [9]:
# egy embedding, "sadness" seed words, 100 neighbours per cluster.
# The return value is a Styler; the bare name renders the highlighted table.
df = generate_emotion_lexicon(
    embedding=egy_embedding,
    emotion="sadness",
    min_cluster_size=5,
    num_words_to_generate=100,
)
df

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,بآ,ندم,اسف,حزين,حزن,أسف,غم,بؤس,نادم,بائس,آسف,أسى,كئيب,محزن,محبط,كدر,تعيس,متشائم,انقباض,مفجوع
1,ئ,أفندم,ااسف,حز,احزن,اتأسف,تغم,بؤقك,نادا,متفائل,متأسف,أسطى,كئيبه,احزن,محبب,حدر,تعس,تشائم,انقر,مجوع
2,رثا,افندم,نتاسف,رزين,نحزن,نتأسف,غمز,بؤ,بنادمين,باهت,اتأسف,أس,حبعمرى,نحزن,محبت,ئادر,تعك,متشالة,انقد,هجوع
3,لؤلؤ,اندم,تاسف,حزنى,حزنى,متأسف,يغم,بؤك,هادم,رائس,تتآسف,قسى,احبعمرى,محزء,محبنيش,أدر,تعكم,متشابه,انقي,اجوع
4,آ,يفندم,سف,اهيء,حز,تتأسف,غمضة,بؤي,ادم,تفائل,سف,أسقى,رياكشن,حزن,محبوب,أجدر,تع,متشال,انذر,جوع
5,ᴉ,يافندم,اسفخس,مزين,محزن,آسف,قلوبكو,رثا,ناعس,رثا,تتأسف,أسك,بؤقك,مح,محبه,هدر,تعطش,سئم,اننن,مالجوع
6,ثح,عندم,أسف,يازين,حزنان,أسفر,غمل,كمباوند,عادم,مجربتش,نتأسف,عسى,واحشتنى,لاتحزن,محبش,شادر,تعيين,متشتكيش,انقضى,هتجوع
7,فينقذك,ياافندم,اسفي,حزن,ياحزن,سف,غمة,رعشه,بببببف,مكتئب,ااسف,أسرى,طويب,محبط,محب,اهدر,تعسر,متشكيش,انذل,مفقوع
8,فؤش,ندمان,اسفوخس,حزنان,لاتحزن,اسف,يقرفكم,انبساط,ننصدم,هكتئب,أسف,سى,ولرغبتك,محسي,احبط,غدر,تعطون,متشوه,اقباض,جوعتنا
9,اإل,هيندم,اسفل,خسآر,حزف,ااسف,لغم,بتبضن,نادين,زهيمر,تسف,أسخف,طييب,محاج,محبى,در,تعسو,متشغليش,انغلاق,مف


In [8]:
# egy embedding, "anger" seed words, 100 neighbours per cluster.
# The return value is a Styler; the bare name renders the highlighted table.
df = generate_emotion_lexicon(
    embedding=egy_embedding,
    emotion="anger",
    min_cluster_size=9,
    num_words_to_generate=100,
)
df

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,9
0,بآ,كره,حقد,حسد,طمع,حاقد,غضب,عداء,كراهية,بغض,غاضب
1,ئ,عكره,نحقد,نحسد,طمعان,حاقضين,غضبة,أعداء,كراهي,بغيض,غاض
2,بزيئ,أكره,تحقد,احسد,مطمع,أنف,أغضب,اعداء,كراهيه,بغ,غا
3,ثح,اكره,حاقد,تتحسد,جشع,حاسد,غضبان,يااعداء,ناهية,نغض,غامض
4,آ,تزكره,حقكك,حسل,يبأ,انف,عضب,فداء,إكراه,يقاتلونكم,غاضني
5,ᴉ,عفكره,حاقضين,حسود,يريكم,حاق,مقت,اخاء,ماهية,بغي,لاتغضب
6,لؤلؤ,هكرهك,كراهية,حاسد,ابجار,اثرالعراق,قضب,غداء,تثنية,بغا,يؤخذ
7,رثا,كرهتنا,حقار,ماشاءلله,طمأن,حقد,لاتغضب,عمداء,وصية,زعزعة,غاظ
8,زىى,بكره,كراهي,حسره,ماجمع,كائد,ضب,داء,لآينتمون,أجج,غاو
9,بءس,مذكره,وغل,حسر,انتفاع,غظ,أرفع,عداو,نبذ,غشاو,عضب


# Lev

In [None]:
# lev embedding, "disgust" seed words (this cell has no execution count in
# the saved copy, so no output is recorded for it).
df = generate_emotion_lexicon(
    embedding=lev_embedding,
    emotion="disgust",
    min_cluster_size=5,
    num_words_to_generate=100,
)
df

# Irq

In [None]:
# irq embedding, "sadness" seed words (this cell has no execution count in
# the saved copy, so no output is recorded for it).
df = generate_emotion_lexicon(
    embedding=irq_embedding,
    emotion="sadness",
    min_cluster_size=9,
    num_words_to_generate=100,
)
df

In [None]:
# irq embedding, "surprise" seed words (this cell has no execution count in
# the saved copy, so no output is recorded for it).
df = generate_emotion_lexicon(
    embedding=irq_embedding,
    emotion="surprise",
    min_cluster_size=5,
    num_words_to_generate=100,
)
df