In [206]:
import sys
import os
# Put the parent of the current working directory on the module search path
# (presumably so that project-local modules one level up can be imported).
_parent_dir = os.path.dirname(os.path.realpath(""))
sys.path.append(_parent_dir)

In [214]:
from os.path import join

from gensim.models import KeyedVectors
import fasttext as ft
import pandas as pd
import numpy as np
import umap
import umap.plot
import hdbscan
from sklearn.cluster import DBSCAN

import utilities

# Allow up to 500 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

In [208]:
# Load the Arabic emotion lexicon through the project helper; ".." is the
# location argument handed to it (presumably the repository root - TODO confirm).
# The same helper's result is used further down as a mapping from emotion name
# to an iterable of words.
lexicon = utilities.get_arabic_lexicon_data("..")

In [222]:
def _load_vectors(stem):
    """Read ``embeddings/<stem>.vec`` as word2vec-format keyed vectors."""
    return KeyedVectors.load_word2vec_format(join("embeddings", f"{stem}.vec"))


# One keyed-vector model per .vec file in the embeddings directory.
glf_embedding = _load_vectors("glf")
egy_embedding = _load_vectors("egy")
irq_embedding = _load_vectors("irq")
nor_embedding = _load_vectors("nor")
lev_embedding = _load_vectors("lev")

In [287]:
def generate_emotion_lexicon(embedding, emotion, min_cluster_size, num_words_to_generate=100, verbose=False, save=False, embedding_name=None):
    """Expand one emotion's seed words into candidate words via clustered embeddings.

    The seed words listed for ``emotion`` in the Arabic lexicon are looked up in
    ``embedding``; their vectors are clustered with HDBSCAN, and for every
    cluster label the words closest to that cluster's per-dimension median
    vector are collected.

    Parameters
    ----------
    embedding
        Word-vector model used for BOTH the seed-vector lookup and the
        nearest-neighbour search. Must support ``embedding[word]`` (raising
        ``KeyError`` for unknown words) and
        ``similar_by_vector(vector, topn=...)``.
    emotion : str
        Key of the lexicon returned by ``utilities.get_arabic_lexicon_data("..")``.
    min_cluster_size : int
        Forwarded unchanged to ``hdbscan.HDBSCAN``.
    num_words_to_generate : int, default 100
        Number of neighbours requested for each cluster.
    verbose : bool, default False
        Print diagnostics and draw a UMAP projection of the seed vectors,
        coloured by cluster label.
    save : bool, default False
        Also write the generated table as CSV under ``emotion_lexicon/``.
    embedding_name : str, optional
        Prefix of the CSV file name; required when ``save`` is true.

    Returns
    -------
    pandas Styler
        Wraps a frame with one column per cluster label and one row per
        neighbour, in the order ``similar_by_vector`` returned them. Cells
        whose word is already a seed word of ``emotion`` get a dark background.
        The plain frame is available through ``.data``.

    Raises
    ------
    AssertionError
        If ``save`` is true without ``embedding_name``, or ``emotion`` is not
        a key of the lexicon.
    ValueError
        If none of the seed words of ``emotion`` is present in ``embedding``.
    """
    if save:
        assert embedding_name is not None, "embedding_name must not be None when trying to save"

    lexicon = utilities.get_arabic_lexicon_data("..")

    emotions = list(lexicon.keys())
    if verbose:
        print(f"Emotions in lexicon {emotions}")
    assert emotion in lexicon, f"The emotion {emotion} is not in the Lexicon"

    seed_words = list(lexicon[emotion])
    # Set membership keeps the per-cell highlight check cheap for large tables.
    seed_word_set = set(seed_words)

    def lookup(word):
        """Return the vector of ``word`` or None when it is out of vocabulary."""
        try:
            # Query the model that was passed in, not a module-level one, so the
            # result reflects the embedding the caller asked for.
            return embedding[word]
        except KeyError:
            return None

    looked_up = [lookup(word) for word in seed_words]
    found = [vec for vec in looked_up if vec is not None]
    if verbose:
        print(f"Number of words considered {len(found)}/{len(looked_up)}")
    if not found:
        raise ValueError(
            f"None of the {len(looked_up)} lexicon words for {emotion} is present in the embedding"
        )
    vecs = np.asarray(found)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    cluster_labels = clusterer.fit_predict(vecs)

    if verbose:
        print(cluster_labels)

    # HDBSCAN labels points it could not assign to any cluster as -1; that
    # label is kept as a column of its own, like every other label.
    cluster_to_words = {}
    for focus_cluster in np.unique(cluster_labels):
        # Median rather than mean: less sensitive to outlying vectors.
        clustered_centroid = np.median(vecs[cluster_labels == focus_cluster], axis=0)
        neighbours = embedding.similar_by_vector(clustered_centroid, topn=num_words_to_generate)
        cluster_to_words[focus_cluster] = [word for word, _similarity in neighbours]

    if verbose:
        mapper = umap.UMAP().fit(vecs)
        umap.plot.points(mapper, labels=cluster_labels)

    df = pd.DataFrame(cluster_to_words)
    styler = df.style
    # Newer pandas names the element-wise styling method `map` and deprecates
    # `applymap`; fall back to `applymap` where `map` does not exist.
    style_cells = getattr(styler, "map", None) or styler.applymap
    style = style_cells(lambda x: "background-color: rgba(20, 20, 20, 1)" if x in seed_word_set else "")
    if save:
        # Create the output directory on first use instead of failing in to_csv.
        os.makedirs("emotion_lexicon", exist_ok=True)
        df.to_csv(f"emotion_lexicon/{embedding_name}_{emotion}_minclustersize={min_cluster_size}.csv")
    return style

## Problems

### Problem with using all words in generating similar words
### Solution: cluster the vector representation of words

In [289]:
# A min_cluster_size of 100 with 500 neighbours per cluster; show the first 20 rows.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="anger",
    min_cluster_size=100,
    num_words_to_generate=500,
)
df.data.head(20)

Unnamed: 0,-1
0,فی
1,ضفدع
2,لعنبو
3,ض
4,نذل
5,***
6,قذف
7,شآء
8,آللـہ
9,ضب


### Problem with the centroid idea: when a cluster is non-convex, its centroid can fall outside the cluster
### Possible solution: decrease `min_cluster_size`?
![img](https://www.researchgate.net/profile/Fotios-Katsilieris-2/publication/239926467/figure/fig4/AS:669426328301595@1536615082633/An-example-of-the-difference-between-a-convex-and-a-non-convex-region.ppm)


# Notes

### Finding the centroid with the mean seems to give worse results than finding it with the median, probably because the mean is swayed by outlier vectors.

<br/>

### For some reason, the Arabic embeddings favor rhyming words
### Possible reasons: bad data, or rhyming words tend to appear close to each other in Arabic text, so CBOW and Skip-gram treat them as similar in meaning
![img](https://i.imgur.com/7XKYaBI.png)

# Glf

In [293]:
# glf vectors, "anger" seed words.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="anger",
    min_cluster_size=8,
    num_words_to_generate=500,
)

In [294]:
# glf vectors, "joy" seed words.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="joy",
    min_cluster_size=15,
    num_words_to_generate=500,
)

In [295]:
# glf vectors, "disgust" seed words.
df = generate_emotion_lexicon(
    embedding=glf_embedding,
    emotion="disgust",
    min_cluster_size=3,
    num_words_to_generate=100,
)

# Egy

In [296]:
# egy vectors, "joy" seed words.
df = generate_emotion_lexicon(
    embedding=egy_embedding,
    emotion="joy",
    min_cluster_size=13,
    num_words_to_generate=500,
)

In [297]:
# egy vectors, "fear" seed words.
df = generate_emotion_lexicon(
    embedding=egy_embedding,
    emotion="fear",
    min_cluster_size=7,
    num_words_to_generate=500,
)

# Lev

In [298]:
# lev vectors, "disgust" seed words.
df = generate_emotion_lexicon(
    embedding=lev_embedding,
    emotion="disgust",
    min_cluster_size=5,
    num_words_to_generate=500,
)

# NOR

In [299]:
# nor vectors, "sadness" seed words.
df = generate_emotion_lexicon(
    embedding=nor_embedding,
    emotion="sadness",
    min_cluster_size=30,
    num_words_to_generate=100,
)

In [300]:
# nor vectors, "anger" seed words.
df = generate_emotion_lexicon(
    embedding=nor_embedding,
    emotion="anger",
    min_cluster_size=15,
    num_words_to_generate=100,
)