In [1]:
import pandas as pd
import itertools
from glove import Corpus, Glove # creating a corpus object
import time
import json
import numpy as np
import sklearn.metrics.pairwise as metric
import scipy as sp
import random
import matplotlib.pyplot as plt
from gensim import KeyedVectors

In [2]:
# Load the pre-processed propositions (indexed later as list_sentances[i][j],
# i.e. presumably one list of lemmas per proposition - confirm against the
# preprocessing notebook).
# Both files are opened with an explicit UTF-8 encoding so that decoding does not
# depend on the platform's default locale.
with open("../data/docs.json", encoding="utf8") as docs_file:
    list_sentances = json.load(docs_file)
# Original (raw) request result; its "results" entries carry the "tags" used below.
with open("../data/req_makeorg_environnement.json", encoding="utf8") as req_file:
    dict_req_makeorg = json.load(req_file)

In [3]:
# One list of tag labels per proposition, in the order of dict_req_makeorg["results"].
list_tags = [
    [tag["label"] for tag in proposition["tags"]]
    for proposition in dict_req_makeorg["results"]
]

# Train GloVe Embedding

In [4]:
# Hyper-parameter grid: every (window, embedding dimension, learning rate) combination
# becomes one row of df_glove_train, i.e. one GloVe model to train.
windows = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15]
dim_emb = list(range(10, 210, 20))
learn_rate = [0.001]
df_glove_train = pd.DataFrame(
    itertools.product(windows, dim_emb, learn_rate),
    columns=["windows", "dim_emb", "learn_rate"],
)

In [6]:
# One grid row = one GloVe model to train.
print("Nombre de modeles glove à tester: ", df_glove_train.shape[0])

Nombre de modeles glove à tester:  100


In [7]:
# Train one GloVe model per row of the hyper-parameter grid and record, for each row,
# the fitted model, the wall-clock training time and the corpus it was trained on.
glove_models = []
train_times = []
list_corpus = []
for i in range(len(df_glove_train)):
    print(i, end="\t")
    params = df_glove_train.iloc[i]
    t0 = time.time()
    # Co-occurrence matrix for this row's window size (included in the timing).
    corpus = Corpus()
    corpus.fit(list_sentances, window=int(params.windows))
    # train glove
    glove_model = Glove(no_components=int(params.dim_emb), learning_rate=params.learn_rate)
    glove_model.fit(corpus.matrix, epochs=30, no_threads=2)
    # Attach the word -> index mapping so the model can be queried by word.
    glove_model.add_dictionary(corpus.dictionary)

    glove_models.append(glove_model)
    train_times.append(time.time() - t0)
    list_corpus.append(corpus)

df_glove_train["glove_model"] = glove_models
df_glove_train["train_time"] = train_times
# One corpus per row, aligned with glove_models; assigning the bare `corpus` variable
# would broadcast only the last fitted corpus to every row.
df_glove_train["corpus"] = list_corpus

0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50	51	52	53	54	55	56	57	58	59	60	61	62	63	64	65	

In [3]:
# Disabled scratch code (kept for reference): writes the vectors of `glove_model`
# to glove.txt, one "word v1 v2 ..." line per word, then reloads them with gensim.
# NOTE(review): the first line written ("56 120") is a hard-coded header, yet the
# reload passes no_header=True - these look inconsistent; check before re-enabling.
# with open("glove.txt", "w") as f:
#     f.write("56 120")
#     f.write("\n")
#     for word in glove_model.dictionary:
#         f.write(word)
#         f.write(" ")
#         for i in range(0, glove_model.no_components):
#             f.write(str(glove_model.word_vectors[glove_model.dictionary[word]][i]))
#             f.write(" ")
#         f.write("\n")
#
# KeyedVectors.load_word2vec_format('glove.txt',no_header=True)

<gensim.models.keyedvectors.KeyedVectors at 0x7f5d2fd99280>

In [None]:
# Quick sanity check on the last trained model. The row is selected explicitly
# instead of through the training loop's leftover index variable, so the cell does
# not depend on hidden kernel state.
df_glove_train["glove_model"].iloc[-1].most_similar("dechet")

[('format', 0.2567788864965217),
 ('falloir', 0.23373309738443923),
 ('locataire', 0.23084728948647706),
 ('menager', 0.22608486495114918)]

# Evaluation

In [None]:
# The filtering below pairs propositions and tag lists by position,
# so both lists must have the same length.
len(list_sentances) == len(list_tags)

True

In [None]:
# Keep only the propositions that carry at least one tag.
# The positions to keep are computed once, then applied to both lists so that
# list_sentances[i] and list_tags[i] stay aligned.
tagged_positions = [position for position, tags in enumerate(list_tags) if len(tags) > 0]
list_sentances = [list_sentances[position] for position in tagged_positions]
list_tags = [list_tags[position] for position in tagged_positions]
print("Nombre de propositions taguées: ", len(list_tags))

Nombre de propositions taguées:  6468


In [None]:
# Sorted array of the distinct lemmas found in the remaining (tagged) propositions.
all_lemmes = [lemme for sentence in list_sentances for lemme in sentence]
list_lemme = np.unique(all_lemmes)
print("Nombre de lemme apparaissant dans au moins une proposition taguée: ", len(list_lemme))

Nombre de lemme apparaissant dans au moins une proposition taguée:  6306


In [None]:
# Sorted array of the distinct tag labels used by the tagged propositions.
all_tags = [tag for tags in list_tags for tag in tags]
list_tags_unique = np.unique(all_tags)
print("Nombre distinct de tags: ", len(list_tags_unique))

Nombre distinct de tags:  67


In [None]:
# Vocabulary of the GloVe corpus, restricted to lemmas that occur in at least one
# tagged proposition. `corpus` is whatever the training cell left bound to that name.
word_dictionary = list(corpus.dictionary)
print("Nombre de lemme: ",len(word_dictionary))
# Membership is tested against a set: testing `word in list_lemme` on the NumPy
# array scans the whole array for every word.
tagged_lemmes = set(list_lemme.tolist())
word_dictionary = [word for word in word_dictionary if word in tagged_lemmes]
print("Nombre de lemme apparaissant dans au moins une proposition taguée: ",len(word_dictionary))

Nombre de lemme:  7614
Nombre de lemme apparaissant dans au moins une proposition taguée:  6306


In [None]:
# Draw the evaluation vocabulary: `test_size` words sampled without replacement,
# mapped to their index in the corpus dictionary.
test_size = 1000
# Dedicated, seeded generator so the sample (and every metric derived from it) is
# reproducible across kernel restarts without touching the global `random` state.
test_seed = 42
rng_test = random.Random(test_seed)
# Never ask for more words than are available (random.sample would raise ValueError).
sampled_words = rng_test.sample(word_dictionary, min(test_size, len(word_dictionary)))
dictionary_test = {word: corpus.dictionary[word] for word in sampled_words}

In [None]:
# Word/tag co-occurrence counts: mat_tag_distance[w, t] is the number of tagged
# propositions that contain test word w and carry tag t.
# Rows follow the iteration order of dictionary_test, columns follow list_tags_unique.
# Each proposition is visited once, instead of rescanning every proposition for
# every (word, tag) pair.
word_to_row = {word: row for row, word in enumerate(dictionary_test)}
tag_to_col = {tag: col for col, tag in enumerate(list_tags_unique)}
mat_tag_distance = np.zeros((len(word_to_row), len(tag_to_col)), dtype=int)
for sentence, tags in zip(list_sentances, list_tags):
    # Sets: a proposition counts once per (word, tag) pair even if a lemma repeats.
    rows = [word_to_row[word] for word in set(sentence) if word in word_to_row]
    cols = [tag_to_col[tag] for tag in set(tags) if tag in tag_to_col]
    if rows and cols:
        mat_tag_distance[np.ix_(rows, cols)] += 1

In [None]:
# Expected shape: (number of test words, number of distinct tags).
print("Dimension matrice de distance lemme/tag", np.shape(mat_tag_distance))

Dimension matrice de distance lemme/tag (1000, 67)


In [None]:
# Rows of mat_tag_distance that are entirely zero, and the test words they belong to.
vect_null = [row for row, counts in enumerate(mat_tag_distance) if not counts.any()]
test_words_ordered = list(dictionary_test.keys())
word_no_tag = [test_words_ordered[row] for row in vect_null]

In [None]:
# Reference similarity: cosine similarity between the tag co-occurrence profiles
# (rows of mat_tag_distance) of every pair of test words.
# pdist evaluates each unordered pair once; squareform mirrors the result into a
# symmetric matrix with a zero diagonal, so `1 - distance` has ones on the diagonal.
# This replaces the explicit Python double loop over all pairs.
matx_tag_similiraty = 1 - sp.spatial.distance.squareform(
    sp.spatial.distance.pdist(mat_tag_distance, metric="cosine")
)

999

In [None]:
# Mean and standard deviation of the reference similarity matrix, then the matrix itself.
print("Moyenne : ", matx_tag_similiraty.mean())
print("Et : ", matx_tag_similiraty.std())
matx_tag_similiraty

Moyenne :  0.4539556514308649
Et :  0.1877179504473051


array([[1.        , 0.32732684, 0.43643578, ..., 0.55154976, 0.83621785,
        0.35856858],
       [0.32732684, 1.        , 0.5       , ..., 0.5055037 , 0.37159094,
        0.36514837],
       [0.43643578, 0.5       , 1.        , ..., 0.556372  , 0.38320316,
        0.36514837],
       ...,
       [0.55154976, 0.5055037 , 0.556372  , ..., 1.        , 0.5812426 ,
        0.45971828],
       [0.83621785, 0.37159094, 0.38320316, ..., 0.5812426 , 1.        ,
        0.3561753 ],
       [0.35856858, 0.36514837, 0.36514837, ..., 0.45971828, 0.3561753 ,
        1.        ]])

In [None]:
# Score every trained GloVe model: build the cosine-similarity matrix of the test
# words from the model's embeddings and compare it with the tag-based reference
# matrix (matx_tag_similiraty).
test_words = list(dictionary_test)  # fixed order, same as the rows of mat_tag_distance
mse = []
list_matx_glove_similiraty = []
for i in range(len(df_glove_train)):
    print(i, end="\r")
    model_i = df_glove_train["glove_model"].iloc[i]
    # Use the dictionary attached to the model itself, so the word -> row lookup
    # always matches the model's own word_vectors.
    word_rows = [model_i.dictionary[word] for word in test_words]
    test_vectors = model_i.word_vectors[word_rows]
    # All pairwise cosine similarities at once (symmetric, ones on the diagonal).
    matx_glove_similiraty = 1 - sp.spatial.distance.squareform(
        sp.spatial.distance.pdist(test_vectors, metric="cosine")
    )

    # NOTE(review): the mean squared difference is halved, so "mse" is half the usual
    # MSE and "rmse" its square root; kept unchanged, model ranking is unaffected.
    mse.append(np.mean((matx_glove_similiraty - matx_tag_similiraty)**2)/2)
    list_matx_glove_similiraty.append(matx_glove_similiraty)
df_glove_train["mse"] = mse
df_glove_train["rmse"] = np.sqrt(mse)

(77, 450)

In [None]:
# Per model: mean (printed) and standard deviation (displayed) of the absolute
# GloVe similarities.
abs_similarities = [np.abs(mat) for mat in list_matx_glove_similiraty]
print([np.mean(abs_mat) for abs_mat in abs_similarities])
[np.std(abs_mat) for abs_mat in abs_similarities]

In [None]:
# Mean RMSE per value of each hyper-parameter, one figure per hyper-parameter
# (window size, embedding dimension, learning rate).
rmse_figures = [
    ("windows", "RMSE en fonction de la fenêtre"),
    ("dim_emb", "RMSE en fonction de la dimension"),
    ("learn_rate", "RMSE en fonction du taux d'apprentissage"),
]
for hyper_param, figure_title in rmse_figures:
    df_glove_train.groupby(hyper_param)["rmse"].mean().plot(title=figure_title)
    plt.show()

In [None]:
# Mean training time per value of each hyper-parameter, one figure per hyper-parameter
# (window size, embedding dimension, learning rate).
train_time_figures = [
    ("windows", "Train time en fonction de la fenêtre"),
    ("dim_emb", "Train time en fonction de la dimension"),
    ("learn_rate", "Train time en fonction du taux d'apprentissage"),
]
for hyper_param, figure_title in train_time_figures:
    df_glove_train.groupby(hyper_param)["train_time"].mean().plot(title=figure_title)
    plt.show()

In [None]:
# Size of the full corpus vocabulary, then of the sampled test vocabulary.
print(len(corpus.dictionary))
print(len(dictionary_test))

In [None]:
# Model with the lowest error. idxmin returns the label of the first minimum and
# skips NaN scores, whereas the builtin min() can return NaN when one is present,
# which would leave the selection empty.
best_glove_model = df_glove_train.loc[df_glove_train["mse"].idxmin(), "glove_model"]

# Test empirique

In [None]:
# Lemma used as the query word in the empirical checks below.
lemme_test = "corruption"

In [None]:
# Empirical check: 10 nearest neighbours of the query lemma according to the
# model with the lowest error.
best_glove_model.most_similar(lemme_test, number=10)

In [None]:
# Same query against the first model of the grid, for comparison.
df_glove_train["glove_model"].iloc[0].most_similar(lemme_test, number=10)

In [None]:
# N test words whose tag profile is most similar to that of lemme_test, according
# to the reference matrix (the query word itself is included, with similarity 1).
N = 10
test_words = list(dictionary_test)  # built once; row k of the matrix is test_words[k]
# Column indices of the N largest similarities of every row (unordered within the N).
indices = np.argpartition(matx_tag_similiraty, -N, axis=1)[:, -N:]
if lemme_test in dictionary_test:
    index_lemme_test = test_words.index(lemme_test)
    row_similarity = matx_tag_similiraty[index_lemme_test]
    # Order the N candidates from most to least similar.
    ranked = sorted(indices[index_lemme_test], key=lambda j: row_similarity[j], reverse=True)
    tag_neighbours = [(test_words[j], row_similarity[j]) for j in ranked]
else:
    # The test vocabulary is a random sample, so the query lemma may be absent;
    # report it instead of failing with an IndexError.
    index_lemme_test = None
    tag_neighbours = []
    print("Le lemme '" + lemme_test + "' n'appartient pas à l'échantillon de test")
tag_neighbours

In [None]:
# Words of the sampled test vocabulary, in insertion order.
list(dictionary_test)