### Import Libraries

In [61]:
import os
from gensim.models import Word2Vec
from gensim.models import FastText
import pandas as pd
from IPython.display import display, HTML

In [62]:
model_folder="models"
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [63]:
folder = "TP_ISD2020"
domains = ["QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl", "QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl"]

### Convert each file to a list of lists

In [64]:
data = dict()

for d in domains:
    path = os.path.join(folder, d)
    data_ = []
    print(" [.] Path to data:", path)
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            stripped_line = line.strip()
            line_list = stripped_line.split()
            data_.append(line_list)
            
    data[d] = data_

 [.] Path to data: TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl
 [.] Path to data: TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl


# Create and Save models

## Word2Vec Skip-Gram

In [65]:
for d in domains:
    model = Word2Vec(data[d], min_count=1, sg=1, size=100, window=15)
    model.train(data[d], total_examples=model.corpus_count, epochs=model.epochs)
    description = "FrenchMed" if "FrenchMed" in d else "FrenchPress"
    model.save(os.path.join(model_folder, "model_sg_word2vec_" + description + ".model"))

## Word2Vec Cbow

In [66]:
for d in domains:
    model = Word2Vec(data[d], min_count=1, sg=0, size=100, window=15)
    model.train(data[d], total_examples=model.corpus_count, epochs=model.epochs)
    description = "FrenchMed" if "FrenchMed" in d else "FrenchPress"
    model.save(os.path.join(model_folder, "model_cbow_word2vec_" + description + ".model"))

## FastText Cbow

In [67]:
for d in domains:
    model = FastText(size=100, window=15, min_count=1)
    model.build_vocab(sentences=data[d])
    model.train(sentences=data[d], total_examples=len(data[d]), epochs=10)
    description = "FrenchMed" if "FrenchMed" in d else "FrenchPress"
    model.save(os.path.join(model_folder, "model_fasttext_" + description + ".model"))

# Evaluation

In [68]:
def evaluate_same_corpus(description, models, words=["patient", "traitement", "maladie", "solution", "jaune"]):
    similarities = dict()
    for word in words:
        similarities[word] = pd.DataFrame()
        similarities[word].columns.name = '-'.join([word, description])
        for model, model_name in models:
            similars = model.wv.most_similar(word)
            similar_string = [s[0] for s in similars]
            similar_score = [s[1] for s in similars]
            similarities[word][model_name + "_string"] = similar_string
            similarities[word][model_name + "_score"] = similar_score
            
    return similarities

In [72]:
def evaluate_same_approach(model_name, domains, words=["patient", "traitement", "maladie", "solution", "jaune"]):
    similarities = dict()
    for d in domains:
        description = "FrenchMed" if "FrenchMed" in d else "FrenchPress"
        tag = '-'.join([model_name, description])
        similarities[tag] = pd.DataFrame()
        similarities[tag].columns.name = tag
        
        if "fasttext" in model_name:
            model = FastText.load(os.path.join(model_folder, model_name + description + ".model"))
        else:
            model = Word2Vec.load(os.path.join(model_folder, model_name + description + ".model"))

        for word in words:
            similars = model.wv.most_similar(word)
            similar_string = [s[0] for s in similars]
            similar_score = [s[1] for s in similars]
            similarities[tag][word + "_string"] = similar_string
            similarities[tag][word + "_score"] = similar_score
            
    return similarities

## Same corpus

In [70]:
for d in domains:
    description = "FrenchMed" if "FrenchMed" in d else "FrenchPress"
    word2vec_sg = Word2Vec.load(os.path.join(model_folder, "model_sg_word2vec_" + description + ".model"))
    word2vec_cbow = Word2Vec.load(os.path.join(model_folder, "model_cbow_word2vec_" + description + ".model"))
    fasttext = FastText.load(os.path.join(model_folder, "model_fasttext_" + description + ".model"))
    
    similarities = evaluate_same_corpus(description, [(word2vec_sg, "word2vec_sg"), (word2vec_cbow, "word2vec_cbow"), (fasttext, "fasttext")])
    for word in similarities.keys():
        display(HTML(similarities[word].to_html()))

patient-FrenchMed,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,carte,0.977923,TYSABRI,0.999779,segments,0.999709
1,conséquent,0.973757,tout,0.999686,Documents,0.999702
2,avoir,0.972418,Le,0.999665,Patients,0.999699
3,lui,0.971186,Il,0.999652,ciments,0.999699
4,informer,0.970642,femmes,0.999639,hurlements,0.999688
5,interrompre,0.97025,qu,0.999634,éléments,0.999669
6,bien,0.969886,tous,0.999628,eléments,0.999668
7,car,0.968724,pris,0.999624,pansements,0.999651
8,signe,0.967864,grossesse,0.999618,ligaments,0.999645
9,prévenir,0.967807,aucun,0.999593,aliments,0.999623


traitement-FrenchMed,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,test,0.901133,autres,0.998941,Traitement,0.999992
1,antérieur,0.89442,recommandé,0.998624,traitements,0.999819
2,surveillé,0.886625,que,0.998303,Traitements,0.999743
3,confirmé,0.882728,particulier,0.998109,Taaitement,0.999732
4,expérimenté,0.882402,conduire,0.997918,Allaitement,0.998652
5,début,0.879111,n,0.997895,évitement,0.998564
6,contrôle,0.877349,bénéfices,0.997817,allaitement,0.998052
7,instauré,0.876748,informés,0.997792,événement,0.998009
8,interrompu,0.876257,tant,0.997503,événements,0.997171
9,approprié,0.875876,VIH,0.997447,battements,0.996631


maladie-FrenchMed,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,infection,0.901079,modification,0.999601,enfance,0.999956
1,plaques,0.896646,risque,0.999584,préférence,0.999955
2,avancé,0.896564,liés,0.99952,expérience,0.999954
3,stade,0.892027,effet,0.999461,Expérience,0.999954
4,VIH,0.887273,cancer,0.999454,prévue,0.999953
5,liée,0.883668,celle,0.99941,manque,0.99995
6,Parkinson,0.880918,TYSABRI,0.999409,Absence,0.999944
7,immunitaire,0.879649,qu,0.999405,évidence,0.999944
8,antirétroviraux,0.878987,dans,0.999386,obtenue,0.999941
9,sclérose,0.87855,durée,0.999381,excitabilité,0.999941


solution-FrenchMed,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,contient,0.965042,microgrammes,0.999479,perfusion,0.999422
1,goutte,0.962821,contient,0.999047,Dissolution,0.999303
2,diluer,0.962457,20,0.999004,Perfusion,0.999172
3,fournie,0.958627,buvable,0.998963,solutions,0.998492
4,poudre,0.955832,150,0.99862,diffusion,0.997584
5,lépirudine,0.950154,Chaque,0.998508,confusion,0.997259
6,fourni,0.946734,300,0.998368,perfusions,0.996999
7,blanche,0.946369,jour,0.998114,Solution,0.996999
8,veine,0.945749,pelliculé,0.997708,evolution,0.99697
9,Chaque,0.944828,x,0.997677,évolution,0.996926


jaune-FrenchMed,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,pâle,0.963501,contenant,0.999572,fluoxétine,0.999961
1,chlorhydrique,0.956386,tolcapone,0.99953,formule,0.999961
2,15ml,0.951436,24,0.999476,Fluoxétine,0.99996
3,orange,0.950486,21,0.999467,thermostable,0.999954
4,E433,0.946601,Polysorbate,0.99939,intraveineuse,0.999951
5,sucre,0.945872,sodium,0.999381,microcristalline,0.999947
6,caoutchouc,0.94569,Comprimés,0.999379,diluée,0.999947
7,Hydroxyde,0.943696,méthyle,0.999302,Journée,0.999942
8,eau,0.94316,7,0.999301,diméthylaminoazobenzène,0.999939
9,polyéthylène,0.942073,E,0.999284,pool,0.999939


patient-FrenchPress,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,médicament,0.807531,garer,0.683697,impatient,0.982966
1,contaminé,0.794172,hémorragie,0.675379,patientent,0.981011
2,contaminants,0.777883,DVD,0.67326,abstient,0.963993
3,patients,0.768624,voler,0.66739,détient,0.963139
4,soignant,0.766846,puce,0.664566,impatientent,0.961047
5,ressurgit,0.762682,sciemment,0.663758,réconcilient,0.960317
6,compréhensible,0.758324,Girerd,0.660069,soutient,0.960261
7,prescription,0.757893,payé,0.657985,initient,0.958936
8,imputé,0.756361,accouché,0.657183,contient,0.958871
9,fiable,0.75575,conneries,0.65626,ratifient,0.953563


traitement-FrenchPress,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,frottis,0.662543,mariage,0.803042,promptement,0.96702
1,concluent,0.659412,non-prolifération,0.791776,concrètement,0.96461
2,obéissance,0.654948,collectif,0.776806,recrutement,0.961306
3,reproductive,0.654436,moyen,0.762019,doctement,0.958975
4,équitables,0.653533,système,0.757915,comportement,0.955171
5,sensibiliser,0.652806,statut,0.753529,strictement,0.955124
6,élémentaire,0.652442,jalon,0.751947,dégagement,0.955036
7,miel,0.645368,renforcement,0.751443,plafonnement,0.952434
8,concrétise,0.644913,fonctionnement,0.744897,rayonnement,0.95221
9,multilatéral,0.643493,additionnel,0.732599,subitement,0.951592


maladie-FrenchPress,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,SRAS,0.71711,douleur,0.754067,malnutrie,0.889943
1,virus,0.696305,commande,0.734919,malade,0.880035
2,aiguë,0.694524,perte,0.712671,trilogie,0.855643
3,causé,0.685805,catastrophe,0.709574,archéologie,0.85247
4,pneumopathie,0.668068,population,0.703147,médecine,0.837039
5,dus,0.661204,vente,0.691775,magie,0.835811
6,coronavirus,0.661172,puissance,0.676601,fantaisie,0.833449
7,Alzheimer,0.660359,planète,0.667684,bactériologie,0.823869
8,cancers,0.647317,découverte,0.658454,folie,0.816265
9,épidémie,0.646881,proportion,0.656809,asphyxie,0.814243


solution-FrenchPress,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,pacifique,0.758632,règle,0.805658,résolution,0.98091
1,consensuelle,0.699111,alternative,0.776692,révolution,0.977477
2,constructif,0.690104,crédibilité,0.776058,dissolution,0.976444
3,acceptera,0.685585,institution,0.768307,évolution,0.975659
4,garantissant,0.653698,ambition,0.764432,dilution,0.97135
5,centrafricaine,0.636153,perspective,0.764339,pollution,0.96915
6,sincère,0.634411,responsabilité,0.757106,caution,0.966552
7,renoncera,0.633869,rassembleuse,0.755659,résorption,0.949262
8,constructive,0.632943,résolution,0.755248,persécution,0.94761
9,conscient,0.631817,certaine,0.750471,réévaluation,0.945828


jaune-FrenchPress,word2vec_sg_string,word2vec_sg_score,word2vec_cbow_string,word2vec_cbow_score,fasttext_string,fasttext_score
0,maillot,0.891641,maillot,0.923762,Neptune,0.961772
1,emparé,0.82518,Oscar,0.874147,brune,0.949612
2,Pena,0.779855,ressuscité,0.869737,Jeune,0.945951
3,Armstrong,0.768948,endossé,0.859911,lune,0.942555
4,Morzine,0.766881,Néerlandais,0.858707,Saâdoune,0.926895
5,Lachhab,0.766247,Mark,0.857325,Saadoune,0.92033
6,pois,0.760283,sprint,0.857057,lagune,0.913695
7,Nazon,0.758742,Lachhab,0.850304,l'une,0.89823
8,McGee,0.757007,demi-finaliste,0.849532,dune,0.898083
9,grimpeur,0.750276,Elia,0.849398,Pampelune,0.896762


## Same approach

In [74]:
model_names = ["model_sg_word2vec_", "model_cbow_word2vec_", "model_fasttext_"]
for model_name in model_names:
    similarities = evaluate_same_approach(model_name, domains)
    for word in similarities.keys():
        display(HTML(similarities[word].to_html()))

model_sg_word2vec_-FrenchMed,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,carte,0.977923,test,0.901133,infection,0.901079,contient,0.965042,pâle,0.963501
1,conséquent,0.973757,antérieur,0.89442,plaques,0.896646,goutte,0.962821,chlorhydrique,0.956386
2,avoir,0.972418,surveillé,0.886625,avancé,0.896564,diluer,0.962457,15ml,0.951436
3,lui,0.971186,confirmé,0.882728,stade,0.892027,fournie,0.958627,orange,0.950486
4,informer,0.970642,expérimenté,0.882402,VIH,0.887273,poudre,0.955832,E433,0.946601
5,interrompre,0.97025,début,0.879111,liée,0.883668,lépirudine,0.950154,sucre,0.945872
6,bien,0.969886,contrôle,0.877349,Parkinson,0.880918,fourni,0.946734,caoutchouc,0.94569
7,car,0.968724,instauré,0.876748,immunitaire,0.879649,blanche,0.946369,Hydroxyde,0.943696
8,signe,0.967864,interrompu,0.876257,antirétroviraux,0.878987,veine,0.945749,eau,0.94316
9,prévenir,0.967807,approprié,0.875876,sclérose,0.87855,Chaque,0.944828,polyéthylène,0.942073


model_sg_word2vec_-FrenchPress,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,médicament,0.807531,frottis,0.662543,SRAS,0.71711,pacifique,0.758632,maillot,0.891641
1,contaminé,0.794172,concluent,0.659412,virus,0.696305,consensuelle,0.699111,emparé,0.82518
2,contaminants,0.777883,obéissance,0.654948,aiguë,0.694524,constructif,0.690104,Pena,0.779855
3,patients,0.768624,reproductive,0.654436,causé,0.685805,acceptera,0.685585,Armstrong,0.768948
4,soignant,0.766846,équitables,0.653533,pneumopathie,0.668068,garantissant,0.653698,Morzine,0.766881
5,ressurgit,0.762682,sensibiliser,0.652806,dus,0.661204,centrafricaine,0.636153,Lachhab,0.766247
6,compréhensible,0.758324,élémentaire,0.652442,coronavirus,0.661172,sincère,0.634411,pois,0.760283
7,prescription,0.757893,miel,0.645368,Alzheimer,0.660359,renoncera,0.633869,Nazon,0.758742
8,imputé,0.756361,concrétise,0.644913,cancers,0.647317,constructive,0.632943,McGee,0.757007
9,fiable,0.75575,multilatéral,0.643493,épidémie,0.646881,conscient,0.631817,grimpeur,0.750276


model_cbow_word2vec_-FrenchMed,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,TYSABRI,0.999779,autres,0.998941,modification,0.999601,microgrammes,0.999479,contenant,0.999572
1,tout,0.999686,recommandé,0.998624,risque,0.999584,contient,0.999047,tolcapone,0.99953
2,Le,0.999665,que,0.998303,liés,0.99952,20,0.999004,24,0.999476
3,Il,0.999652,particulier,0.998109,effet,0.999461,buvable,0.998963,21,0.999467
4,femmes,0.999639,conduire,0.997918,cancer,0.999454,150,0.99862,Polysorbate,0.99939
5,qu,0.999634,n,0.997895,celle,0.99941,Chaque,0.998508,sodium,0.999381
6,tous,0.999628,bénéfices,0.997817,TYSABRI,0.999409,300,0.998368,Comprimés,0.999379
7,pris,0.999624,informés,0.997792,qu,0.999405,jour,0.998114,méthyle,0.999302
8,grossesse,0.999618,tant,0.997503,dans,0.999386,pelliculé,0.997708,7,0.999301
9,aucun,0.999593,VIH,0.997447,durée,0.999381,x,0.997677,E,0.999284


model_cbow_word2vec_-FrenchPress,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,garer,0.683697,mariage,0.803042,douleur,0.754067,règle,0.805658,maillot,0.923762
1,hémorragie,0.675379,non-prolifération,0.791776,commande,0.734919,alternative,0.776692,Oscar,0.874147
2,DVD,0.67326,collectif,0.776806,perte,0.712671,crédibilité,0.776058,ressuscité,0.869737
3,voler,0.66739,moyen,0.762019,catastrophe,0.709574,institution,0.768307,endossé,0.859911
4,puce,0.664566,système,0.757915,population,0.703147,ambition,0.764432,Néerlandais,0.858707
5,sciemment,0.663758,statut,0.753529,vente,0.691775,perspective,0.764339,Mark,0.857325
6,Girerd,0.660069,jalon,0.751947,puissance,0.676601,responsabilité,0.757106,sprint,0.857057
7,payé,0.657985,renforcement,0.751443,planète,0.667684,rassembleuse,0.755659,Lachhab,0.850304
8,accouché,0.657183,fonctionnement,0.744897,découverte,0.658454,résolution,0.755248,demi-finaliste,0.849532
9,conneries,0.65626,additionnel,0.732599,proportion,0.656809,certaine,0.750471,Elia,0.849398


model_fasttext_-FrenchMed,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,segments,0.999709,Traitement,0.999992,enfance,0.999956,perfusion,0.999422,fluoxétine,0.999961
1,Documents,0.999702,traitements,0.999819,préférence,0.999955,Dissolution,0.999303,formule,0.999961
2,Patients,0.999699,Traitements,0.999743,expérience,0.999954,Perfusion,0.999172,Fluoxétine,0.99996
3,ciments,0.999699,Taaitement,0.999732,Expérience,0.999954,solutions,0.998492,thermostable,0.999954
4,hurlements,0.999688,Allaitement,0.998652,prévue,0.999953,diffusion,0.997584,intraveineuse,0.999951
5,éléments,0.999669,évitement,0.998564,manque,0.99995,confusion,0.997259,microcristalline,0.999947
6,eléments,0.999668,allaitement,0.998052,Absence,0.999944,perfusions,0.996999,diluée,0.999947
7,pansements,0.999651,événement,0.998009,évidence,0.999944,Solution,0.996999,Journée,0.999942
8,ligaments,0.999645,événements,0.997171,obtenue,0.999941,evolution,0.99697,diméthylaminoazobenzène,0.999939
9,aliments,0.999623,battements,0.996631,excitabilité,0.999941,évolution,0.996926,pool,0.999939


model_fasttext_-FrenchPress,patient_string,patient_score,traitement_string,traitement_score,maladie_string,maladie_score,solution_string,solution_score,jaune_string,jaune_score
0,impatient,0.982966,promptement,0.96702,malnutrie,0.889943,résolution,0.98091,Neptune,0.961772
1,patientent,0.981011,concrètement,0.96461,malade,0.880035,révolution,0.977477,brune,0.949612
2,abstient,0.963993,recrutement,0.961306,trilogie,0.855643,dissolution,0.976444,Jeune,0.945951
3,détient,0.963139,doctement,0.958975,archéologie,0.85247,évolution,0.975659,lune,0.942555
4,impatientent,0.961047,comportement,0.955171,médecine,0.837039,dilution,0.97135,Saâdoune,0.926895
5,réconcilient,0.960317,strictement,0.955124,magie,0.835811,pollution,0.96915,Saadoune,0.92033
6,soutient,0.960261,dégagement,0.955036,fantaisie,0.833449,caution,0.966552,lagune,0.913695
7,initient,0.958936,plafonnement,0.952434,bactériologie,0.823869,résorption,0.949262,l'une,0.89823
8,contient,0.958871,rayonnement,0.95221,folie,0.816265,persécution,0.94761,dune,0.898083
9,ratifient,0.953563,subitement,0.951592,asphyxie,0.814243,réévaluation,0.945828,Pampelune,0.896762
