# Word embeddings training

In [48]:
import gensim
from gensim.models import Word2Vec
import warnings
import string
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
warnings.filterwarnings("ignore")

In [49]:
#download the data
urllib.request.urlretrieve("https://perso.limsi.fr/neveol/TP_ISD2020.zip", filename="TP_ISD2020.zip")

('TP_ISD2020.zip', <http.client.HTTPMessage at 0x7fd7375bdd90>)

In [50]:
doc = open('TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl', encoding="utf8")

In [51]:
file_str = doc.read()

In [52]:
#replace numbers, new lines, carriege return with space string, escape character with empty str
def preprocess(str_val):
    str_val = re.sub("\r", " ", str_val)
    str_val = re.sub("\d+", " ", str_val)
    str_val = re.sub("\n", " ", str_val)
    str_val = re.sub("\uf0b7", "", str_val)

    for punc in string.punctuation:
        if punc != '.':
            str_val= str_val.replace(punc," ")
    sentences = str_val.split(".")
    
    filtered_sentences=[]
    for sentence in sentences:
        if len(sentence)>1:
            filtered_sentences.append(sentence.split())
    return filtered_sentences


In [53]:
filtered_sentences=preprocess(file_str)

In [54]:
filtered_sentences

[['EMEA',
  'H',
  'C',
  'PRIALT',
  'Qu',
  '’',
  'est',
  'ce',
  'que',
  'Prialt',
  'Prialt',
  'est',
  'une',
  'solution',
  'pour',
  'perfusion',
  'contenant',
  'le',
  'principe',
  'actif',
  'ziconotide',
  'à',
  'des',
  'concentrations',
  'de',
  'ou',
  'microgrammes',
  'par',
  'millilitre'],
 ['Dans',
  'quel',
  'cas',
  'Prialt',
  'est',
  'il',
  'utilisé',
  'Prialt',
  'est',
  'indiqué',
  'pour',
  'le',
  'traitement',
  'des',
  'douleurs',
  'intenses',
  'chroniques',
  'de',
  'long',
  'terme',
  'chez',
  'les',
  'patients',
  'nécessitant',
  'une',
  'analgésie',
  'intrarachidienne',
  'injection',
  'dans',
  'l',
  '’',
  'espace',
  'qui',
  'entoure',
  'la',
  'moelle',
  'épinière',
  'et',
  'le',
  'cerveau'],
 ['Comme',
  'le',
  'nombre',
  'de',
  'patients',
  'souffrant',
  'de',
  'douleurs',
  'chroniques',
  'nécessitant',
  'une',
  'analgésie',
  'intrarachidienne',
  'analgésiques',
  'injectés',
  'directement',
  'dans',


In [55]:
#Word 2 vec
model = Word2Vec(min_count=1,sg=1) #sg=1 means skipgram, else CBOW
model.build_vocab(filtered_sentences) 
model.train(filtered_sentences, total_examples=model.corpus_count, epochs=100)

(3244923, 4249900)

In [56]:
model.most_similar("douleurs")

[('intenses', 0.7738748788833618),
 ('chroniques', 0.7631170749664307),
 ('alopécie', 0.6888659596443176),
 ('hypersudation', 0.6888019442558289),
 ('prurit', 0.6831979751586914),
 ('suicides', 0.6814372539520264),
 ('crampes', 0.6786566972732544),
 ('chutes', 0.6769541501998901),
 ('variait', 0.6652883291244507),
 ('tentatives', 0.6627514362335205)]

In [57]:
#CBOW
model = Word2Vec(min_count=1,sg=0, workers=4) #sg=0 -> CBOW, sg here for skip-gram
model.build_vocab(filtered_sentences) 
model.train(filtered_sentences, total_examples=model.corpus_count, epochs=100)

(3246727, 4249900)

In [58]:
model.most_similar("douleurs")

[('chroniques', 0.823729395866394),
 ('intenses', 0.7985644340515137),
 ('foncées', 0.70596843957901),
 ('vomissements', 0.6912149786949158),
 ('nausées', 0.6790997385978699),
 ('abdominales', 0.6728881597518921),
 ('difficulté', 0.6667233109474182),
 ('prurit', 0.6663662195205688),
 ('variait', 0.6644868850708008),
 ('faiblesse', 0.6624826192855835)]