In [1]:
%matplotlib inline


Doc2Vec Model
=============

Tuto : https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html




In [2]:
import os
from tqdm import tqdm
import gensim
import glob
import numpy as np



In [3]:
from nltk.tokenize import word_tokenize

def read_corpus(directory, tokens_only=False):
    for i, filename in tqdm(enumerate(os.listdir(directory))):
        with open(os.path.join(directory,filename), encoding="utf8") as f:
                tokens = word_tokenize(f.read(), language="french")
                if tokens_only:
                    yield tokens
                else:
                    # For training data, add tags
                    yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
dir_path_train = os.path.join(os.path.dirname(os.path.realpath("../D2V_corpus/train")),"train")
dir_path_test = os.path.join(os.path.dirname(os.path.realpath("../D2V_corpus/test")),"test")

train_corpus = list(read_corpus(dir_path_train))
test_corpus = list(read_corpus(dir_path_test, tokens_only = True))

2483it [00:23, 105.81it/s]
694it [00:06, 108.82it/s]


Training the Model
------------------

In [5]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=10, epochs=40)

In [6]:
model.build_vocab(train_corpus)

In [7]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Examples

In [8]:
### stuff for visualizing examples

train_filename_dict =  {ix: filename for ix, filename in enumerate(os.listdir("../D2V_corpus/train"))}
test_filename_dict =  {ix: filename for ix, filename in enumerate(os.listdir("../D2V_corpus/test"))}

def print_by_ix(ix, train=True):
    
    directory = '../D2V_corpus/train/' if train == True else '../D2V_corpus/test/'
    filename_dict = train_filename_dict if train == True else test_filename_dict
    with open(directory+filename_dict[ix], 'r', encoding='utf8') as f:
        print(f.read())
        
        
def similarity_example(n, test=True, print_contents=False):    
    
    if test == True:
        sample = np.random.choice(range(len(test_corpus)), n)
        for ix in sample:
            inferred_vector = model.infer_vector(test_corpus[ix])
            most_similar_ix = model.dv.most_similar(positive=[inferred_vector], topn=1)[0][0]
            similar_file = train_filename_dict[most_similar_ix]
            print(f'\n\nTEXT NR {ix}:\n {train_filename_dict[ix]}\n')
            if print_contents:
                print_by_ix(ix)
                print('\n\n')
            print(f'MOST SIMILAR:\n {train_filename_dict[most_similar_ix]}\n')
            if print_contents:
                print_by_ix(most_similar_ix)
            
    else:
        sample = np.random.choice(range(len(train_corpus)), n)
        for ix in sample:
            most_similar_ix = model.dv.most_similar(ix, topn=1)[0][0]
            similar_file = train_filename_dict[most_similar_ix]
            print(f'\n\nTEXT NR {ix}:\n {train_filename_dict[ix]}\n')
            if print_contents:
                print_by_ix(ix)
                print('\n\n')
            print(f'MOST SIMILAR:\n {train_filename_dict[most_similar_ix]}\n')
            if print_contents:
                print_by_ix(most_similar_ix)

In [9]:
similarity_example(30)



TEXT NR 441:
 1850_Dumas-Alexandre_La-Tulipe-noire_canon_CHUNK_10.txt

MOST SIMILAR:
 1851_Dumas-Alexandre_Ange-Pitou_canon_CHUNK_51.txt



TEXT NR 194:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-I_canon_CHUNK_46.txt

MOST SIMILAR:
 1850_Dumas-Alexandre_Le-Vicomte-de-Bragelonne_canon_CHUNK_609.txt



TEXT NR 40:
 1850_Bassanville-Anais-de_Le-Soir-et-le-matin-de-la-vie_noncanon_CHUNK_46.txt

MOST SIMILAR:
 1850_Lamartine-Alphonse-de_Genevieve--histoire-d-une-servante_canon_CHUNK_1.txt



TEXT NR 31:
 1850_Bassanville-Anais-de_Le-Soir-et-le-matin-de-la-vie_noncanon_CHUNK_38.txt

MOST SIMILAR:
 1851_Souvestre-Emile_Les-derniers-paysans-Tome-1_noncanon_CHUNK_20.txt



TEXT NR 204:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-I_canon_CHUNK_55.txt

MOST SIMILAR:
 1851_Dumas-Alexandre_Ange-Pitou_canon_CHUNK_82.txt



TEXT NR 88:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-II_canon_CHUNK_30.txt

MOST SIMILAR:
 1850_Sue-Eugene_Les-Mysteres-du-peuple-Tome-V_canon_CHUNK_3.txt



TEXT NR 601:
 1850_Dumas-Al

Testing the Model
-----------------

In [10]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (246): «se séparer à leur tige et réfléchit profondément il réfléchit qu ’ il éter de son devoir d ’ empêcher autant qu ’ il être en lui le père billot de mettre à exécution le terrible vengeance qu ’ il méditer le premier moyen qui se présenter à l ’ esprit de pitou être de courir à boursonne et de prévenir m. isidor de danger qui l ’ attendre s ’ il se hasarder de côté de le ferme mais presque aussitôt il réfléchit à chose le premièr c ’ être qu ’ il n ’ avoir pas reçu de catherine mission de faire cela le second c ’ être que le danger pouvoir bien ne pas arrêter m. isidor puis quel certitude avoir pitou que le vicomte dont l ’ intention éter sans doute de se cacher venir par le route frayé à voiture et non par quelques-uns de ce petit sentier que suivre pour raccourcir leur chemin le bûcheron et le ouvrier de bois d ’ ailleurs en aller à le recherche d ’ isidor pitou abandonner catherine et pitou qui à tout prendre été fâché qu ’ il a

Vector Dataframe
-----------

In [11]:
path_corpus = os.path.join(os.path.dirname(os.path.realpath("../../Eval/chunked_corpus_lemma_txt")),"chunked_corpus_lemma_txt")
corpus = list(read_corpus(path_corpus, tokens_only = True))

31840it [04:40, 113.62it/s]


In [12]:
corpus[:2]

[['avant-propo',
  'c',
  '’',
  'être',
  'à',
  'vous',
  'que',
  'je',
  'dédie',
  'ce',
  'livre',
  'jeune',
  'fille',
  'qui',
  'le',
  'lire',
  'ce',
  'livre',
  'fruit',
  'de',
  'mon',
  'expérience',
  'et',
  'dans',
  'lequel',
  'je',
  'me',
  'suivre',
  'efforcée',
  'de',
  'vous',
  'peindre',
  'le',
  'vie',
  'non',
  'tel',
  'que',
  'vous',
  'le',
  'montre',
  'votre',
  'joyeux',
  'imagination',
  'qui',
  'ne',
  'le',
  'sème',
  'que',
  'de',
  'fleur',
  'et',
  'de',
  'plaisir',
  'mais',
  'tel',
  'qu',
  '’',
  'lui',
  'être',
  'réellement',
  'c',
  '’',
  'être',
  '-',
  'à-dir',
  'remplie',
  'de',
  'devoir',
  'sérieux',
  'et',
  'mêlée',
  'de',
  'pensée',
  'grave',
  'et',
  'austère',
  'avoir',
  'vous',
  'qui',
  'êtes',
  'assez',
  'heureux',
  'pour',
  'vous',
  'élever',
  'sous',
  'le',
  'direction',
  'sage',
  'et',
  'prudent',
  'de',
  'votre',
  'mère',
  'et',
  'qui',
  'de',
  'suivre',
  'so

In [38]:
import pandas as pd

In [39]:
vec_list = []

for chunk in tqdm(corpus):
    inferred_vector = model.infer_vector(chunk)
    vec_list.append(inferred_vector)

100%|████████████████████████████████████████████████████████████████████████████| 31840/31840 [31:33<00:00, 16.82it/s]


In [40]:
vec_list[:4]

[array([-2.08501840e+00,  1.93632042e+00, -1.77423984e-01,  1.28764153e+00,
         1.26323259e+00,  7.72830069e-01,  2.39116740e+00,  2.95062232e+00,
        -8.03535759e-01, -2.44617963e+00,  6.22525632e-01, -1.07309985e+00,
        -2.98933566e-01,  2.66819686e-01, -6.86719358e-01,  3.18083704e-01,
         2.25975299e+00, -6.04783416e-01, -1.70413566e+00, -2.63604760e+00,
        -2.27187133e+00, -1.62250495e+00, -1.58776894e-01,  3.84640241e+00,
        -8.52923036e-01, -8.50878134e-02,  3.57617348e-01, -3.90986800e-01,
         2.30350807e-01, -1.79222494e-01,  1.64538491e+00,  1.76426196e+00,
        -4.85340446e-01,  8.34426463e-01,  3.55873322e+00,  1.56932485e+00,
         9.45953131e-01, -1.94848466e+00, -8.87100697e-02, -3.33246589e-01,
         1.07818329e+00, -4.97678220e-02, -5.61580479e-01, -3.05047345e+00,
        -2.68679261e-01, -1.76228926e-01,  5.14984548e-01, -1.00536084e+00,
         1.04030931e+00, -8.37242484e-01, -8.25062811e-01,  3.63217056e-01,
        -1.0

In [41]:
oeuvre_list = []
for filename in os.listdir(path_corpus):
    oeuvre_list.append(filename)

In [60]:
data_df = pd.DataFrame(list(zip(oeuvre_list, vec_list)), columns=['oeuvre', "vecteur"])

data_df

Unnamed: 0,oeuvre,vecteur
0,1850_Bassanville-Anais-de_Le-Soir-et-le-matin-...,"[-2.0850184, 1.9363204, -0.17742398, 1.2876415..."
1,1850_Bassanville-Anais-de_Le-Soir-et-le-matin-...,"[-0.7951892, 0.52173185, -0.25921324, 1.631550..."
2,1850_Bassanville-Anais-de_Le-Soir-et-le-matin-...,"[-1.3432347, 0.4234592, -0.547549, 1.2612169, ..."
3,1850_Bassanville-Anais-de_Le-Soir-et-le-matin-...,"[1.614226, 0.90517324, -1.2468369, 0.37093368,..."
4,1850_Bassanville-Anais-de_Le-Soir-et-le-matin-...,"[-0.56765395, 2.8439364, -3.042496, 2.8399227,..."
...,...,...
31835,1885_Guy-de-Maupassant_Contes-du-jour-et-de-la...,"[-1.635327, 0.41148907, -0.972693, 1.7262673, ..."
31836,1885_Guy-de-Maupassant_Contes-du-jour-et-de-la...,"[-0.9986554, 0.3439337, 0.65486234, 2.2974365,..."
31837,1885_Guy-de-Maupassant_Contes-du-jour-et-de-la...,"[-1.4379919, 0.5551676, -1.0150168, 1.5651331,..."
31838,1885_Guy-de-Maupassant_Contes-du-jour-et-de-la...,"[0.2016208, -0.40469047, 2.2544434, 1.6676745,..."


In [63]:
data_df["vecteur"][0]

array([-2.08501840e+00,  1.93632042e+00, -1.77423984e-01,  1.28764153e+00,
        1.26323259e+00,  7.72830069e-01,  2.39116740e+00,  2.95062232e+00,
       -8.03535759e-01, -2.44617963e+00,  6.22525632e-01, -1.07309985e+00,
       -2.98933566e-01,  2.66819686e-01, -6.86719358e-01,  3.18083704e-01,
        2.25975299e+00, -6.04783416e-01, -1.70413566e+00, -2.63604760e+00,
       -2.27187133e+00, -1.62250495e+00, -1.58776894e-01,  3.84640241e+00,
       -8.52923036e-01, -8.50878134e-02,  3.57617348e-01, -3.90986800e-01,
        2.30350807e-01, -1.79222494e-01,  1.64538491e+00,  1.76426196e+00,
       -4.85340446e-01,  8.34426463e-01,  3.55873322e+00,  1.56932485e+00,
        9.45953131e-01, -1.94848466e+00, -8.87100697e-02, -3.33246589e-01,
        1.07818329e+00, -4.97678220e-02, -5.61580479e-01, -3.05047345e+00,
       -2.68679261e-01, -1.76228926e-01,  5.14984548e-01, -1.00536084e+00,
        1.04030931e+00, -8.37242484e-01, -8.25062811e-01,  3.63217056e-01,
       -1.07672071e+00,  

In [64]:
import pickle

data_df.to_pickle("../data_df.pkl")