In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


Doc2Vec Model
=============

Tutorial: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html




In [103]:
import os
from tqdm import tqdm
import gensim
import glob
import numpy as np

In [72]:
from nltk.tokenize import word_tokenize

def read_corpus(directory, tokens_only=False):
    """Lazily tokenize every file found in ``directory``.

    Each entry returned by ``os.listdir(directory)`` is opened as UTF-8 text
    and split with NLTK's ``word_tokenize`` using the French model.

    Parameters
    ----------
    directory : str
        Folder whose entries are read one by one.
    tokens_only : bool, default False
        When true, yield the bare token list; otherwise yield a gensim
        ``TaggedDocument`` whose single tag is the entry's position in the
        ``os.listdir`` enumeration.

    Yields
    ------
    list of str or gensim.models.doc2vec.TaggedDocument
    """
    for position, entry in tqdm(enumerate(os.listdir(directory))):
        entry_path = os.path.join(directory, entry)
        with open(entry_path, encoding="utf8") as handle:
            raw_text = handle.read()
            words = word_tokenize(raw_text, language="french")
            if not tokens_only:
                # Training documents carry their listing position as tag.
                yield gensim.models.doc2vec.TaggedDocument(words, [position])
            else:
                yield words

In [73]:
# Resolve each corpus folder to an absolute path: take the real path of the
# relative location, keep its parent, and re-append the folder name.
resolved_train = os.path.realpath("../D2V_corpus/train")
dir_path_train = os.path.join(os.path.dirname(resolved_train), "train")
resolved_test = os.path.realpath("../D2V_corpus/test")
dir_path_test = os.path.join(os.path.dirname(resolved_test), "test")

# Materialise both generators: tagged documents for training, plain token
# lists for the held-out test set.
train_corpus = [tagged_doc for tagged_doc in read_corpus(dir_path_train)]
test_corpus = [tokens for tokens in read_corpus(dir_path_test, tokens_only=True)]

2483it [00:23, 103.83it/s]
694it [00:07, 97.00it/s] 


Training the Model
------------------

In [76]:
# Instantiate an untrained Doc2Vec model with vector_size=50, min_count=2 and
# epochs=40; vocabulary and weights are filled in by the following cells.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [77]:
# Build the model's vocabulary from the tagged training documents.
model.build_vocab(train_corpus)

In [79]:
# Train on the tagged training corpus, reusing the document count and the
# epoch count already stored on the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Examples

In [163]:
### Helpers for visualizing examples

# Index -> filename lookups. The index is the position in os.listdir(), i.e.
# the same enumeration read_corpus uses to tag training documents, so both
# rely on os.listdir returning the entries in the same order.
train_filename_dict = dict(enumerate(os.listdir("../D2V_corpus/train")))
test_filename_dict = dict(enumerate(os.listdir("../D2V_corpus/test")))

def print_by_ix(ix, train=True):
    """Print the full text of the corpus file registered under index ``ix``.

    Parameters
    ----------
    ix : int
        Key into ``train_filename_dict`` or ``test_filename_dict``.
    train : bool, default True
        When equal to ``True`` the file is looked up in, and read from, the
        train folder; any other value selects the test folder.
    """
    if train == True:
        directory, filename_dict = '../D2V_corpus/train/', train_filename_dict
    else:
        directory, filename_dict = '../D2V_corpus/test/', test_filename_dict

    file_path = directory + filename_dict[ix]
    with open(file_path, 'r', encoding='utf8') as handle:
        contents = handle.read()
        print(contents)
        
        
def similarity_example(n, test=True, print_contents=False):
    """Print ``n`` randomly drawn documents next to their closest training document.

    Parameters
    ----------
    n : int
        Number of documents to draw. Indices are drawn with
        ``np.random.choice`` defaults, so the same document may appear more
        than once.
    test : bool, default True
        If true, draw from ``test_corpus`` and query the model with a vector
        inferred from the document's tokens. Otherwise draw from
        ``train_corpus`` and query the model by the document's own tag.
    print_contents : bool, default False
        If true, also print the full text of both documents, not only their
        filenames.

    Notes
    -----
    The most similar document always comes from the training set, so its name
    and contents are looked up in the train lookup/directory. The drawn
    document is looked up in the test or train lookup/directory depending on
    ``test``.
    """
    if test:
        corpus, query_filename_dict = test_corpus, test_filename_dict
    else:
        corpus, query_filename_dict = train_corpus, train_filename_dict

    sample = np.random.choice(range(len(corpus)), n)
    for ix in sample:
        if test:
            # Unseen document: embed its tokens before querying.
            query = model.infer_vector(test_corpus[ix])
        else:
            # Training document: its tag is already a key of model.dv.
            query = ix
        most_similar_ix = model.dv.most_similar(query, topn=1)[0][0]

        print(f'\n\nTEXT NR {ix}:\n {query_filename_dict[ix]}\n')
        if print_contents:
            # Read the drawn document from the folder it was sampled from.
            print_by_ix(ix, train=not test)
            print('\n\n')
        print(f'MOST SIMILAR:\n {train_filename_dict[most_similar_ix]}\n')
        if print_contents:
            print_by_ix(most_similar_ix)

In [164]:
# Show 30 randomly drawn test documents with their nearest training document
# (defaults: test=True, print_contents=False, so only filenames are printed).
similarity_example(30)



TEXT NR 75:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-II_canon_CHUNK_19.txt

MOST SIMILAR:
 1850_Sue-Eugene_Les-Mysteres-du-peuple-Tome-V_canon_CHUNK_47.txt



TEXT NR 653:
 1850_Dumas-Alexandre_Le-Vicomte-de-Bragelonne_canon_CHUNK_223.txt

MOST SIMILAR:
 1850_Dumas-Alexandre_Le-Vicomte-de-Bragelonne_canon_CHUNK_22.txt



TEXT NR 470:
 1850_Dumas-Alexandre_La-Tulipe-noire_canon_CHUNK_37.txt

MOST SIMILAR:
 1851_Dumas-Alexandre-Pere_Olympe-de-Cleves-I_canon_CHUNK_12.txt



TEXT NR 167:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-I_canon_CHUNK_21.txt

MOST SIMILAR:
 1850_Dumas-Alexandre_Le-Vicomte-de-Bragelonne_canon_CHUNK_179.txt



TEXT NR 123:
 1850_Dumas-Alexandre-Pere_Dieu-dispose-II_canon_CHUNK_62.txt

MOST SIMILAR:
 1850_Lamartine-Alphonse-de_Genevieve--histoire-d-une-servante_canon_CHUNK_39.txt



TEXT NR 395:
 1850_Dumas-Alexandre-Pere_Le-Trou-de-l-Enfer_canon_CHUNK_59.txt

MOST SIMILAR:
 1850_Dumas-Alexandre-Pere_La-colombe_canon_CHUNK_15.txt



TEXT NR 255:
 1850_Dumas-Alexandre-P

Testing the Model
-----------------

In [84]:
import random

# Draw one random test document and embed it with the trained model
doc_id = random.randint(0, len(test_corpus) - 1)
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)
# Rank every training document vector against the inferred vector
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Show the test document, then the best, middle and worst ranked training documents
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
rank_positions = {'MOST': 0, 'MEDIAN': len(sims)//2, 'LEAST': len(sims) - 1}
for label, index in rank_positions.items():
    hit = sims[index]
    hit_words = train_corpus[hit[0]].words
    print(u'%s %s: «%s»\n' % (label, hit, ' '.join(hit_words)))

Test Document (477): «il voir dans le décret sur le prêtre « dire il rendre contre le conscience public il voir dans le décret sur le émigré « rendre contre le lien de famille un moyen d ’ essayer de pouvoir de roi le club de feuillant prépara et le directoire de pari signer contre ce décret un protestation dans lequel on prier louis xvi d ’ apposer son veto au décret concerner le prêtre on se rappeler que le constitution réserver à louis xvi ce droit de veto qui signer ce protestation l ’ homme qui le premier avoir attaqué le clergé le méphistophélè qui de son pied bot avoir cassé le glace talleyrand l ’ homme qui avoir faire depuis de le diplomatie à le loupe ne voir pas toujours très clair en révolution le bruit de veto se répandit d ’ avance le cordelier lancèrent en avant camille desmoulin ce lancier de le révolution qu ’ on trouve toujours prêt à planter son pique en plein but lui aussi faire son pétition mais bredouilleur impossible quand il essayer de 