In [1]:
import os
import numpy as np 
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/vtex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def TextTreatment(text):
    
    text = RemoveLowerCase(text)
    text = RemovePonctuation(text)
    text = RemoveUnderlines(text)
    text = RemoveStopWords(text)
    text = Lemmatize(text, FrenchStemmer())
    
    # Things to consider: rare words removal 
    
    return text

In [3]:
def RemoveLowerCase(text):
    return ' '.join([x.lower() for x in text.split()])

In [4]:
def RemovePonctuation (text):
    return re.sub('\W+',' ', text)

In [5]:
def RemoveUnderlines (text):
    return re.sub('_','', text)

In [6]:
def RemoveStopWords(text):
    stop = stopwords.words('french', 'english')
    return ' '.join([x for x in text.split(' ') if x not in stop])

In [7]:
def Lemmatize(text, stemmer):
    return ' '.join([stemmer.stem(w) for w in text.split(' ')])

In [8]:
path = '../data/raw/node_information/text'
no_nodes = 33226

files = []
for i in range(no_nodes):
    files.append(os.path.join(path, str(i) + '.txt'))

In [9]:
documents = []

for i in range(len(files)):
    
    if i>0 and i%1000 == 0:
        print('Reading file number ' + str(i))
    
    f = open(files[i], "r", errors = 'ignore')
    text = f.read()
    text = TextTreatment(text)
    documents.append(text)

Reading file number 1000
Reading file number 2000
Reading file number 3000
Reading file number 4000
Reading file number 5000
Reading file number 6000
Reading file number 7000
Reading file number 8000
Reading file number 9000
Reading file number 10000
Reading file number 11000
Reading file number 12000
Reading file number 13000
Reading file number 14000
Reading file number 15000
Reading file number 16000
Reading file number 17000
Reading file number 18000
Reading file number 19000
Reading file number 20000
Reading file number 21000
Reading file number 22000
Reading file number 23000
Reading file number 24000
Reading file number 25000
Reading file number 26000
Reading file number 27000
Reading file number 28000
Reading file number 29000
Reading file number 30000
Reading file number 31000
Reading file number 32000
Reading file number 33000


# TF-IDF 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import scipy

In [None]:
## Create Vocabulary
vocabulary = set()
for i, doc in enumerate(documents):
    vocabulary.update(doc.split(' '))
    
    if i>0 and i%1000 == 0:
        print('Reading document number ' + str(i))
    
    
vocabulary = list(vocabulary)

In [None]:
len(vocabulary)

In [None]:
# Intializating the tfIdf model
tfidf = TfidfVectorizer(vocabulary=vocabulary)

# Fit the TfIdf model
tfidf.fit(documents)

# Transform the TfIdf model
tfidf_tran=tfidf.transform(documents)

In [None]:
#Save sparse matrix for future use
scipy.sparse.save_npz("../results/models/tf-idf_matrix", tfidf_tran)

# Doc2Vec Model Train

In [10]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

In [None]:
max_epochs = 30
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1,
                workers=12)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch+1))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("../results/models/doc2vec/vec100.model")
print("Model Saved")

iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
