## Train word2vec from raw article IV reports using Gensim

#### Terminologies:
1. raw_doc: unprocessed raw document from txt file

#### Features:
1. filter out certain punctuations
2. replace numeric values with "numeric_value"
3. choose embedding size of 100 due to small corpus size

### Load dictionary and pre-built functions

In [106]:
import sys
import os

import re
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import matplotlib.pyplot as plt

import gensim
import spacy
from gensim.models.word2vec import Word2Vec

In [21]:
## global folder path 
data_folder = '../../data/'
model_folder = '../../model/'
raw_data_path = os.path.join(data_folder,'raw/article_IV_corpus.txt')
data_processed_folder = os.path.join(data_folder,'processed')
results_folder = os.path.join(data_folder,'results','topic_model_results')

nlp = spacy.load('en')
#nlp.pipeline = [nlp.tagger, nlp.sentencizer]

### Load and process original text using spacy

In [3]:
with open(raw_data_path,'r',encoding='utf8') as f:
    raw_doc = f.readlines()
    raw_doc = [l.strip(' \n') for l in raw_doc if len(l)>50]

print('Length of raw documents {}'.format(len(raw_doc)))

Length of raw documents 142564


In [99]:
def clean_sentence(sent):
    '''remove punctuations in sentences
       and reduce to lemma form'''
    
    sent = [word.lemma_ for word in sent if word.pos_ not in ('PUNCT')]
    sent = [re.sub('[0-9]*.*[0-9]','numeric_value', word) for word in sent]
    
    return sent


def prepare_data(raw_doc):
    '''filter and lemmantize using spacy'''
    lemma_doc = []
    
    for paragraph in raw_doc:        
        doc = nlp(paragraph)
        sents = list(doc.sents)
        sentence = list(map(lambda x: clean_sentence(x), sents))
        lemma_doc.append(sentence)
    
    ## flatten
    lemma_doc = [s for l in lemma_doc for s in l ]

    return lemma_doc

start_time = time.time()
processed_doc = prepare_data(raw_doc = raw_doc)
print("--{} seconds --".format(time.time()- start_time))

--2253.9872109889984 seconds --


In [127]:
pickle_out = open(os.path.join(data_processed_folder,'processed_doc_for_word2vec_training.p'),"wb")
pickle.dump(processed_doc, pickle_out)
pickle_out.close()

### Train w2v model using gensim

#### initialize model and build vocabulary 

In [117]:
n_dim = 100
window = 7 
downsampling = 1e-5
seed = 1
num_workers = os.cpu_count()    ## not sure if this is a good idea
min_count = 30 

aiv_w2v = Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=n_dim,
    min_count=min_count,
    window= window,
    sample=downsampling
)

## build the vocabulary
aiv_w2v.build_vocab(processed_doc)
corpus_count = aiv_w2v.corpus_count

#### train w2v model 

In [125]:
iteration = 200

start_time = time.time()

if gensim.__version__[0] =='1':
    aiv_w2v.train(processed_doc)
else:
    aiv_w2v.train(processed_doc,total_examples=corpus_count,epochs = iteration)
    
print("--{} seconds --".format(time.time()- start_time))

--574.172180891037 seconds --


#### save model

In [126]:
## save trained word2 to vect model 
aiv_w2v.save(os.path.join(model_folder,'word2vec','aiv.w2v'))

#### some test

In [130]:
len(vocabs)

6728

In [129]:
model = aiv_w2v.wv
vocabs = model.vocab.keys()
model.most_similar('corruption',topn=20)

[('fight', 0.7006235122680664),
 ('anti', 0.6929948329925537),
 ('anticorruption', 0.6784272789955139),
 ('governance', 0.6521619558334351),
 ('fraud', 0.6335301995277405),
 ('crime', 0.6289873123168945),
 ('judiciary', 0.6151541471481323),
 ('judicial', 0.6075106859207153),
 ('investigation', 0.6053670048713684),
 ('combat', 0.594965934753418),
 ('evasion', 0.555034339427948),
 ('perception', 0.5436075329780579),
 ('prosecution', 0.5430848002433777),
 ('corrupt', 0.5416703224182129),
 ('climate', 0.535169780254364),
 ('prosecute', 0.5312641263008118),
 ('laundering', 0.5267109870910645),
 ('procurement', 0.5227187275886536),
 ('enforcement', 0.5216412544250488),
 ('investigate', 0.5117053985595703)]