# Building word2vec and fasttext models 

This notebook builds the word2vec and fastText word-embedding models using the gensim package. The text is first split into sentences, and then into words, using spaCy.

In [None]:
import pickle
from gensim.models import FastText, Word2Vec
import time

In [None]:
# spacy tokenizer for word and sentence tokenization
import spacy

# Load the "en" spaCy model with the parser, tagger and NER components disabled;
# only tokenization is needed from the model itself.
# NOTE(review): the "en" shortcut name and `nlp.create_pipe` look like the
# spaCy v2 API -- confirm the pinned spaCy version before upgrading.
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
# With the parser disabled, add a 'sentencizer' pipe; use_spacy() relies on
# sentence boundaries when it iterates over `.sents`.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

def use_spacy(s, TYPE):
    """Tokenize text with the module-level spaCy pipeline ``nlp``.

    Args:
        s: Text to process.
        TYPE: ``'word'`` to get the text of every token, or ``'sent'`` to
            get the whitespace-stripped text of every sentence.

    Returns:
        A list of strings: one per token for ``'word'``, one per sentence
        for ``'sent'``.

    Raises:
        ValueError: If ``TYPE`` is neither ``'word'`` nor ``'sent'``.
    """
    # Validate first so a typo fails loudly (and before the pipeline runs)
    # instead of silently returning None.
    if TYPE not in ('word', 'sent'):
        raise ValueError("TYPE must be 'word' or 'sent', got %r" % (TYPE,))

    res = nlp(s)

    if TYPE == 'word':
        return [t.text for t in res]

    # Use `Span.text` rather than `Span.string`: `.string` was removed in
    # spaCy 3, while `.text` exists in both v2 and v3. After `.strip()` the
    # result is the same.
    return [sent.text.strip() for sent in res.sents]



In [None]:
# load data, which is the corpus as a dict of reports
# The path is left blank here and must be filled in before running.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
# The `with` block makes sure the file handle is closed after loading.
with open('', 'rb') as corpus_file:
    dict_text = pickle.load(corpus_file)

In [None]:
start_time = time.time()

# Step 1: split each report (a value of dict_text; assumed to be a string --
# TODO confirm) into sentences, giving one list of sentence strings per report.
tokenized_doc_list = [use_spacy(v, 'sent') for v in dict_text.values()]
# Step 2: flatten into a single corpus-wide list of sentences; report
# boundaries are dropped here.
sentence_list = [sent for doc in tokenized_doc_list for sent in doc]
# Step 3: word-tokenize every sentence, giving a list of token lists. This is
# the training input passed to FastText and Word2Vec below.
tokenized_sentences_by_word = [use_spacy(sent, 'word') for sent in sentence_list]

# Report the wall-clock time taken by the three tokenization passes.
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()

# Train a FastText model on the word-tokenized sentences, then save it.
# Parameter meanings, per the gensim documentation:
#   size=1200       dimensionality of the word vectors
#   sg=1            skip-gram training (0 would be CBOW)
#   window=7        maximum distance between the current and predicted word
#   seed=2018       seed for the random number generator
#   min_count=5     ignore words with a total frequency below this
#   sorted_vocab=1  sort the vocabulary by descending frequency
#   min_n=3, max_n=8  minimum / maximum length of character n-grams
#   word_ngrams=1   use subword (n-gram) information
#   workers=10      number of worker threads
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) -- confirm the pinned version. Also, with workers > 1 the seed
# alone presumably does not make training fully reproducible; verify.
model_ft = FastText(tokenized_sentences_by_word, size = 1200, sg = 1, window = 7, seed = 2018,
                 min_count = 5, sorted_vocab = 1, min_n = 3, max_n = 8, word_ngrams = 1, workers = 10)
# The output path is left blank here and must be filled in before running.
model_ft.save('')
# Drop the reference to the trained model (presumably to free memory before
# the next model is trained).
del(model_ft)

# The reported time covers both training and saving.
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()

# Train a Word2Vec model on the same word-tokenized sentences, then save it.
# Parameter meanings, per the gensim documentation:
#   size=1200       dimensionality of the word vectors
#   sg=1            skip-gram training (0 would be CBOW)
#   window=7        maximum distance between the current and predicted word
#   seed=2018       seed for the random number generator
#   min_count=5     ignore words with a total frequency below this
#   sorted_vocab=1  sort the vocabulary by descending frequency
#   workers=10      number of worker threads
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) -- confirm the pinned version.
model_w2v = Word2Vec(tokenized_sentences_by_word, size = 1200, sg = 1, window = 7, seed = 2018,
                    min_count = 5, sorted_vocab = 1, workers = 10)
# The output path is left blank here and must be filled in before running.
model_w2v.save('') 
# Drop the reference to the trained model once it has been saved.
del(model_w2v)

# The reported time covers both training and saving.
print("--- %s seconds ---" % (time.time() - start_time))