# Training Doc2Vec model
1. Preprocess the news articles into tokens.
2. Train the Doc2Vec model using the gensim package.

In [1]:
import pandas as pd
import numpy as np
# Read the preprocessed news frame from its pickle and preview the first rows.
# NOTE(review): unpickling executes code embedded in the file, so this should
# only ever be pointed at a pickle produced by a trusted pipeline.
news_pickle_path = '../../../Data/ProcessedNews.pkl'
data = pd.read_pickle(news_pickle_path)
data.head()

Unnamed: 0,headline,date,content,tokens,tokens_remove_stopwords,content_length,le_content
0,Last-minute Christmas rush lifts UK retail sales,2017-01-10 00:00:00,Industry figures add to signs that the economy...,"[industry, figures, add, to, signs, that, the,...","[industry, figures, add, signs, economy, ended...",385.0,"[industry, figure, add, sign, economy, ended, ..."
1,Guarantee minimum wage for gig economy workers...,2017-01-31 00:00:00,Labour MP wants government to set up national ...,"[labour, mp, wants, government, to, set, up, n...","[labour, mp, wants, government, set, national,...",325.0,"[labour, mp, want, government, set, national, ..."
2,,2017-01-27 00:00:00,IG traders expect US markets to open higher US...,"[ig, traders, expect, us, markets, to, open, h...","[ig, traders, expect, us, markets, open, highe...",23.0,"[ig, trader, expect, u, market, open, higher, ..."
3,Why the UK economy could fare better in 2017 t...,2017-01-01 00:00:00,"From house prices to exports, there are reason...","[from, house, prices, to, exports, there, are,...","[house, prices, exports, reasons, cheerful, ye...",667.0,"[house, price, export, reason, cheerful, year,..."
4,,2017-02-08 00:00:00,"On the US oil stock numbers, David Morrison, s...","[on, the, us, oil, stock, numbers, david, morr...","[us, oil, stock, numbers, david, morrison, sen...",235.0,"[u, oil, stock, number, david, morrison, senio..."


In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import gensim

# Each row of `le_content` is already a list of tokens (see the frame preview
# above), so it is handed to Doc2Vec as-is.  Joining the tokens back into a
# string and running word_tokenize over it again was redundant work across the
# whole corpus, could re-split tokens that were already final, and made
# training depend on NLTK tokenizer data being installed.
token_lst = list(data['le_content'])

# Tag every document with its row position (as a string) so the trained
# vectors can be matched back to the rows of `data` afterwards.
tagged_data = [
    TaggedDocument(words=list(tokens), tags=[str(i)])
    for i, tokens in enumerate(token_lst)
]

# Doc2Vec model: 10-dimensional document vectors; words occurring fewer than
# 10 times are ignored.  alpha and epochs are intentionally left at the
# library defaults.
model = Doc2Vec(vector_size=10, workers=24, min_count=10)

model.build_vocab(tagged_data)

# train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# save model
model.save("doc2vec_model")

In [3]:
# Sanity check: infer an embedding for text that was not in the training data.
# The sentence is lower-cased and tokenised first because infer_vector takes a
# list of tokens.  (To start from the persisted model instead of the in-memory
# one, load "doc2vec_model", the file written by the training cell.)
unseen_text = "I love chatbots"
test_data = word_tokenize(unseen_text.lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [ 0.04678507  0.02458915  0.0863471  -0.05936243  0.02843884 -0.05354042
 -0.09625445 -0.04301103  0.00876733  0.12412249]


In [4]:
# Reload the persisted model from disk, then gather one trained document
# vector per tagged document, in corpus order.
model = Doc2Vec.load("doc2vec_model")
vecs = [model.docvecs[doc_idx] for doc_idx in range(len(tagged_data))]

In [5]:
# Attach one trained document vector per row as a new "vec" column.  `vecs`
# was collected in corpus order, and the corpus was built by enumerating
# data['le_content'], so position i in `vecs` belongs to row i of `data`.
data["vec"] = vecs
data.head()

Unnamed: 0,headline,date,content,tokens,tokens_remove_stopwords,content_length,le_content,vec
0,Last-minute Christmas rush lifts UK retail sales,2017-01-10 00:00:00,Industry figures add to signs that the economy...,"[industry, figures, add, to, signs, that, the,...","[industry, figures, add, signs, economy, ended...",385.0,"[industry, figure, add, sign, economy, ended, ...","[0.64053744, 0.17488109, 0.91740817, -1.005388..."
1,Guarantee minimum wage for gig economy workers...,2017-01-31 00:00:00,Labour MP wants government to set up national ...,"[labour, mp, wants, government, to, set, up, n...","[labour, mp, wants, government, set, national,...",325.0,"[labour, mp, want, government, set, national, ...","[0.70107186, -0.8586756, 0.36855114, -0.023900..."
2,,2017-01-27 00:00:00,IG traders expect US markets to open higher US...,"[ig, traders, expect, us, markets, to, open, h...","[ig, traders, expect, us, markets, open, highe...",23.0,"[ig, trader, expect, u, market, open, higher, ...","[0.059748065, -0.0040600086, 0.11090677, -0.16..."
3,Why the UK economy could fare better in 2017 t...,2017-01-01 00:00:00,"From house prices to exports, there are reason...","[from, house, prices, to, exports, there, are,...","[house, prices, exports, reasons, cheerful, ye...",667.0,"[house, price, export, reason, cheerful, year,...","[0.6242375, -0.79717505, 0.3246571, -0.5324528..."
4,,2017-02-08 00:00:00,"On the US oil stock numbers, David Morrison, s...","[on, the, us, oil, stock, numbers, david, morr...","[us, oil, stock, numbers, david, morrison, sen...",235.0,"[u, oil, stock, number, david, morrison, senio...","[0.48175463, 0.4265749, 0.27786314, -0.5772213..."


In [6]:
# Confirm the frame's dimensions now that the "vec" column has been added.
data.shape

(66034, 8)