# Training SVD and doc2vec Models (Phase 4)

#### Input:
   Various parameters found in EDA.
#### Output:
   Trained models and their embedding matrices, saved to disk.
#### Algorithm:
   a) Fit a TfidfVectorizer, then an SVD with enough components for the explained variance ratio to exceed 0.95. <br>
   b) Train doc2vec from gensim for topic modelling.<br>
   c) SVD is used to penalise queries that fall outside the domain. <br>
   d) Doc2vec is used to rank documents by their similarity scores.<br>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import joblib
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import get_tmpfile

In [2]:
class Train_models():
    """Train and persist the TF-IDF, SVD and Doc2Vec models.

    Constructing an instance runs the whole pipeline: it loads
    ``dataset_processed.csv``, fits a TF-IDF vectorizer on the ``keywords``
    column, fits a truncated SVD on the TF-IDF matrix and trains a Doc2Vec
    model on the same texts. Call :meth:`save_models` afterwards to write
    everything under ``./model/``.

    Args:
        min_count: Minimum document frequency for TF-IDF terms (``min_df``)
            and minimum word frequency for the Doc2Vec vocabulary.
        n_components: Number of SVD components to keep. Defaults to 1356,
            the value this class previously hard-coded in ``train_svd``.
        batch_size: Stored on the instance; not used by any method here.

    Attributes:
        df: The loaded dataset with columns ``files``, ``keywords``,
            ``weights`` (plus the ``index`` column added by ``reset_index``).
        tf: Fitted ``TfidfVectorizer``.
        svd: Fitted ``TruncatedSVD``.
        svd_feature_matrix: SVD document embeddings, indexed by ``files``.
        dv: Trained ``Doc2Vec`` model.
        doc2vec_feature_matrix: Doc2Vec document vectors, indexed by ``files``.
    """

    def __init__(self, min_count=25, n_components=1356, batch_size=10):
        self.n_components = n_components
        self.batch_size = batch_size
        self.df = pd.read_csv("dataset_processed.csv", names=["files", "keywords", "weights"])
        self.df.dropna(inplace=True)
        # NOTE(review): without drop=True the old row labels are kept as an
        # extra "index" column, which also ends up in dataset.pkl. Confirm
        # that downstream consumers expect it before changing this.
        self.df.reset_index(inplace=True)
        self.tf = self.train_vectorizer(min_count=min_count)
        self.svd, self.svd_feature_matrix = self.train_svd()
        self.dv, self.doc2vec_feature_matrix = self.train_doc2vec(min_count=min_count)

    def save_models(self):
        """Write the fitted models, embedding matrices and dataset to ``./model/``.

        The directory is created if it does not exist yet, so a fresh
        checkout does not fail on the first dump.
        """
        import os  # local import so this method does not rely on a file-level one

        os.makedirs("./model", exist_ok=True)
        joblib.dump(self.tf, "./model/tfidf_model.pkl")
        joblib.dump(self.svd, "./model/svd_model.pkl")
        joblib.dump(self.svd_feature_matrix, "./model/lsa_embeddings.pkl")
        self.dv.save("./model/doc2vec_model")
        joblib.dump(self.doc2vec_feature_matrix, "./model/doc2vec_embeddings.pkl")
        joblib.dump(self.df, "./model/dataset.pkl")
        print("All Models saved")

    def train_vectorizer(self, min_count):
        """Fit a word-level TF-IDF vectorizer (1- to 3-grams) on ``keywords``.

        Args:
            min_count: Passed to ``TfidfVectorizer`` as ``min_df``.

        Returns:
            The fitted ``TfidfVectorizer``.
        """
        tf = TfidfVectorizer(analyzer="word", min_df=min_count, ngram_range=(1, 3), stop_words="english")
        # Fit first so the progress message is only printed once training
        # has actually succeeded.
        tf.fit(self.df["keywords"])
        print("Tfidf-trained")
        return tf

    def train_svd(self):
        """Fit a truncated SVD on the TF-IDF matrix of ``keywords``.

        The number of components comes from ``self.n_components``.

        Returns:
            A ``(svd, svd_feature_matrix)`` tuple: the fitted ``TruncatedSVD``
            and a DataFrame of the projected documents indexed by ``files``.
        """
        tfidf_matrix = self.tf.transform(self.df["keywords"])
        svd = TruncatedSVD(n_components=self.n_components)
        print("Training SVD")
        latent_matrix = svd.fit_transform(tfidf_matrix)
        svd_feature_matrix = pd.DataFrame(latent_matrix, index=self.df["files"])
        print("SVD and SVD matrix trained")
        return svd, svd_feature_matrix

    def train_doc2vec(self, min_count):
        """Train a Doc2Vec model on the tokenized ``keywords`` texts.

        Each document is tagged with its positional index in ``self.df``.

        Args:
            min_count: Passed to ``Doc2Vec`` as ``min_count``.

        Returns:
            A ``(doc2vec, doc2vec_feature_matrix)`` tuple: the trained model
            and a DataFrame of the document vectors indexed by ``files``.
        """
        print("Trianing doc2vec")
        # Each keywords entry is already a single string, so it is tokenized
        # directly (joining its characters with "" would be a no-op).
        tagged_data = [
            TaggedDocument(words=word_tokenize(text), tags=[i])
            for i, text in enumerate(self.df["keywords"])
        ]
        doc2vec = Doc2Vec(vector_size=62, hs=1, sample=0, min_count=min_count, epochs=10, seed=0, window=3, dm=1)
        doc2vec.build_vocab(tagged_data)
        # Reuse the epoch count configured on the model instead of repeating
        # the literal.
        doc2vec.train(tagged_data, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)
        # NOTE(review): `docvecs.vectors_docs` is the gensim 3.x accessor;
        # gensim 4 appears to expose the same array as `dv.vectors`. Confirm
        # against the pinned gensim version before upgrading.
        doc2vec_feature_matrix = pd.DataFrame(doc2vec.docvecs.vectors_docs, index=self.df["files"])
        print("Doc2vec and Doc2vec matrix trained")
        return doc2vec, doc2vec_feature_matrix

In [3]:
%%time
# Instantiating Train_models runs the full pipeline: the constructor loads
# dataset_processed.csv and fits the TF-IDF, SVD and Doc2Vec models.
train_models = Train_models()

Tfidf-trained
Training SVD
SVD and SVD matrix trained
Trianing doc2vec
Doc2vec and Doc2vec matrix trained
CPU times: user 2min 57s, sys: 18.4 s, total: 3min 15s
Wall time: 1min 37s


In [4]:
%%time
# Write the fitted models, embedding matrices and dataset under ./model/.
train_models.save_models()

All Models saved
CPU times: user 1.48 s, sys: 1.16 s, total: 2.65 s
Wall time: 6.45 s


# End of phase 4