# Information Retrieval (Phase 5)

# In[1]:
import pandas as pd
import numpy as np
import joblib
import pickle
from gensim.models.doc2vec import Doc2Vec
from gensim.test.utils import get_tmpfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from multiprocessing import Pool
from scipy import sparse
import re
from dask import delayed
import dask.dataframe as dd
import os
import gc

# In[2]:
class Information_Retrieval_Model():
    """Score dataset documents against a free-text query.

    Two similarity signals are computed for every dataset row and then
    combined per file:

    * a bag-of-words signal: TF-IDF vector reduced by the fitted SVD model,
      compared with the precomputed LSA embeddings;
    * a semantic signal: Doc2Vec inferred vector, compared with the
      precomputed Doc2Vec embeddings.

    Both embedding tables must carry a ``files`` column after their index is
    reset, and ``self.df`` must provide ``files`` and ``weights`` columns with
    one row per embedding row, in the same order (rows are matched by
    position).
    """

    def __init__(self, model_dir="./model"):
        """Load the fitted models, precomputed embeddings and dataset.

        Args:
            model_dir: Directory holding ``doc2vec_model``,
                ``tfidf_model.pkl``, ``svd_model.pkl``,
                ``lsa_embeddings.pkl``, ``doc2vec_embeddings.pkl`` and
                ``dataset.pkl``. Defaults to ``"./model"``, resolved against
                the current working directory.
        """
        # NOTE(review): joblib.load unpickles arbitrary objects -- only point
        # model_dir at artifacts you trust.
        self.dv = Doc2Vec.load(os.path.join(model_dir, "doc2vec_model"))
        self.tf = joblib.load(os.path.join(model_dir, "tfidf_model.pkl"))
        self.svd = joblib.load(os.path.join(model_dir, "svd_model.pkl"))

        # reset_index() turns the stored index into an ordinary column and
        # leaves a 0..N-1 positional index behind.
        # Presumably the stored index is the "files" label that
        # get_similarity_scores() drops -- verify against the training code.
        self.svd_feature_matrix = joblib.load(
            os.path.join(model_dir, "lsa_embeddings.pkl"))
        self.svd_feature_matrix.reset_index(inplace=True)
        self.doctovec_feature_matrix = joblib.load(
            os.path.join(model_dir, "doc2vec_embeddings.pkl"))
        self.doctovec_feature_matrix.reset_index(inplace=True)

        self.df = joblib.load(os.path.join(model_dir, "dataset.pkl"))

    def get_message_tfidf_embedding_vector(self, message: str) -> np.ndarray:
        """Embed ``message`` with the TF-IDF model followed by the SVD model.

        Args:
            message: Raw query text.

        Returns:
            A 2-D array with a single row (``reshape(1, -1)``).
        """
        message_array = self.tf.transform([message]).toarray()
        message_array = self.svd.transform(message_array)
        return message_array.reshape(1, -1)

    def get_message_doctovec_embedding_vector(self, message: str) -> np.ndarray:
        """Infer a Doc2Vec vector for ``message``.

        The message is tokenized by splitting on single spaces and the vector
        is inferred with 100 epochs.

        Args:
            message: Raw query text.

        Returns:
            A 2-D array with a single row (``reshape(1, -1)``).
        """
        message_array = self.dv.infer_vector(doc_words=message.split(" "), epochs=100)
        return message_array.reshape(1, -1)

    @staticmethod
    def get_similarity_scores(message_array, embeddings) -> pd.DataFrame:
        """Compute cosine similarity between a query vector and every embedding row.

        Args:
            message_array: Single-row 2-D query vector.
            embeddings: DataFrame of embeddings with a ``files`` column; that
                column is dropped and every remaining column is treated as a
                feature.

        Returns:
            DataFrame with one ``cosine_similarity`` column, indexed like
            ``embeddings``.
        """
        features = embeddings.drop(labels="files", axis=1)
        similarities = cosine_similarity(X=message_array,
                                         Y=features,
                                         dense_output=True)
        # cosine_similarity returns shape (1, n_rows); flip it into one row
        # per document.
        return pd.DataFrame(similarities.reshape(-1, 1),
                            index=features.index,
                            columns=["cosine_similarity"])

    def get_ensemble_similarity_scores(self, message: str) -> pd.DataFrame:
        """Return both similarity signals for every dataset row.

        Args:
            message: Raw query text.

        Returns:
            DataFrame with columns ``files``, ``semantic_similarity`` and
            ``bow_similarity`` and a fresh 0..N-1 index. ``files`` is taken
            from ``self.df["files"]`` by position.
        """
        bow_vector = self.get_message_tfidf_embedding_vector(message)
        semantic_vector = self.get_message_doctovec_embedding_vector(message)

        bow_similarity = self.get_similarity_scores(
            bow_vector, self.svd_feature_matrix)
        semantic_similarity = self.get_similarity_scores(
            semantic_vector, self.doctovec_feature_matrix)

        ensemble_similarity = pd.merge(semantic_similarity, bow_similarity,
                                       left_index=True, right_index=True)
        # Both inputs share the column name "cosine_similarity"; overwrite the
        # suffixed names pandas generates (left = semantic, right = bow).
        ensemble_similarity.columns = ["semantic_similarity", "bow_similarity"]
        ensemble_similarity.index = self.df["files"]
        return ensemble_similarity.reset_index()

    def query_similar_documents(self, message: str, n=10) -> pd.DataFrame:
        """Aggregate weighted similarity scores per file for ``message``.

        Each dataset row's two similarities are multiplied by that row's
        ``weights`` value. Rows whose bag-of-words similarity is not positive
        are discarded, and the remaining weighted scores are summed per file.

        Args:
            message: Raw query text.
            n: Currently unused; kept so existing callers keep working. The
                full, untruncated result is returned.

        Returns:
            DataFrame with columns ``files``, ``doc2vec_similarity``,
            ``svd_similarity`` and ``total`` (the sum of the two), one row per
            file, ordered by ``files``.
        """
        scores = self.get_ensemble_similarity_scores(message)

        # Weight every row BEFORE filtering, and by position (plain array),
        # so each row is paired with its own weight. Multiplying after the
        # filtered frame has been re-indexed would label-align row i of the
        # survivors with dataset row i and apply the wrong weight.
        weights = self.df["weights"].to_numpy()
        scores = scores.assign(
            doc2vec_similarity=scores["semantic_similarity"] * weights,
            svd_similarity=scores["bow_similarity"] * weights,
        )

        matches = scores[scores["bow_similarity"] > 0]
        similar_files = matches.groupby("files")[
            ["doc2vec_similarity", "svd_similarity"]
        ].sum()
        similar_files["total"] = (similar_files["doc2vec_similarity"]
                                  + similar_files["svd_similarity"])
        return similar_files.reset_index()

# In[3]:
# Build the retrieval model once; the constructor loads the Doc2Vec, TF-IDF
# and SVD models, both embedding tables and the dataset from disk.
ir = Information_Retrieval_Model()