In [1]:
from dask.distributed import get_client
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from dask_ml.metrics.pairwise import euclidean_distances
from bert_serving.client import BertClient
import string
import re
import dask.array as da
import numpy as np
import pandas as pd
import dask.dataframe as dd
import joblib

In [2]:
# Obtain a Dask distributed client for the scheduler at this fixed local address.
# NOTE(review): the port is hard-coded and looks like an ephemeral one from an earlier
# session — confirm it points at a live scheduler before a Restart & Run All.
client = get_client("tcp://127.0.0.1:49384")

In [3]:
# Module-level resources shared by clean() and information_retrieval().
lemmatizer = WordNetLemmatizer()
# Stop words are read from the dataset published on the Dask cluster under 'stop_words'.
stopwords_list = client.get_dataset('stop_words')
# BertClient() is created with default arguments.
# NOTE(review): presumably requires a running bert-serving server on its default host/ports — confirm.
bert = BertClient()

In [4]:
def clean(text):
    """Normalise raw text into a space-separated string of lemmatised keywords.

    The text is coerced to ``str``, commas and colons are replaced by spaces,
    everything is lower-cased, characters outside ``string.printable`` are
    dropped, and the remainder is tokenised with ``word_tokenize``. Tokens
    found in ``string.punctuation`` or in the module-level ``stopwords_list``
    are removed; the rest are lemmatised with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        The text to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    # Coerce *before* calling string methods: the original called
    # ``text.replace`` first, so any non-string input raised AttributeError
    # and the later ``str(text)`` could never help.
    text = str(text).replace(",", " ").replace(":", " ").lower()

    printable = set(string.printable)
    text = "".join(ch for ch in text if ch in printable)

    tokens = word_tokenize(text)
    # ``string.punctuation`` is a str, so this is a substring test: it removes
    # single punctuation marks (and any token that happens to be a contiguous
    # slice of the punctuation string). Kept as-is to preserve the output the
    # downstream models see.
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(word for word in tokens if word not in stopwords_list)

In [5]:
def substract_words(query, words_list):
    """Remove from ``query`` every token that is contained in ``words_list``.

    ``query`` is split with ``word_tokenize``; tokens are kept in their
    original order and re-joined with single spaces.

    Parameters
    ----------
    query : str
        The text to filter.
    words_list : container
        Anything supporting the ``in`` operator for tokens (list, set, array).

    Returns
    -------
    str
        The remaining tokens joined by spaces.
    """
    remaining = []
    for token in word_tokenize(query):
        if token in words_list:
            continue
        remaining.append(token)
    return " ".join(remaining)

In [6]:
def _vectorize_query(vectorizer, query):
    """Return the dense vector of ``query`` and the column indices of its non-zero terms."""
    query_vec = vectorizer.transform([query]).toarray()
    return query_vec, np.nonzero(query_vec)[1]


def _feature_names(vectorizer):
    """Return the vectorizer's vocabulary as a NumPy array.

    Prefers ``get_feature_names_out`` and falls back to ``get_feature_names``
    so the lookup works whichever of the two the vectorizer object provides.
    NOTE(review): assumes the published vectorizers follow the scikit-learn
    text-vectorizer API — confirm.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return np.array(getter())


def _closest_documents(query_vec, feature_matrix, top_n=5):
    """Return the ``top_n`` rows of ``feature_matrix`` closest to ``query_vec``.

    ``feature_matrix`` is a Dask DataFrame holding ``project`` and ``file``
    columns plus feature columns; Euclidean distance is computed against the
    feature columns only. The result is a pandas DataFrame with the
    ``project`` and ``file`` columns of the nearest rows.
    """
    distances = euclidean_distances(
        query_vec,
        feature_matrix.drop(["project", "file"], axis=1).to_dask_array(),
    ).compute().reshape(-1, 1)
    ranked = dd.from_dask_array(da.from_array(distances), columns=["distance"])
    ranked['project'] = feature_matrix['project']
    ranked['file'] = feature_matrix['file']
    return ranked.nsmallest(top_n, "distance").compute()[['project', 'file']]


def information_retrieval(query):
    """Retrieve the project documents that best match a free-text query.

    Algorithm:

    Step 1: Clean the query with ``clean``; stop if nothing is left.
    Step 2: Initialise the result list and the per-category match ratios.
    Step 3: Load the project / libs / frameworks / dbs vectorizers from the
            Dask cluster.
    Step 4: Decompose the query: for each category, vectorise the query and
            record the fraction of query tokens that hit the vocabulary.
            Matched libs, frameworks and dbs terms are removed from the query
            so that only free keywords remain.
    Step 5: For every category with at least one hit, add the 5 documents with
            the smallest Euclidean distance to the category's query vector.
    Step 6: If keywords remain, pick the dominant LDA topic for them, embed
            them with BERT, and add the 10 closest (project, file) pairs among
            the documents of that topic.
    Step 7: Drop duplicate rows and return.

    Precision / recall / F-score evaluation is not performed here.

    Parameters
    ----------
    query : str
        The user's search text.

    Returns
    -------
    pandas.DataFrame or None
        A frame with ``project`` and ``file`` columns, or ``None`` when the
        cleaned query contains no tokens.
    """

    # Step 1
    query = clean(query)
    query_len = len(word_tokenize(query))

    if query_len == 0:
        return

    # Step 2
    frames = []
    query_data = {
        "project": 0,
        "libs": 0,
        "frameworks": 0,
        "dbs": 0
    }

    # Step 3
    tf_projects = client.get_dataset('tf_projects')
    tf_libs = client.get_dataset('tf_libs')
    tf_frameworks = client.get_dataset('tf_frameworks')
    tf_dbs = client.get_dataset('tf_dbs')

    # Step 4
    # The vectors computed here are kept and reused for the distance search in
    # Step 5. The original re-vectorised the query *after* the matched terms
    # had been removed from it, so the libs / frameworks / dbs vectors no
    # longer contained the very terms that triggered the lookup.
    project_vec, project_hits = _vectorize_query(tf_projects, query)
    query_data['project'] = len(project_hits) / query_len

    libs_vec, libs_hits = _vectorize_query(tf_libs, query)
    query_data['libs'] = len(libs_hits) / query_len
    if query_data['libs'] > 0:
        query = substract_words(query, _feature_names(tf_libs)[libs_hits])

    frameworks_vec, frameworks_hits = _vectorize_query(tf_frameworks, query)
    query_data['frameworks'] = len(frameworks_hits) / query_len
    if query_data['frameworks'] > 0:
        query = substract_words(query, _feature_names(tf_frameworks)[frameworks_hits])

    dbs_vec, dbs_hits = _vectorize_query(tf_dbs, query)
    query_data['dbs'] = len(dbs_hits) / query_len
    if query_data['dbs'] > 0:
        query = substract_words(query, _feature_names(tf_dbs)[dbs_hits])

    # Step 5
    if query_data['libs'] > 0:
        frames.append(_closest_documents(libs_vec, client.get_dataset('tf_libs_matrix')))

    if query_data['frameworks'] > 0:
        frames.append(_closest_documents(frameworks_vec, client.get_dataset('tf_frameworks_matrix')))

    if query_data['dbs'] > 0:
        # The original vectorised with ``tf_projects`` here (copy-paste slip),
        # comparing a project-vocabulary vector against the dbs feature matrix.
        frames.append(_closest_documents(dbs_vec, client.get_dataset('tf_dbs_matrix')))

    if query_data['project'] > 0:
        frames.append(_closest_documents(project_vec, client.get_dataset('tf_project_matrix')))

    # Step 6: context-based search on whatever keywords are left.
    if len(word_tokenize(query)) > 0:

        id2word = client.get_dataset('id2word')
        lda_model = client.get_dataset('lda_model')
        context_based_feature_matrix = client.get_dataset('context_based_feature_matrix')

        corpus = id2word.doc2bow(word_tokenize(query))
        topics_distribution = dict(lda_model[corpus][0])
        # Dominant topic = the one with the highest weight.
        topic = max(topics_distribution.items(), key=lambda x: x[1])[0]
        context_based_topic_result = context_based_feature_matrix[context_based_feature_matrix['Dominant_Topic'] == topic]

        query_vec = bert.encode([query])
        embedding_columns = context_based_topic_result.drop(
            ["project", "file_y", "keyword", "Dominant_Topic", "Topic_Perc_Contrib"], axis=1
        )
        distances = euclidean_distances(query_vec, embedding_columns.to_dask_array()).compute()
        context_based_result = dd.from_dask_array(da.from_array(distances).reshape(-1, 1), columns=["distance"])
        context_based_result['project'] = context_based_topic_result['project']
        context_based_result['file'] = context_based_topic_result['file_y']
        # Take the 25 nearest rows, sum their distances per (project, file),
        # then keep the 10 pairs with the smallest total.
        context_based_result = (
            context_based_result.nsmallest(25, 'distance')
            .groupby(['project', 'file'])
            .agg({'distance': sum})
            .reset_index()
            .nsmallest(10, 'distance')
            .compute()[['project', 'file']]
        )
        frames.append(context_based_result)

    # Step 7: concatenate once (instead of growing a frame piece by piece).
    result = pd.concat(frames) if frames else pd.DataFrame()
    result = result.drop_duplicates()

    return result

    
    
    
    

In [7]:
%%time
# Example query: time one end-to-end retrieval and display the returned (project, file) table.
information_retrieval("artificial intelligence and machine learning")

Wall time: 2.98 s


Unnamed: 0,project,file
109,disease prediction using machine learning,./reports_doc/Disease prediction using machine...
389,suspect analysis using machine learning of the of,./reports_doc/Suspect Analysis using Machine L...
360,resume analysis through machine learning,./reports_doc/Resume analysis through machine ...
187,stock price prediction using machine learning,./reports_doc/Group.No_23.pdf
43,android malware detection using machine learning,./reports_doc/Android Malware Detection Using ...
0,audit and compliance in service management usi...,./reports_doc/AUDIT AND COMPLIANCE IN SERVICE_...
