In [None]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining

Task: Build a toy information retrieval system given a list of query terms and 
based on topic modeling representation of target documents
'''

In [None]:
import gensim
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def read_topic_doc_distribution(doc_topic_path):
    '''Load a tab-separated document-topic file into a DataFrame.

    The file is read without a header row. Column 0 becomes the ``doc_id``
    index, column 1 is discarded, and every remaining column is renamed
    ``Topic0``, ``Topic1``, ... following the column order of the file.

    Parameters
    ----------
    doc_topic_path : str or file-like
        Source handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, indexed by ``doc_id``.
    '''
    raw_df = pd.read_csv(doc_topic_path, header=None, sep='\t')
    topic_df = (
        raw_df
        .drop([1], axis=1)
        .rename(columns={0: 'doc_id'})
        .set_index(['doc_id'])
    )
    # Whatever is left after removing the two leading columns is a topic column.
    topic_df.columns = ['Topic{}'.format(pos) for pos, _ in enumerate(topic_df.columns)]
    return topic_df

In [None]:
def get_topic_word_weights(word_weights_path, top_words=50):
    '''Return the ``top_words`` highest-weighted words of every topic.

    The input is a headerless, tab-separated file with three columns:
    topic id, word, weight. Rows are NOT assumed to be sorted by weight:
    the selection is done explicitly per topic, so a file written in
    vocabulary order still yields the heaviest words rather than the first
    ``top_words`` rows of each topic.

    Parameters
    ----------
    word_weights_path : str or file-like
        Source handed to ``pandas.read_csv``.
    top_words : int, default 50
        Maximum number of rows kept per topic.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic_id``, ``word``, ``weight``; topics in ascending
        ``topic_id`` order and, within a topic, rows in descending weight
        order. Original row labels are preserved.

    Raises
    ------
    ValueError
        If the file does not have exactly three columns.
    '''
    # keep_default_na=False: vocabulary entries such as "null", "nan" or "NA"
    # are real words and must not be silently converted to NaN.
    ww_df = pd.read_csv(word_weights_path, sep='\t', header=None,
                        keep_default_na=False)
    ww_df.columns = ['topic_id', 'word', 'weight']

    # nlargest keeps the first occurrence on ties, so a file that is already
    # sorted by descending weight gives the same rows as a plain head().
    top_ww_df_list = [
        topic_df.nlargest(top_words, 'weight')
        for _, topic_df in ww_df.groupby('topic_id', sort=True)
    ]
    return pd.concat(top_ww_df_list)

In [None]:
def compute_doc_cosine_similarity(doc_topic_df):
    '''Build the pairwise cosine-similarity table of the rows of ``doc_topic_df``.

    Parameters
    ----------
    doc_topic_df : pandas.DataFrame
        One row per document; the values are passed unchanged to
        ``cosine_similarity``.

    Returns
    -------
    pandas.DataFrame
        Square frame whose row and column labels are both the index of
        ``doc_topic_df``.
    '''
    labels = doc_topic_df.index
    return pd.DataFrame(cosine_similarity(doc_topic_df), index=labels, columns=labels)

In [None]:
def generate_topic_query_distribution(top_ww_df, query):
    '''Accumulate, per topic, the weights of the query terms found in ``top_ww_df``.

    Every topic id present in ``top_ww_df`` gets a ``'Topic<id>'`` entry that
    starts at 0.0. For each query term (repeated terms count each time) the
    matching rows are printed and their ``weight`` is added to the entry of
    their topic. Terms without a matching row contribute nothing.

    Parameters
    ----------
    top_ww_df : pandas.DataFrame
        Frame with ``topic_id``, ``word`` and ``weight`` columns.
    query : iterable of str
        Query terms, compared to ``word`` with exact equality.

    Returns
    -------
    dict
        Maps ``'Topic<id>'`` to the accumulated (unnormalized) weight.
    '''
    topic_ids = set(top_ww_df['topic_id'].tolist())
    weight_by_topic = {}
    for tid in topic_ids:
        weight_by_topic['Topic{}'.format(tid)] = 0.0

    for term in query:
        matches = top_ww_df[top_ww_df['word'] == term]
        print(matches)
        for tid, weight in zip(matches['topic_id'], matches['weight']):
            weight_by_topic['Topic{}'.format(tid)] += weight
    return weight_by_topic

In [None]:
def normalize_topic_query_distribution(query_topic_dict):
    '''Rescale the topic weights so that they sum to 1.

    Parameters
    ----------
    query_topic_dict : dict
        Maps a topic label to its accumulated weight.

    Returns
    -------
    dict
        New dict with the same keys and every weight divided by the total.
        When the total is zero (for example, no query term matched any
        word) a warning is issued and a copy of the input is returned
        unnormalized instead of raising ``ZeroDivisionError``.
    '''
    sum_weights = sum(query_topic_dict.values())
    if sum_weights == 0:
        # Nothing to rescale: dividing would raise ZeroDivisionError.
        import warnings
        warnings.warn('Topic weights sum to zero; returning them unnormalized.')
        return dict(query_topic_dict)
    return {topic: weight / sum_weights for topic, weight in query_topic_dict.items()}

In [None]:
def retrieve_most_similar_doc(doc_sim_df, doc_id, k=5, distance=False):
    '''Return the ``k`` entries closest to ``doc_id``, excluding ``doc_id`` itself.

    Parameters
    ----------
    doc_sim_df : pandas.DataFrame
        Pairwise score table; the column labelled ``doc_id`` is ranked.
    doc_id : hashable
        Label of the reference document (column of ``doc_sim_df``).
    k : int, default 5
        Maximum number of entries returned.
    distance : bool, default False
        ``False`` ranks by descending score (similarity); ``True`` ranks by
        ascending score (distance).

    Returns
    -------
    pandas.Series
        Up to ``k`` scores, best first, indexed by document label.
    '''
    ranked = doc_sim_df[doc_id].sort_values(ascending=distance)
    # Remove the reference document by label rather than by assuming it is
    # ranked first: with tied scores (or a zero vector) it may sit anywhere.
    others = ranked.drop(labels=doc_id, errors='ignore')
    # Positional slice; max() keeps a negative k from slicing from the end.
    return others.iloc[:max(k, 0)]

In [None]:
################################################################################################################

In [None]:
# Notebook configuration.
# Passed as `top_words` to get_topic_word_weights: rows kept per topic.
top_words_weight = 100
# Terms of the query; their per-topic weights form the query's topic vector.
query_terms = ['law', 'flight', 'money']
# Tab-separated input files (both are read with sep='\t').
# NOTE(review): presumably MALLET output, judging by the directory name — confirm.
word_weights_path = './mallet_output/AssociatedPress_10topics_wordweights.txt'
doc_topic_path = './mallet_output/AssociatedPress_10topics_doctopics.txt'

In [None]:
# Load the topic/word/weight table, keeping at most `top_words_weight` rows per topic.
top_ww_df = get_topic_word_weights(word_weights_path, top_words=top_words_weight)
# Bare expression: rendered as the cell output.
top_ww_df

In [None]:
# Accumulate the query terms' weights per topic, then rescale them by their total.
query_topic_dict = normalize_topic_query_distribution(generate_topic_query_distribution(top_ww_df, query_terms))
# Bare expression: rendered as the cell output.
query_topic_dict

In [None]:
# Read the document-topic table and add the query to it as one extra row.
doc_topic_df = read_topic_doc_distribution(doc_topic_path)
# The query gets the label -1 so it can be told apart from the document ids.
query_topic_df = pd.DataFrame(query_topic_dict, index=[-1])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames (columns are aligned by name).
doc_topic_df = pd.concat([doc_topic_df, query_topic_df])
# Pairwise cosine similarity between all rows, query included.
doc_sim_df = compute_doc_cosine_similarity(doc_topic_df)

In [None]:
# Display the pairwise similarity table; the row/column labelled -1 is the query.
doc_sim_df

In [None]:
# Rank the documents against the query (label -1); k is left at its default of 5.
retrieved_docs_series = retrieve_most_similar_doc(doc_sim_df, -1)
# Bare expression: rendered as the cell output.
retrieved_docs_series

In [None]:
# Load the corpus as a list of lines; a retrieved label is used directly as
# a line position below.
# NOTE(review): assumes doc_id equals the document's line number in this file — confirm.
with open('./data/AssociatedPress.txt', 'r') as corpus_f:
    corpus_docs = list(corpus_f)

# Show each retrieved label followed by the text of the matching corpus line.
for doc_id in retrieved_docs_series.index:
    print(doc_id, corpus_docs[doc_id], sep='\n')