In [3]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining
'''
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from scipy.stats import entropy

In [4]:
def read_topic_doc_distribution(doc_topic_path):
    '''
    Load a doc-topic distribution file (MALLET output) as a pandas Dataframe
    with one row per document and one column per remaining field.

    The file is parsed as header-less, tab-separated text. Column 0 becomes
    the index (named 'doc_id'), column 1 is discarded and all other columns
    are kept under their original positional labels.

    doc_topic_path: location of the file (anything pandas.read_csv accepts)
    return: Dataframe indexed by 'doc_id'
    '''
    raw_table = pd.read_csv(doc_topic_path, header=None, sep='\t')
    return (
        raw_table
        .drop([1], axis=1)
        .rename(columns={0: 'doc_id'})
        .set_index(['doc_id'])
    )

In [5]:
def compute_doc_cosine_similarity(doc_topic_df):
    '''
    Build the document-by-document cosine similarity table.

    doc_topic_df: Dataframe with the doc-topic distribution, one row per document
    return: Dataframe whose index and columns are both the index of
            doc_topic_df and whose cells hold the pairwise cosine similarity
    '''
    labels = doc_topic_df.index
    return pd.DataFrame(
        cosine_similarity(doc_topic_df),
        index=labels,
        columns=labels,
    )

In [6]:
def compute_doc_vector_distance(doc_topic_df, distance='cosine'):
    '''
    Compute the pairwise distance between the distributions of all documents.

    doc_topic_df: Dataframe with one row per document, indexed by doc id
    distance: 'cosine' for the cosine distance, 'skl' for the symmetrized
              Kullback-Leibler divergence
    return: Dataframe whose index and columns are both the index of
            doc_topic_df and whose cells hold the distance of each pair
    raises: ValueError if distance is not one of the supported names
    '''
    # Validate up front: an unsupported name previously left `dist` unassigned
    # and failed with an UnboundLocalError from inside the loop.
    if distance not in ('cosine', 'skl'):
        raise ValueError(
            "Unknown distance '{}': expected 'cosine' or 'skl'".format(distance)
        )
    if distance == 'cosine':
        pair_distance = compute_cosine_distance
    else:
        pair_distance = compute_skl_divergence

    doc_id = doc_topic_df.index
    # Extract the row vectors once instead of rebuilding a Series for every
    # one of the num-doc x num-doc pairs.
    doc_vectors = doc_topic_df.values

    doc_dist_matrix = [
        [pair_distance(vector1, vector2) for vector2 in doc_vectors]
        for vector1 in doc_vectors
    ]
    return pd.DataFrame(doc_dist_matrix, index=doc_id, columns=doc_id)

In [7]:
def compute_skl_divergence(distr1, distr2):
    '''
    Symmetrized Kullback-Leibler divergence of two distributions: the sum of
    scipy.stats.entropy evaluated in both directions.
    '''
    forward_kl = entropy(distr1, distr2)
    backward_kl = entropy(distr2, distr1)
    return forward_kl + backward_kl

In [8]:
def compute_cosine_distance(distr1, distr2):
    '''
    Cosine distance of two vectors, delegated to scipy.spatial.distance.cosine.
    '''
    cosine_dist = cosine(distr1, distr2)
    return cosine_dist

In [9]:
def retrieve_most_similar_doc(doc_sim_df, doc_id, k=5, distance=False):
    '''
    Function to retrieve the k documents closest to a target doc_id.

    doc_sim_df: Dataframe of pairwise scores with doc ids as both index and columns
    doc_id: label of the target document (must be a column of doc_sim_df)
    k: number of documents to return
    distance: False if the scores are similarities (higher means closer),
              True if they are distances (lower means closer)
    return: Series indexed by doc_id with the scores of the k closest
            documents, best match first; the target document is excluded
    '''
    doc_sim_col = doc_sim_df[doc_id]
    # Exclude the target by label rather than by skipping the first sorted
    # position: with ties (e.g. a duplicate document) or floating-point noise
    # the target is not guaranteed to sort first, in which case positional
    # skipping would return the target itself and drop a real neighbour.
    other_docs = doc_sim_col.drop(doc_id, errors='ignore')
    return other_docs.sort_values(ascending=distance).head(k)

In [10]:
###############################################################################################################################

In [49]:
#retrieve the n most similar documents given a query document
n = 10
query_doc = 1

# Forward slashes are accepted by Python on Windows as well as on POSIX
# systems; the backslash-separated form only resolves on Windows.
doc_topic_path = './data/AssociatedPressDocTopic.txt'
doc_topic_df = read_topic_doc_distribution(doc_topic_path)
doc_sim_df = compute_doc_cosine_similarity(doc_topic_df)

In [50]:
# Use the configured n instead of repeating the literal, so that changing n
# in the setup cell actually changes the number of documents retrieved.
retrieve_most_similar_doc(doc_sim_df, query_doc, k=n)

doc_id
1412    0.950715
1699    0.942444
2169    0.927611
1615    0.892493
1892    0.884509
1582    0.883745
443     0.882460
216     0.875987
650     0.870607
958     0.868424
Name: 1, dtype: float64

In [51]:
# Read the corpus lines and display the one at the query document's position.
# Forward slashes keep the path usable on both Windows and POSIX systems.
# NOTE(review): assumes the line position in the corpus file matches doc_id — confirm.
with open('./data/AssociatedPress.txt', 'r') as corpus_f:
    corpus_docs = corpus_f.readlines()
corpus_docs[query_doc]

"The Bechtel Group Inc. offered in 1985 to sell oil to Israel at a discount of at least $650 million for 10 years if it promised not to bomb a proposed Iraqi pipeline, a Foreign Ministry official said Wednesday. But then-Prime Minister Shimon Peres said the offer from Bruce Rappaport, a partner in the San Francisco-based construction and engineering company, was ``unimportant,'' the senior official told The Associated Press. Peres, now foreign minister, never discussed the offer with other government ministers, said the official, who spoke on condition of anonymity. The comments marked the first time Israel has acknowledged any offer was made for assurances not to bomb the planned $1 billion pipeline, which was to have run near Israel's border with Jordan. The pipeline was never built. In San Francisco, Tom Flynn, vice president for public relations for the Bechtel Group, said the company did not make any offer to Peres but that Rappaport, a Swiss financier, made it without Bechtel's k

In [52]:
# Look up the top-ranked document instead of hard-coding its id, so the cell
# stays correct when query_doc or the underlying data changes.
most_similar_id = retrieve_most_similar_doc(doc_sim_df, query_doc, k=1).index[0]
corpus_docs[most_similar_id]

"A Soviet official said Monday that claims against Washington for delays in erecting the Soviet Embassy exceed the $29 million America is demanding for an unfinished, allegedly bugged U.S. Embassy building in Moscow. A Swedish official said Monday that the U.S. claim will go to an arbitration board in Stockholm which has quietly dealt with East-West disputes for 20 years. President Reagan said last Thursday he was recommending tearing down the main office building of the new U.S. Embassy compound in Moscow because the building was riddled with Soviet eavesdropping devices. His administration is not permitting the Soviets to occupy their new building in Washington while the status of the U.S. building remains unsettled. Soviet officials deny the U.S. Embassy building in Moscow is bugged, and on Monday, Foreign Ministry spokesman Gennady I. Gerasimov told a news conference: ``We also have bills to present to the American side.'' Gerasimov said claims for late delivery of construction mat

In [53]:
# Rank the documents by symmetrized KL divergence to the query document
# (a distance, so the smallest values come first).
doc_dist_df = compute_doc_vector_distance(doc_topic_df, distance='skl')
skl_neighbours = retrieve_most_similar_doc(doc_dist_df, query_doc, k=n, distance=True)
print(skl_neighbours)

doc_id
1412    0.550400
1143    0.838971
1930    0.867430
1623    0.980738
2169    1.039457
133     1.053972
1277    1.060152
2207    1.097803
1512    1.126759
539     1.128581
Name: 1, dtype: float64


In [16]:
##############################################################################################################################

## Take-aways

+ Topic models (e.g. LDA) provide powerful tools to represent documents in a lower-dimensional space

+ These functionalities take advantage of MALLET doc-topic distribution output to define similarity between documents

+ Cosine similarity and the symmetrized Kullback-Leibler divergence are useful to compute similarity/distance among probability distributions

+ Similarity/distance measures can be exploited to retrieve similar documents to a query document