In [1]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining
'''
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from scipy.stats import entropy

In [2]:
def read_topic_doc_distribution(doc_topic_path):
    '''
    Load a doc-topic distribution file (MALLET output) into a pandas Dataframe.

    The file is parsed as header-less, tab-separated text: column 0 becomes
    the 'doc_id' index, column 1 is discarded and every remaining column is
    renamed 'Topic0', 'Topic1', ... in file order.
    return a Dataframe as a matrix of dimension num-doc x num-topic
    '''
    raw_table = pd.read_csv(doc_topic_path, header=None, sep='\t')
    # Chain the clean-up steps instead of mutating the freshly read frame
    topic_table = (
        raw_table
        .drop([1], axis=1)
        .rename(columns={0: 'doc_id'})
        .set_index(['doc_id'])
    )
    topic_table.columns = [
        'Topic{}'.format(position) for position, _ in enumerate(topic_table.columns)
    ]
    return topic_table

In [3]:
def compute_doc_cosine_similarity(doc_topic_df):
    '''
    Build the pairwise cosine similarity table of the documents.

    Every row of doc_topic_df (one document's topic distribution) is compared
    with every other row through cosine_similarity.
    return a square Dataframe labelled with doc_topic_df.index on both axes
    '''
    labels = doc_topic_df.index
    return pd.DataFrame(
        cosine_similarity(doc_topic_df),
        index=labels,
        columns=labels,
    )

In [4]:
def compute_doc_vector_distance(doc_topic_df, distance='cosine'):
    '''
    Function to compute the pairwise distance between document distributions.

    doc_topic_df: Dataframe with one row per document (doc-topic distribution)
    distance: 'cosine' for the cosine distance (default) or 'skl' for the
              symmetrized Kullback-Leibler divergence
    return a Dataframe with the distance matrix for each document, labelled
    with doc_topic_df.index on both axes
    raise ValueError if distance is not 'cosine' or 'skl'
    '''
    # Resolve the metric once, before any work is done, so an unknown name
    # fails with a clear ValueError instead of an unbound-variable error
    # raised from inside the loop.
    if distance == 'cosine':
        distance_fn = compute_cosine_distance
    elif distance == 'skl':
        distance_fn = compute_skl_divergence
    else:
        raise ValueError(
            "Unsupported distance '{}': expected 'cosine' or 'skl'".format(distance)
        )

    doc_id = doc_topic_df.index
    # Materialize the row Series once; iterating the Dataframe again for every
    # outer document would rebuild all of them on each pass.
    doc_distributions = [doc_distr for _, doc_distr in doc_topic_df.iterrows()]

    doc_dist_matrix = [
        [distance_fn(doc_distr1, doc_distr2) for doc_distr2 in doc_distributions]
        for doc_distr1 in doc_distributions
    ]

    doc_dist_df = pd.DataFrame(doc_dist_matrix, index=doc_id, columns=doc_id)
    return doc_dist_df

In [5]:
def compute_skl_divergence(distr1, distr2):
    '''
    Symmetrized Kullback-Leibler divergence of two distributions
    return KL(distr1 || distr2) + KL(distr2 || distr1), each term computed with scipy's entropy
    '''
    forward_kl = entropy(distr1, distr2)
    backward_kl = entropy(distr2, distr1)
    return forward_kl + backward_kl

In [6]:
def compute_cosine_distance(distr1, distr2):
    '''
    Cosine distance between two distribution vectors
    return the value computed by scipy.spatial.distance.cosine
    '''
    cosine_dist = cosine(distr1, distr2)
    return cosine_dist

In [7]:
def retrieve_most_similar_doc(doc_sim_df, doc_id, k=5, distance=False):
    '''
    Function to compute the k most similar documents given a target doc_id.

    doc_sim_df: Dataframe of pairwise scores whose rows and columns are
                labelled by doc id
    doc_id: label of the query document (must be a column of doc_sim_df)
    k: maximum number of documents to return
    distance: True if the scores are distances (smaller means more similar),
              False if they are similarities (larger means more similar)
    return a Series indexed by doc_id with the scores of the k most similar
    documents, best first; the query document itself is never part of the result
    '''
    doc_sim_col = doc_sim_df[doc_id]
    # Remove the query document by label rather than assuming it sorts first:
    # when another document ties with it (e.g. a duplicate with similarity 1.0
    # or distance 0.0) the query is not guaranteed to land at position 0.
    candidates = doc_sim_col.drop(labels=doc_id, errors='ignore')
    # A stable sort keeps the order of tied documents reproducible across runs.
    doc_sim_col_sorted = candidates.sort_values(ascending=distance, kind='mergesort')
    return doc_sim_col_sorted.head(k)

In [8]:
###############################################################################################################################

In [9]:
# Set up retrieval of the most similar documents given a query document.
# NOTE(review): n is not referenced by any other cell visible in this notebook;
# the retrieval call passes k=10 explicitly -- confirm whether k=n was intended.
n = 25
# Label of the query document: used as a column of doc_sim_df and as a
# position in the list of corpus lines.
query_doc = 3

# Doc-topic distribution file (MALLET output, per read_topic_doc_distribution)
doc_topic_path = './data/AssociatedPressDocTopic.txt'
# num-doc x num-topic matrix, then the doc x doc cosine similarity matrix
doc_topic_df = read_topic_doc_distribution(doc_topic_path)
doc_sim_df = compute_doc_cosine_similarity(doc_topic_df)

In [10]:
# Display the doc x doc cosine similarity matrix (rows and columns labelled by doc_id)
doc_sim_df

doc_id,0,1,2,3,4,5,6,7,8,9,...,2239,2240,2241,2242,2243,2244,2245,2246,2247,2248
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.030275,0.907154,0.472426,0.403891,0.016990,0.257454,0.049126,0.034740,0.290210,...,0.034946,0.599692,0.003134,0.744088,0.762920,0.005655,0.007757,0.452492,0.593070,0.166215
1,0.030275,1.000000,0.049539,0.404860,0.033626,0.772291,0.575766,0.135999,0.147963,0.167399,...,0.212366,0.297373,0.080549,0.028959,0.407246,0.003291,0.477993,0.413533,0.016432,0.110317
2,0.907154,0.049539,1.000000,0.707317,0.731594,0.014316,0.153194,0.012561,0.035594,0.291827,...,0.025316,0.833592,0.011111,0.948083,0.571311,0.003905,0.052056,0.438749,0.758106,0.148487
3,0.472426,0.404860,0.707317,1.000000,0.835247,0.514725,0.384795,0.261936,0.045647,0.255776,...,0.229672,0.932481,0.191519,0.792893,0.351080,0.192273,0.045552,0.362140,0.684288,0.087080
4,0.403891,0.033626,0.731594,0.835247,1.000000,0.053334,0.046699,0.131643,0.031290,0.218686,...,0.157589,0.869977,0.014860,0.874971,0.028183,0.004590,0.065874,0.264938,0.828724,0.071153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2244,0.005655,0.003291,0.003905,0.192273,0.004590,0.002465,0.375295,0.006126,0.019275,0.003634,...,0.002189,0.015662,0.985027,0.003389,0.002901,1.000000,0.128390,0.006499,0.013012,0.003613
2245,0.007757,0.477993,0.052056,0.045552,0.065874,0.084014,0.069759,0.007847,0.031370,0.045380,...,0.460109,0.005132,0.296342,0.004703,0.024920,0.128390,1.000000,0.069606,0.007223,0.004592
2246,0.452492,0.413533,0.438749,0.362140,0.264938,0.036467,0.406782,0.019503,0.087555,0.123224,...,0.006675,0.316059,0.004280,0.363721,0.445546,0.006499,0.069606,1.000000,0.383257,0.120597
2247,0.593070,0.016432,0.758106,0.684288,0.828724,0.009535,0.396953,0.041147,0.055390,0.194027,...,0.006782,0.793746,0.005047,0.799019,0.164987,0.013012,0.007223,0.383257,1.000000,0.114232


In [11]:
# Display the doc-topic distribution matrix (one row per doc_id, one column per topic)
doc_topic_df

Unnamed: 0_level_0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.121618,0.000439,0.023312,0.567652,0.038307,0.246365,0.000723,0.000479,0.000456,0.000649
1,0.000630,0.056508,0.000621,0.000547,0.236005,0.000616,0.236225,0.000474,0.071478,0.396897
2,0.046055,0.000479,0.000685,0.425338,0.029411,0.462527,0.029654,0.000523,0.000497,0.004831
3,0.000725,0.000499,0.125444,0.078048,0.060783,0.405002,0.005124,0.095167,0.000519,0.228691
4,0.050524,0.001407,0.111099,0.001775,0.001603,0.777713,0.050801,0.001537,0.001462,0.002080
...,...,...,...,...,...,...,...,...,...,...
2244,0.016490,0.000883,0.001264,0.001114,0.001006,0.001253,0.001455,0.974315,0.000917,0.001305
2245,0.001262,0.000869,0.001245,0.001097,0.060914,0.001234,0.825375,0.105815,0.000903,0.001285
2246,0.110257,0.032030,0.001293,0.172340,0.530194,0.149137,0.001489,0.000987,0.000939,0.001335
2247,0.350973,0.011810,0.001758,0.117946,0.011981,0.499074,0.002024,0.001342,0.001276,0.001816


In [12]:
# Retrieve the 10 documents most similar to query_doc; doc_sim_df holds
# similarities, so the default distance=False (larger score = more similar) applies.
retrieved_docs_series = retrieve_most_similar_doc(doc_sim_df, query_doc, k=10)
retrieved_docs_series

doc_id
1589    0.947851
1633    0.940213
1491    0.935161
1985    0.932842
2240    0.932481
1134    0.929504
495     0.929400
53      0.927924
899     0.926789
1426    0.921918
Name: 3, dtype: float64

In [13]:
# Load the raw corpus and show the text of the query document
# (assumes line i of the file holds the document with doc id i -- TODO confirm)
with open('./data/AssociatedPress.txt', 'r') as corpus_handle:
    corpus_docs = list(corpus_handle)
corpus_docs[query_doc]

"Today is Saturday, Oct. 29, the 303rd day of 1988. There are 63 days left in the year. A reminder: daylight-saving time ends tomorrow at 2 a.m. local time. Clocks ``fall back'' one hour. Today's highlight in history: In 1929, ``Black Tuesday'' descended upon the New York Stock Exchange. Prices collapsed amid panic selling, thousands of investors were wiped out, and America's Great Depression began. On this date: In 1618, Sir Walter Raleigh, the English courtier, military adventurer and poet, was executed in London. In 1682, Pennsylvania founder William Penn landed at what is now Chester, Pa. In 1901, President William McKinley's assassin, Leon Czolgosz, was electrocuted. In 1911, American newspaperman Joseph Pulitzer died in Charleston, S.C. In 1923, the Republic of Turkey was proclaimed. In 1940, Secretary of War Henry L. Stimson drew the first number _ 158 _ in the first peacetime military draft in U.S. history. In 1947, former first lady Frances Cleveland Preston died in Baltimore 

In [14]:
# Print each retrieved doc id followed by the text of that document
for doc_id in retrieved_docs_series.index:
    print(doc_id, corpus_docs[doc_id], sep='\n')

1589
Q: Do you think Americans need to be careful in travelling to others places than the Middle East? A: Well, I think they've always _ always been advised to be careful of travel, but I'm not prepared to say nobody should travel to any place in the Middle East. I'm not prepared to say that at all. Q: What about outside the Middle East? A: I'd be careful wherever you go, these days. Q: Speaking of trips, did you have any concern about your own trip to Kennebunkport, that you'll be able to stay on top of things while you're up here _ A: I think we're going to have a safe trip. Are you referring to the safety of the trip? Q: (Off mike.) A: No, I can easily stay here. We have a highly complex and highly efficient communications. I have some of my top advisers here. Others will be coming up there from time to time. I expect to see Jim Baker up there very soon. And I'm in very close _ I will be in very close touch with Pentagon officials or whoever's behind National Security Council right 

In [14]:
##############################################################################################################################

## Take-aways

+ Topic models (e.g. LDA) provide powerful tools to represent documents in a lower-dimensional space

+ These functionalities take advantage of MALLET doc-topic distribution output to define similarity between documents

+ Cosine similarity and the symmetrized Kullback-Leibler (SKL) divergence are useful for computing similarity/distance between probability distributions

+ Similarity/distance measures can be exploited to retrieve similar documents to a query document