In [1]:
import math
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm

In [2]:
"""
1st vsm to find out top n documents
"""

# Build the vocabulary: every distinct term seen in the queries and documents.
with open('query_list.txt') as file:
    query_list = file.read().split()
with open('doc_list.txt') as file:
    doc_list = file.read().split()

# NOTE(review): replace('-1', '') removes the substring '-1' wherever it occurs,
# not only as a standalone token. Presumably '-1' is an end-of-line marker in the
# corpus files -- confirm against the data format.

# Collect terms in sets so duplicates are dropped as they are read instead of
# holding every token of the corpus in a list first.
query_terms = set()
for query_name in query_list:
    with open('Query/' + query_name) as file:
        query_terms.update(file.read().replace('-1', '').split())

doc_terms = set()
for doc_name in doc_list:
    with open('Document/' + doc_name) as file:
        for _ in range(3):  # the first three lines of a document are skipped
            file.readline()
        doc_terms.update(file.read().replace('-1', '').split())

# Sorted so the row order of every term matrix built from all_voc is the same
# on each run; iterating a set of strings can change order between kernel
# restarts because of string hash randomisation.
query_voc = sorted(query_terms)
doc_voc = sorted(doc_terms)
all_voc = sorted(query_terms | doc_terms)

In [3]:
# Sanity check: number of distinct terms in the combined vocabulary.
vocabulary_size = len(all_voc)
print(vocabulary_size)

15884


In [4]:
# Query term frequency, log-scaled: tf = 1 + log2(count) for terms that occur
# in the query, 0 otherwise. Rows follow all_voc, columns follow query_list.
query_TF = np.zeros((len(all_voc), len(query_list)))

# Term -> row lookup, so each query needs a single pass over its own tokens
# instead of one list.count() scan per vocabulary entry.
voc_row = {term: row for row, term in enumerate(all_voc)}

# tqdm wraps the list (not the enumerate object) so the progress bar knows the total.
for query_num, query_name in enumerate(tqdm(query_list)):
    with open('Query/' + query_name) as file:
        tokens = file.read().replace('-1', '').split()
    for term, count in Counter(tokens).items():
        row = voc_row.get(term)
        if row is not None:  # terms outside all_voc are ignored
            query_TF[row, query_num] = 1 + math.log(count, 2)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Document term frequency (tf = 1 + log2(count), 0 when absent) and the
# inverse document frequency of every vocabulary term.
# doc_TF rows follow all_voc and columns follow doc_list; voc_IDF is a column vector.
doc_TF = np.zeros((len(all_voc), len(doc_list)))
voc_IDF = np.zeros((len(all_voc), 1))

# Term -> row lookup, so each document needs a single pass over its own tokens
# instead of one list.count() scan per vocabulary entry.
voc_row = {term: row for row, term in enumerate(all_voc)}

# tqdm wraps the list (not the enumerate object) so the progress bar knows the total.
for doc_num, doc_name in enumerate(tqdm(doc_list)):
    with open('Document/' + doc_name) as file:
        for _ in range(3):  # the first three lines of a document are skipped
            file.readline()
        tokens = file.read().replace('-1', '').split()
    for term, count in Counter(tokens).items():
        row = voc_row.get(term)
        if row is None:  # terms outside all_voc are ignored
            continue
        doc_TF[row, doc_num] = 1 + math.log(count, 2)
        voc_IDF[row] += 1  # until the conversion below, voc_IDF holds document frequency

# Convert document frequency to idf = log10(N / df) for terms that occur in at
# least one document; terms with df == 0 keep idf == 0.
# N is taken from doc_list rather than a hard-coded corpus size.
seen = voc_IDF > 0
voc_IDF[seen] = np.log10(len(doc_list) / voc_IDF[seen])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# TF-IDF weighting for queries and documents.
# voc_IDF has shape (len(all_voc), 1), so it broadcasts across every column and
# scales each term row by that term's idf.
query_TFIDF = query_TF * voc_IDF
doc_TFIDF = doc_TF * voc_IDF

# First-pass retrieval: cosine similarity between every query and every document.
# The matrices are transposed because cosine_similarity compares rows;
# the result has one row per query and one column per document.
VSM = cosine_similarity(query_TFIDF.T, doc_TFIDF.T)

In [7]:
# Label the similarity matrix: one row per query, one column per document.
VSM = pd.DataFrame(data=VSM, index=query_list, columns=doc_list)
VSM

Unnamed: 0,VOM19980220.0700.0166,VOM19980220.0700.0221,VOM19980220.0700.0265,VOM19980220.0700.0321,VOM19980220.0700.0359,VOM19980220.0700.0391,VOM19980220.0700.0448,VOM19980220.0700.0487,VOM19980220.0700.0521,VOM19980220.0700.0559,...,VOM19980630.0730.0216,VOM19980630.0730.0248,VOM19980630.0730.0268,VOM19980630.0900.0005,VOM19980630.0900.0040,VOM19980630.0900.0105,VOM19980630.0900.0127,VOM19980630.0900.0169,VOM19980630.0900.0205,VOM19980630.0900.0230
40001.query,0.015965,0.020788,0.019755,0.018740,0.011747,0.021999,0.008370,0.016846,0.012034,0.009696,...,0.002980,0.017864,0.015605,0.017651,0.014700,0.203509,0.036255,0.005826,0.005954,0.004099
40002.query,0.054532,0.023153,0.042833,0.027771,0.026893,0.025857,0.019846,0.012059,0.050379,0.041044,...,0.017248,0.023674,0.040474,0.017445,0.019209,0.006078,0.014073,0.028524,0.017071,0.020499
40003.query,0.013256,0.012178,0.018356,0.020437,0.013348,0.017741,0.006035,0.056990,0.074447,0.032360,...,0.017442,0.036100,0.002343,0.041979,0.018791,0.019572,0.016845,0.013978,0.021231,0.012882
40004.query,0.020712,0.035250,0.020626,0.022167,0.051375,0.017926,0.015182,0.045599,0.032168,0.070980,...,0.022189,0.021765,0.029214,0.265457,0.061783,0.037652,0.025873,0.023767,0.017265,0.035974
40005.query,0.046606,0.016166,0.007924,0.001024,0.007007,0.017110,0.004617,0.017904,0.020345,0.025570,...,0.020378,0.029758,0.000867,0.008788,0.007014,0.001584,0.009000,0.016936,0.010170,0.012552
40006.query,0.015843,0.022660,0.019138,0.015826,0.015044,0.013365,0.010865,0.004801,0.016780,0.019450,...,0.007725,0.026981,0.011995,0.021732,0.012794,0.178855,0.026374,0.009960,0.017188,0.011079
40007.query,0.006650,0.016363,0.029731,0.011224,0.041951,0.017011,0.017191,0.024149,0.024242,0.028988,...,0.023112,0.024634,0.009871,0.080405,0.025987,0.151334,0.011920,0.022403,0.019349,0.040652
40008.query,0.011061,0.025329,0.006052,0.017024,0.017403,0.017188,0.001352,0.033427,0.040226,0.039380,...,0.012547,0.026366,0.000775,0.062586,0.029714,0.018008,0.031138,0.005194,0.012682,0.023602
40009.query,0.017090,0.008271,0.010771,0.007011,0.117744,0.002273,0.002819,0.021557,0.006875,0.017147,...,0.003403,0.010309,0.006031,0.066184,0.024430,0.020647,0.003922,0.011999,0.007478,0.039036
40010.query,0.014840,0.040909,0.016911,0.015941,0.033703,0.004805,0.013881,0.026657,0.020675,0.019655,...,0.019051,0.022746,0.007265,0.039732,0.016394,0.017171,0.011238,0.014992,0.006878,0.010219


In [8]:
# Pseudo-relevance feedback: move each query vector towards the centroid of its
# top-ranked documents from the first retrieval pass.
top = 20      # number of top-ranked documents used as feedback for each query
alpha = 0.3   # weight of the original query vector
beta = 0.9    # weight of the feedback-document centroid

# Work on a copy so query_TFIDF keeps the original query vectors and this cell
# gives the same result when it is re-run.
q_TFIDF = query_TFIDF.copy()

for num in tqdm(range(len(query_list))):
    # Scores of this query against every document, aligned to doc_list order so
    # the positions below index columns of doc_TFIDF directly.
    scores = VSM.loc[query_list[num]].reindex(doc_list).to_numpy()
    # Positions of the highest-scoring documents; a stable sort keeps the
    # selection reproducible when scores tie.
    top_idx = np.argsort(-scores, kind='mergesort')[:top]
    # Divide by the number of documents actually selected (equal to `top`
    # unless the collection holds fewer documents than that).
    n_feedback = max(len(top_idx), 1)
    rq = doc_TFIDF[:, top_idx].sum(axis=1)
    q_TFIDF[:, num] = alpha * query_TFIDF[:, num] + beta * (rq / n_feedback)


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))




In [9]:
# Second-pass retrieval: cosine similarity between the updated query vectors
# and every document. The matrices are transposed because cosine_similarity
# compares rows; the result has one row per query and one column per document.
VSM2 = cosine_similarity(q_TFIDF.T, doc_TFIDF.T)

In [10]:
# Label the second-pass similarity matrix: one row per query, one column per document.
VSM2 = pd.DataFrame(VSM2, columns=doc_list, index=query_list)

# Write one line per query: "<query>,<doc> <doc> ... " with every document
# listed from highest to lowest similarity (each name is followed by a space).
# The with-block guarantees the file is flushed and closed.
with open('rocchio.txt', 'w') as f:
    f.write('Query,RetrievedDocuments\n')
    for query_name in query_list:
        # Sort only this query's row; a stable sort keeps the ranking
        # reproducible when scores tie.
        ranked_docs = VSM2.loc[query_name].sort_values(ascending=False, kind='mergesort').index
        f.write(query_name)
        f.write(',')
        f.write(''.join(doc_name + ' ' for doc_name in ranked_docs))
        f.write('\n')