In [1]:
import datetime
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load files

In [2]:
# load doc list
# doc_list.txt is read as one entry per line; each entry is later used to
# build the path 'docs/<entry>.txt' when the documents are loaded.
with open('doc_list.txt') as f:
    doc_list = f.read().splitlines()

In [3]:
# Read every document named in doc_list and store its whitespace-split tokens,
# keyed by the document name.
docs = {}
# `words` only starts out empty here; it is filled by the query-loading cell.
# Document tokens are not merged into it (that union is left disabled), so the
# vocabulary ends up containing query terms only.
words = set()
for doc_id in tqdm(doc_list):
    doc_path = 'docs/' + doc_id + '.txt'
    with open(doc_path) as doc_file:
        docs[doc_id] = doc_file.read().split()

100%|██████████| 4191/4191 [00:01<00:00, 3564.82it/s]


In [4]:
# load query list
# query_list.txt is read as one entry per line; each entry is later used to
# build the path 'queries/<entry>.txt' when the queries are loaded.
with open('query_list.txt') as f:
    query_list = f.read().splitlines()

In [5]:
# Read each query file, keep its whitespace-split tokens keyed by query name,
# and add those tokens to the shared vocabulary set `words`.
queries = {}
for query_id in tqdm(query_list):
    query_path = 'queries/' + query_id + '.txt'
    with open(query_path) as query_file:
        tokens = query_file.read().split()
    queries[query_id] = tokens
    words.update(tokens)

100%|██████████| 50/50 [00:00<00:00, 3034.91it/s]


# Dictionary save / load

In [6]:
# save words
# Freeze the vocabulary in sorted order before writing it out. Iterating a set
# of strings yields a different order in each interpreter session (string
# hashing is randomized), and the column order of every cached .npy matrix
# follows the order of `words`. Sorting makes the word list reproducible, so a
# matrix cached in one session still lines up with a word list rewritten in
# another.
words = sorted(words)

with open('min_word_list.txt', 'w') as f:
    # One space-separated line; tokens come from str.split() and therefore
    # contain no whitespace themselves.
    f.write(' '.join(words))

In [7]:
# load words dict from file
# Read the space-separated vocabulary back from disk. `words` becomes a list
# whose order is the order stored in the file; that order defines the column
# order of the term-frequency / df / idf arrays built from it below.
with open('min_word_list.txt') as f:
    words = f.read().split()

In [8]:
# Vocabulary size, i.e. the number of columns in the matrices computed below.
print(len(words))

123


# Calculate document-tf, query-tf, df, idf

In [9]:
# term frequency in document
#
# Builds one row per document (in docs' insertion order) with one raw count
# per vocabulary word (in `words` order).

tf_docs_list = []

for content in tqdm(docs.values()):
    # Tally the document's tokens once instead of rescanning the whole token
    # list with list.count() for every vocabulary word.
    token_counts = Counter(content)
    # Counter returns 0 for words that do not occur, matching list.count().
    tf_docs_list.append([token_counts[word] for word in words])

100%|██████████| 4191/4191 [00:04<00:00, 940.48it/s] 


In [10]:
# Stack the per-document count rows into a single 2-D array
# (rows: documents, columns: vocabulary words) and cache it on disk.
tf_docs_npy = np.array(tf_docs_list)
# np.save appends '.npy', so this writes 'min_tf_docs_npy.npy'.
np.save('min_tf_docs_npy', tf_docs_npy)
tf_docs_npy

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# document frequency
#
# For every vocabulary word, counts how many documents contain it at least
# once. Results are in `words` order.

# Build each document's distinct-token set once, so the membership test below
# is a hash lookup instead of a scan of the full token list for every
# (word, document) pair.
doc_token_sets = [set(content) for content in docs.values()]

df_list = []

for word in tqdm(words):
    # Summing booleans counts the documents whose token set contains the word.
    df_list.append(sum(word in token_set for token_set in doc_token_sets))

100%|██████████| 123/123 [00:04<00:00, 29.80it/s]


In [12]:
# Convert the per-word document frequencies to a 1-D array (one entry per
# vocabulary word) and cache it on disk.
df_npy = np.array(df_list)
# np.save appends '.npy', so this writes 'min_df_npy.npy'.
np.save('min_df_npy', df_npy)
df_npy

array([ 419,  443,   13,  738,  302,  213,  249,  554,  174, 1376,  114,
          7,  282,  290,  380,   73,  513,   79,   15,   27,  585,   86,
        942,   33,  827,  303,  226,   14,   22,   29,  463,  134,  235,
        244, 1715,  176,  274,  112,   57,  814,    1,  210,  443,   47,
        408,   26,  122,   75,   76,  254,   12,   12,  160,   86,   21,
        440,  346,  163,  161,   12,    0,  181,   71,  271,   25,  802,
        306,  124,  310,  698,   23,    5,  402,  108,    7,  261,  151,
        470,  461,  878,  370,   24,   33,  136,    5,  333,  191,  762,
        221,   58,   48,   76,  755,  160,  545,   34,   44,  377, 1840,
        149,  408, 1574,   32,  541,  138,  517,  168,   10,  482,  259,
         12,  540,  288,  118,   71,  981,   82,  141,  590,  346,  145,
          4,    3])

In [13]:
# term frequency in query
#
# Builds one row per query (in queries' insertion order) with one raw count
# per vocabulary word (in `words` order).

tf_queries_list = []

for content in tqdm(queries.values()):
    # Tally the query's tokens once instead of rescanning the token list with
    # list.count() for every vocabulary word.
    token_counts = Counter(content)
    # Counter returns 0 for words that do not occur, matching list.count().
    tf_queries_list.append([token_counts[word] for word in words])

100%|██████████| 50/50 [00:00<00:00, 21430.12it/s]


In [14]:
# Stack the per-query count rows into a single 2-D array
# (rows: queries, columns: vocabulary words) and cache it on disk.
tf_queries_npy = np.array(tf_queries_list)
# np.save appends '.npy', so this writes 'min_tf_queries_npy.npy'.
np.save('min_tf_queries_npy', tf_queries_npy)
tf_queries_npy

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
# inverse document frequency
#
# For each term: log(1 + (1 + N) / (1 + n_i)), where N is the number of
# documents and n_i is that term's document frequency.

docs_len = len(docs)
idf = [np.log(1 + (1 + docs_len) / (1 + doc_freq)) for doc_freq in tqdm(df_npy)]

100%|██████████| 123/123 [00:00<00:00, 177773.74it/s]


In [16]:
# Convert the per-word idf weights to a 1-D array (one entry per vocabulary
# word) and cache it on disk.
idf_npy = np.array(idf)
# np.save appends '.npy', so this writes 'min_idf_npy.npy'.
np.save('min_idf_npy', idf_npy)
idf_npy

array([2.39616217, 2.34578264, 5.70521003, 1.89799917, 2.69698814,
       3.02474651, 2.87739909, 2.1463    , 3.21704558, 1.39730828,
       3.62306476, 6.26339826, 2.76081473, 2.73472448, 2.48512535,
       4.05436681, 2.2143703 , 3.97781075, 5.57215403, 5.0153859 ,
       2.09845754, 3.8955665 , 1.69476884, 4.82265067, 1.80217206,
       2.69391569, 2.96871869, 5.63645488, 5.21091065, 4.94686685,
       2.30602744, 3.46735496, 2.93187149, 2.89647554, 1.23631136,
       3.2061397 , 2.78770122, 3.6401446 , 4.29423125, 1.8154041 ,
       7.64826303, 3.03818327, 2.34578264, 4.48111754, 2.42031379,
       5.05151655, 3.55766825, 4.02816727, 4.01532946, 2.85872145,
       5.77908021, 5.77908021, 3.29721618, 3.8955665 , 5.25512514,
       2.35191501, 2.57113722, 3.27944306, 3.29125391, 5.77908021,
       8.34117175, 3.17942651, 4.08129685, 2.79799844, 5.08901982,
       1.82783798, 2.68476268, 3.54200226, 2.67270621, 1.94550132,
       5.16858826, 6.55060403, 2.43378747, 3.67525495, 6.26339

# Load the calculated matrices (saves computation time)

In [17]:
# Reload the arrays written by the np.save calls in the cells above, so the
# counting cells do not have to be re-run to continue from here.
# NOTE(review): all four files must have been produced with the same word
# order, since their columns are aligned positionally.
tf_docs_npy = np.load('min_tf_docs_npy.npy')
tf_queries_npy = np.load('min_tf_queries_npy.npy')
df_npy = np.load('min_df_npy.npy')
idf_npy = np.load('min_idf_npy.npy')

In [18]:
# Sanity check: the last dimension of every array is the vocabulary size, so
# all four values should agree on it.
print(tf_docs_npy.shape, tf_queries_npy.shape, df_npy.shape, idf_npy.shape)

(4191, 123) (50, 123) (123,) (123,)


# Calculate TF-IDF(doc, query)

In [19]:
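# term weight: w(i,j) = tf(i,j) * log(1 + (1 + N) / (1 + n_i)),
# where tf(i,j) is the count of term i in document (or query) j,
# N is the number of documents and n_i is the document frequency of term i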

In [20]:
# Weight every document's term counts by idf. idf_npy has one entry per
# vocabulary word, so it broadcasts across the rows of tf_docs_npy and scales
# each column by that word's idf.
tf_idf_docs = tf_docs_npy * idf_npy
print(tf_idf_docs.shape)
tf_idf_docs

(4191, 123)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.39616217,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [11.98081085,  4.69156528,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [21]:
# Weight every query's term counts by the same idf vector used for the
# documents, so both sides live in the same weighted term space.
tf_idf_queries = tf_queries_npy * idf_npy
print(tf_idf_queries.shape)
tf_idf_queries

(50, 123)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# cosine similarity (doc, query)

In [22]:
# Pairwise cosine similarity between every document row and every query row:
# cosine_npy[i, j] is the similarity of document i to query j.
cosine_npy = cosine_similarity(tf_idf_docs, tf_idf_queries)
cosine_npy

array([[0.2952988 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0896808 , 0.        , 0.14697496, ..., 0.        , 0.        ,
        0.05630288],
       [0.02882332, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.01639428, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06107356, 0.02859645, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [23]:
# Expected shape: (number of documents, number of queries).
cosine_npy.shape

(4191, 50)

# sort and export result

In [24]:
# Label the similarity matrix: one row per document name, one column per
# query name, so a query's scores can be selected by name.
sim_df = pd.DataFrame(cosine_npy, index=doc_list, columns=query_list)
sim_df

Unnamed: 0,301,302,303,304,305,306,307,308,309,310,...,341,342,343,344,345,346,347,348,349,350
FBIS3-10082,0.295299,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.158497,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
FBIS3-10231,0.089681,0.000000,0.146975,0.000000,0.096830,0.000000,0.106035,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.300916,0.0,0.0,0.0,0.056303
FBIS3-10243,0.028823,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
FBIS3-10285,0.031982,0.000000,0.020093,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.062294,0.000000,0.600987,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
FBIS3-10291,0.192518,0.007943,0.073936,0.000000,0.049501,0.000000,0.333560,0.0,0.027315,0.009868,...,0.079348,0.013972,0.074425,0.0,0.000000,0.200773,0.0,0.0,0.0,0.043574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA123090-0026,0.137243,0.333274,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.132122
LA123189-0136,0.000000,0.000000,0.000000,0.788064,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
LA123190-0040,0.016394,0.000000,0.000000,0.000000,0.072423,0.072053,0.009517,0.0,0.008799,0.049235,...,0.025561,0.000000,0.444451,0.0,0.008083,0.028982,0.0,0.0,0.0,0.000000
LA123190-0062,0.000000,0.000000,0.000000,0.000000,0.000000,0.398024,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.435956,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000


In [25]:
# save results
#
# Writes one line per query: the query name, a comma, then every document
# name separated by spaces, ordered by descending cosine similarity.

now = datetime.datetime.now()

# Timestamped file name (yymmdd_HHMM) so successive runs do not overwrite
# each other unless they fall within the same minute.
save_filename = 'results/result' + '_' + now.strftime("%y%m%d_%H%M") + '.txt'

print(save_filename)

# open(..., 'w') does not create missing parent directories; make sure the
# output directory exists so a fresh checkout does not fail with
# FileNotFoundError.
os.makedirs(os.path.dirname(save_filename), exist_ok=True)

with open(save_filename, 'w') as f:
    f.write('Query,RetrievedDocuments\n')
    for query in query_list:
        # Rank all documents for this query, highest similarity first.
        ranked_docs = sim_df[query].sort_values(ascending=False).index.to_list()
        f.write(query + "," + ' '.join(ranked_docs) + '\n')

results/result_201025_1442.txt
