In [None]:
import datetime
import os
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

# Load file

In [None]:
# Read the document names, one entry per line of the list file.
with open('doc_list.txt') as doc_list_file:
    doc_list = doc_list_file.read().splitlines()

In [None]:
# Tokenize every listed document on whitespace, keyed by document name.
docs = {}
# Vocabulary accumulator. Document tokens are not added to it here (that step
# is disabled); it is filled from the query tokens in a later cell.
words = set()
for doc_name in tqdm(doc_list):
    with open('docs/' + doc_name + '.txt') as doc_file:
        docs[doc_name] = doc_file.read().split()

In [None]:
# Read the query names, one entry per line of the list file.
with open('query_list.txt') as query_list_file:
    query_list = query_list_file.read().splitlines()

In [None]:
# Tokenize every listed query on whitespace and add its terms to the vocabulary.
queries = {}
# Coerce to a set first: the dictionary cells further down rebind `words` to a
# list, so without this a re-run of this cell would fail on `list.union`.
words = set(words)
for query_name in tqdm(query_list):
    with open('queries/' + query_name + '.txt') as query_file:
        query_tokens = query_file.read().split()
    queries[query_name] = query_tokens
    words.update(query_tokens)

# Dictionary save / load

In [None]:
# save words
# Sort before saving so the vocabulary order -- and therefore the column order
# of every matrix computed from it -- is the same on every run. Iterating the
# raw set yields an order that can change between kernel sessions, which would
# leave previously cached .npy files misaligned with a freshly written word list.
words = sorted(words)

with open('min_word_list.txt', 'w') as f:
    f.write(' '.join(words))

In [None]:
# Restore the vocabulary from disk: the file holds whitespace-separated words.
with open('min_word_list.txt') as word_file:
    words = word_file.read().split()

In [None]:
# Report the vocabulary size.
vocab_size = len(words)
print(vocab_size)

# Calculate document-tf, query-tf, df, idf

In [None]:
# term frequency in document

# One row per document (in `docs` iteration order), one column per vocabulary
# word (in `words` order). Each document's tokens are counted once with a
# Counter instead of re-scanning the token list with `list.count` for every
# vocabulary word.
tf_docs_list = []

for content in tqdm(docs.values()):
    token_counts = Counter(content)
    # A Counter yields 0 for words that never occur in the document.
    tf_docs_list.append([token_counts[word] for word in words])

In [None]:
# Turn the per-document count rows into an array and cache it on disk
# (np.save appends the .npy extension to the given name).
tf_docs_npy = np.asarray(tf_docs_list)
np.save('min_tf_docs_npy', tf_docs_npy)
tf_docs_npy

In [None]:
# document frequency

# Build each document's set of distinct tokens once, so every membership test
# below is a hash lookup rather than a scan of the whole token list.
doc_token_sets = [set(content) for content in docs.values()]

# df_list[i] = number of documents that contain words[i] at least once.
df_list = []

for word in tqdm(words):
    df_list.append(sum(word in tokens for tokens in doc_token_sets))

In [None]:
# Turn the document-frequency list (one entry per vocabulary word) into an
# array and cache it on disk.
df_npy = np.asarray(df_list)
np.save('min_df_npy', df_npy)
df_npy

In [None]:
# term frequency in query

# One row per query (in `queries` iteration order), one column per vocabulary
# word (in `words` order). Each query's tokens are counted once with a Counter
# instead of re-scanning the token list with `list.count` for every word.
tf_queries_list = []

for content in tqdm(queries.values()):
    token_counts = Counter(content)
    # A Counter yields 0 for words that never occur in the query.
    tf_queries_list.append([token_counts[word] for word in words])

In [None]:
# Turn the per-query count rows into an array and cache it on disk.
tf_queries_npy = np.asarray(tf_queries_list)
np.save('min_tf_queries_npy', tf_queries_npy)
tf_queries_npy

In [None]:
# inverse document frequency

docs_len = len(docs)

# idf = ln((N - df + 0.5) / (df + 0.5)), evaluated for the whole df vector in
# one numpy expression instead of one Python iteration per word.
# Note: the value is negative for any word found in more than half of the
# documents (N < 2 * df).
idf = list(np.log((docs_len - df_npy + 0.5) / (df_npy + 0.5)))

In [None]:
# Turn the idf values (one per vocabulary word) into an array and cache it on disk.
idf_npy = np.asarray(idf)
np.save('min_idf_npy', idf_npy)
idf_npy

# Load calculated matrices (saves calculation time)

In [None]:
# Reload the arrays written by the np.save cells above, so the slow counting
# loops can be skipped in a later session.
tf_docs_npy, tf_queries_npy, df_npy, idf_npy = [
    np.load(npy_path)
    for npy_path in (
        'min_tf_docs_npy.npy',
        'min_tf_queries_npy.npy',
        'min_df_npy.npy',
        'min_idf_npy.npy',
    )
]

In [None]:
# Show the shape of each array on a single line as a quick sanity check.
print(*(arr.shape for arr in (tf_docs_npy, tf_queries_npy, df_npy, idf_npy)))

# BM25 calculation

In [None]:
# BM25 tuning constants read by the scoring cell.
K1 = 0.28  # scales the length-normalisation term in the tf denominator, and the (K1 + 1) numerator
b = 0.85   # share of that normalisation driven by doc_len / avg_doclen (the rest, 1 - b, is constant)
K3 = 1000  # appears only in the query-tf factor, which is commented out in the scoring formula

In [None]:
# Mean number of tokens per document; the scoring formula divides each
# document's length by this value.
avg_doclen = sum(len(tokens) for tokens in docs.values()) / len(docs)

avg_doclen

In [None]:
# Score every document against every query with BM25.
# queries_result[i][j] is the score of the j-th document (in `docs` order) for
# the i-th query (in `queries` order).

# The count matrix must line up with the corpus and vocabulary currently in
# memory; a mismatch usually means the cached .npy files are stale.
assert tf_docs_npy.shape == (len(docs), len(words)), \
    "tf_docs_npy does not match the loaded docs / words"

# Map each vocabulary word to its column once, instead of calling
# `words.index` for every (query, document, term) triple. `setdefault` keeps
# the first position of a repeated word, which is what `list.index` returns.
word_to_col = {}
for col, word in enumerate(words):
    word_to_col.setdefault(word, col)

# Length-dependent part of the BM25 denominator. It depends only on the
# document, so it is computed once for the whole corpus. Rows of `tf_docs_npy`
# were built by iterating `docs`, and this vector follows the same order.
doc_lens = np.array([len(content) for content in docs.values()])
len_norm = K1 * ((1 - b) + b * doc_lens / avg_doclen)

queries_result = []

for query in tqdm(queries.values()):
    scores = np.zeros(len(docs))
    # Iterating the raw token list means a term repeated in the query
    # contributes once per occurrence.
    for word in query:
        col = word_to_col[word]  # KeyError here: query term missing from the vocabulary
        tf_col = tf_docs_npy[:, col]  # counts of this term in every document
        # The query-side factor (K3 + 1) * tf_iq / (K3 + tf_iq) is left disabled.
        scores += idf_npy[col] * (K1 + 1) * tf_col / (tf_col + len_norm)
    queries_result.append(scores.tolist())

# Sort and export results

In [None]:
# Lay the scores out as a table with one row per document and one column per
# query: the score rows are per query, hence the transpose.
sim_df = pd.DataFrame(queries_result).T
sim_df.index, sim_df.columns = doc_list, query_list
sim_df

In [None]:
# save results

now = datetime.datetime.now()

# The file name carries a yymmdd_HHMM timestamp of the moment this cell runs.
save_filename = 'results/result' + '_' + now.strftime("%y%m%d_%H%M") + '.txt'

print(save_filename)

# Create the output folder if it is missing: open(..., 'w') does not create
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(save_filename), exist_ok=True)

with open(save_filename, 'w') as f:
    f.write('Query,RetrievedDocuments\n')
    for query in query_list:
        # Document names for this query, highest score first.
        ranked_docs = sim_df[query].sort_values(ascending=False).index.to_list()
        f.write(query + "," + ' '.join(ranked_docs) + '\n')