# Importing Libraries

In [11]:
import pickle
import sys
import os
from nltk.stem import PorterStemmer
import re
from rank_bm25 import BM25Okapi
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
import statistics
import math
import operator


# Load the Inverted Index, Term Info, Docs-Words and Doc-Info pickles

In [12]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising;
    only use it on files that were produced by a trusted pipeline.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# term -> per-term mapping (its keys are used as the posting list further down)
term_doc = _load_pickle('Inverted-Index.pickle')

# term -> 3-tuple; later cells read the 2nd field as a corpus count and the
# 3rd field as a document frequency
term_info = _load_pickle('Term-Info.pickle')

# document key -> sequence of the words of that document
docs_words = _load_pickle('Docs-Words.pickle')

# document key -> document name (printed in the result listings)
docs = _load_pickle('Doc-Info.pickle')

# Inverted Index

In [13]:
# For every indexed term keep only the keys of its entry in term_doc
# (presumably the ids of the documents containing the term).
inv_index = {term: postings.keys() for term, postings in term_doc.items()}

# Load doc_ids and stopwords

In [14]:
# Each non-empty line of docids.txt is "<integer id> <document name>".
doc_ids = {}
with open("docids.txt") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        key, val = line.split()
        doc_ids[int(key)] = val

# One stop word per line. The context manager guarantees the file handle is
# closed; the original left it open for the lifetime of the kernel.
with open("stoplist.txt", "r") as f:
    stoplist = f.read().splitlines()  # Stoplist words

# Load Queries

In [28]:
# Collect the text of every <query>...</query> element found on a single line
# of topics.xml. Queries are numbered 0, 1, 2, ... in file order and stored as
# lists of whitespace-separated words.
queries = {}
with open("topics.xml", "r") as f:  # closed automatically, even on error
    for line in f:
        result = re.search("<query>(.*)</query>", line)
        if result:
            # len(queries) is the next free running index
            queries[len(queries)] = result.group(1).split()
print(queries)

{0: ['uss', 'carl', 'vinson'], 1: ['capital', 'gains', 'tax', 'rate'], 2: ['nicolas', 'cage', 'movies'], 3: ['electoral', 'college', '2008', 'results'], 4: ['i', 'will', 'survive', 'lyrics'], 5: ["world's", 'biggest', 'dog'], 6: ['dark', 'chocolate', 'health', 'benefits'], 7: ['afghanistan', 'flag'], 8: ['civil', 'war', 'battles', 'in', 'South', 'Carolina'], 9: ['ford', 'edge', 'problems']}


# Processing the queries

In [29]:
# Normalise every query in place: lower-case each word, drop it if the
# lower-cased form is a stop word, then strip apostrophes and Porter-stem
# whatever remains.
ps = PorterStemmer()
for topic_id, raw_terms in list(queries.items()):
    queries[topic_id] = [
        ps.stem(lowered.replace("'", ""))
        for lowered in (term.lower() for term in raw_terms)
        if lowered not in stoplist
    ]

# Document lengths in the corpus (the scoring functions average these)
doc_lens = [len(word_list) for word_list in docs_words.values()]

# Calculating BM25 Scores

In [34]:
def bm25_score(d, q, k1=1.2, b=0.75, k2=500, num_docs=3495, avg_doc_len=None):
    """Return the BM25 score of document ``d`` for the query ``q``.

    Parameters
    ----------
    d : key into ``docs_words`` selecting the document to score.
    q : list of query terms, already normalised the same way the document
        words are normalised here (lower-cased, apostrophes removed, stemmed
        with ``ps``). A term that occurs several times in ``q`` contributes
        once per occurrence, each time with its full query frequency.
    k1, b, k2 : BM25 tuning constants (defaults are the original values).
    num_docs : total number of documents in the collection.
    avg_doc_len : average document length. When ``None`` it is computed as
        ``statistics.mean(doc_lens)``; pass it explicitly to avoid
        recomputing the mean on every call.

    Returns
    -------
    The sum of the per-term scores (``0`` when no query term is known).

    Query terms that are absent from ``term_info`` are skipped instead of
    raising ``KeyError``: they cannot occur in any indexed document.
    """
    if avg_doc_len is None:
        avg_doc_len = statistics.mean(doc_lens)

    doc_words = docs_words[d]
    doc_len = len(doc_words)
    # Length-normalisation component of the term-frequency saturation
    K = k1 * ((1 - b) + b * (doc_len / avg_doc_len))

    # Normalise the document once rather than once per query term; stemming
    # is by far the most expensive step of this function.
    doc_terms = [ps.stem(w.lower().replace("'", "")) for w in doc_words]

    term_scores = []
    for term in q:
        if term not in term_info:
            continue
        # Third field of the term record is used as the document frequency
        _, _, df_i = term_info[term]
        tf_doc = doc_terms.count(term)
        tf_q = q.count(term)

        idf = math.log((num_docs + 0.5) / (df_i + 0.5), 10)
        doc_part = (1 + k1) * (tf_doc) / (K + tf_doc)
        query_part = (1 + k2) * (tf_q) / (k2 + tf_q)
        term_scores.append(idf * doc_part * query_part)

    return sum(term_scores)

# Sorting Scores

In [38]:
# Rank the documents for every topic by BM25 score (best first), keeping only
# documents with a strictly positive score.
topics_scores = {}
for q in queries:
    scores = {}
    for i in range(3495):  # document keys 0..3494 (hard-coded collection size)
        # Score each document exactly once; the original evaluated
        # bm25_score twice for every matching document.
        score = bm25_score(i, queries[q])
        if score > 0.0:
            scores[i] = score
    sorted_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    topics_scores[q] = sorted_scores

# Saving Scores as pickle

In [39]:
# Persist the per-topic BM25 rankings so they can be reloaded without rescoring.
with open('BM25_Scores.pickle', mode='wb') as bm25_out:
    pickle.dump(topics_scores, bm25_out, protocol=pickle.HIGHEST_PROTOCOL)

# Printing the Results

In [40]:
# Reload the stored BM25 rankings (trusted file written by this notebook).
with open('BM25_Scores.pickle', mode='rb') as bm25_in:
    topics_scores = pickle.load(bm25_in)

# One tab-separated line per ranked document:
#   topic id, document name, 1-based rank, score, run label
for q, ranking in topics_scores.items():
    for position, (doc_key, score) in enumerate(ranking, start=1):
        fields = [str(q), str(docs[doc_key]), str(position), str(score), "run 1"]
        print("\t".join(fields))
    print("\n")

0	clueweb12-0010wb-36-02425	1	7.547699386119612	run 1
0	clueweb12-0602wb-40-19720	2	7.482989985892983	run 1
0	clueweb12-0602wb-37-16519	3	7.481917128791678	run 1
0	clueweb12-0602wb-37-16520	4	7.481917128791678	run 1
0	clueweb12-0602wb-40-19722	5	7.481917128791678	run 1
0	clueweb12-0602wb-40-19725	6	7.481917128791678	run 1
0	clueweb12-0602wb-40-19726	7	7.481917128791678	run 1
0	clueweb12-0602wb-45-15703	8	7.481917128791678	run 1
0	clueweb12-0602wb-45-15704	9	7.481917128791678	run 1
0	clueweb12-0602wb-40-19721	10	7.480844579283288	run 1
0	clueweb12-0602wb-40-19723	11	7.480844579283288	run 1
0	clueweb12-0602wb-40-19724	12	7.480844579283288	run 1
0	clueweb12-0410wb-82-17853	13	7.420692598807424	run 1
0	clueweb12-0010wb-21-24657	14	7.392898280349998	run 1
0	clueweb12-0305wb-62-31165	15	7.386048834743766	run 1
0	clueweb12-1118wb-60-01282	16	7.368064404014849	run 1
0	clueweb12-0602wb-34-08563	17	7.3479962401196754	run 1
0	clueweb12-1304wb-62-13500	18	7.340762439275105	run 1
0	clueweb12-0407wb

1	clueweb12-1207wb-52-22989	967	0.8643732181534459	run 1
1	clueweb12-0009wb-79-28334	968	0.8640044188969856	run 1
1	clueweb12-0406wb-22-34283	969	0.8623914831986644	run 1
1	clueweb12-0406wb-18-37343	970	0.8622382097536401	run 1
1	clueweb12-0000wb-27-14207	971	0.858874727859377	run 1
1	clueweb12-0000wb-27-14207.txt	972	0.858874727859377	run 1
1	clueweb12-0500tw-06-04796	973	0.857447549418327	run 1
1	clueweb12-1200tw-49-00399	974	0.8570067178788023	run 1
1	clueweb12-1200wb-78-24430	975	0.8558340788105239	run 1
1	clueweb12-1202wb-86-14663	976	0.8558340788105239	run 1
1	clueweb12-1202wb-97-15939	977	0.8558340788105239	run 1
1	clueweb12-1203wb-09-02995	978	0.8558340788105239	run 1
1	clueweb12-1203wb-96-19820	979	0.8558340788105239	run 1
1	clueweb12-1906wb-06-11966	980	0.8554454062494291	run 1
1	clueweb12-1804wb-88-09531	981	0.8553414213891406	run 1
1	clueweb12-1003wb-98-11520	982	0.8552469814690271	run 1
1	clueweb12-0807wb-33-21698	983	0.8541203387400081	run 1
1	clueweb12-0102wb-93-02466	98

3	clueweb12-0000wb-27-15113	419	1.8456519374728177	run 1
3	clueweb12-0000wb-27-15113.txt	420	1.8456519374728177	run 1
3	clueweb12-0800wb-29-03800	421	1.8444714144683583	run 1
3	clueweb12-1019wb-79-00865	422	1.8443189870531396	run 1
3	clueweb12-0105wb-80-30821	423	1.8437184357719498	run 1
3	clueweb12-0903wb-08-05023	424	1.8416178693650722	run 1
3	clueweb12-1609wb-37-05543	425	1.8413173691175126	run 1
3	clueweb12-1012wb-47-04679	426	1.8409689336221158	run 1
3	clueweb12-0815wb-63-08988	427	1.8395842808886984	run 1
3	clueweb12-0906wb-64-15878	428	1.8393478535027101	run 1
3	clueweb12-0705wb-39-02251	429	1.8377690036523173	run 1
3	clueweb12-0900tw-61-21014	430	1.8352241547455066	run 1
3	clueweb12-0612wb-68-05029	431	1.827646393513387	run 1
3	clueweb12-0110wb-34-12523	432	1.824262420904226	run 1
3	clueweb12-1913wb-04-15103	433	1.824067186361595	run 1
3	clueweb12-0012wb-15-12287	434	1.8158956437072846	run 1
3	clueweb12-0815wb-87-20872	435	1.8151281881998256	run 1
3	clueweb12-0701wb-76-01669	43

5	clueweb12-1713wb-61-31957	337	1.7622350040188635	run 1
5	clueweb12-0510wb-01-02110	338	1.7612051847820545	run 1
5	clueweb12-0300tw-94-14126	339	1.751672300181084	run 1
5	clueweb12-1709wb-91-29048	340	1.727360932932484	run 1
5	clueweb12-1800tw-22-04873	341	1.7221932226696612	run 1
5	clueweb12-0403wb-81-08973	342	1.721766978842739	run 1
5	clueweb12-1115wb-62-18179	343	1.7215208039142071	run 1
5	clueweb12-0200tw-04-17324	344	1.6944904540462782	run 1
5	clueweb12-0210wb-50-14238	345	1.6844080974885571	run 1
5	clueweb12-1117wb-13-14507	346	1.67048724538296	run 1
5	clueweb12-0800tw-20-15539	347	1.6557736363311844	run 1
5	clueweb12-0816wb-22-26746	348	1.654013611008648	run 1
5	clueweb12-1911wb-02-24306	349	1.6537672736529345	run 1
5	clueweb12-0100wb-98-08558	350	1.6369605967640002	run 1
5	clueweb12-1907wb-86-19405	351	1.6343733824722508	run 1
5	clueweb12-0900tw-28-13656	352	1.6298546356136399	run 1
5	clueweb12-0800tw-49-07482	353	1.6242987930816377	run 1
5	clueweb12-1307wb-40-17809	354	1.623

6	clueweb12-1717wb-39-00682	343	1.889432931650118	run 1
6	clueweb12-1804wb-62-01680	344	1.8855185471969316	run 1
6	clueweb12-0508wb-23-02118	345	1.8815515294662946	run 1
6	clueweb12-1410wb-71-08285	346	1.8814560701892522	run 1
6	clueweb12-1400tw-18-10019	347	1.8687537302995807	run 1
6	clueweb12-0800tw-20-15539	348	1.8673732713521471	run 1
6	clueweb12-0402wb-17-30671	349	1.8642896117936307	run 1
6	clueweb12-1400tw-16-11230	350	1.8638314179222877	run 1
6	clueweb12-1002wb-28-10594	351	1.8537050877544483	run 1
6	clueweb12-1503wb-77-19623	352	1.8519955322664776	run 1
6	clueweb12-1700tw-26-19458	353	1.8503347503158385	run 1
6	clueweb12-0900tw-23-04411	354	1.8499626325717107	run 1
6	clueweb12-1316wb-42-03922	355	1.8413823112189835	run 1
6	clueweb12-0911wb-16-20604	356	1.839568744715705	run 1
6	clueweb12-0817wb-12-13340	357	1.8157455420815096	run 1
6	clueweb12-0100tw-41-19493	358	1.8041231397764503	run 1
6	clueweb12-1900tw-32-20698	359	1.8007118423175195	run 1
6	clueweb12-0312wb-10-13465	360	1

8	clueweb12-0405wb-54-18649	3	7.968203520946427	run 1
8	clueweb12-0406wb-57-31747	4	7.952818690116739	run 1
8	clueweb12-0406wb-97-04957	5	7.943787248096973	run 1
8	clueweb12-0406wb-97-04958	6	7.931923285575236	run 1
8	clueweb12-0100wb-12-21939	7	7.925106276635657	run 1
8	clueweb12-0013wb-62-26394	8	7.907190447115057	run 1
8	clueweb12-0406wb-91-24778	9	7.902152256587367	run 1
8	clueweb12-0009wb-79-28336	10	7.893007565980328	run 1
8	clueweb12-0013wb-82-24249	11	7.889827017854213	run 1
8	clueweb12-0013wb-75-12593	12	7.887022718669255	run 1
8	clueweb12-0406wb-30-19254	13	7.885943862732746	run 1
8	clueweb12-0009wb-79-28331	14	7.884687328418032	run 1
8	clueweb12-0406wb-87-34052	15	7.884132866146352	run 1
8	clueweb12-0009wb-79-28334	16	7.873688545976542	run 1
8	clueweb12-0406wb-77-18401	17	7.867433483438184	run 1
8	clueweb12-0010wb-31-29702	18	7.856363544102556	run 1
8	clueweb12-0003wb-12-28508	19	7.85602131201299	run 1
8	clueweb12-0010wb-17-14863	20	7.854176192204436	run 1
8	clueweb12-0502wb

9	clueweb12-0104wb-68-33874	25	4.713414595686291	run 1
9	clueweb12-0012wb-11-29675	26	4.708993452138314	run 1
9	clueweb12-0914wb-17-13650	27	4.704064175639879	run 1
9	clueweb12-0500wb-13-32760	28	4.691470780386276	run 1
9	clueweb12-1400wb-79-28607	29	4.67742279356704	run 1
9	clueweb12-0104wb-79-05346	30	4.677301355786123	run 1
9	clueweb12-1907wb-71-03386	31	4.67512297083254	run 1
9	clueweb12-0002wb-47-34079	32	4.67496069764344	run 1
9	clueweb12-0700tw-14-05689	33	4.639136693736373	run 1
9	clueweb12-1602wb-06-13049	34	4.627518395960797	run 1
9	clueweb12-0906wb-64-15878	35	4.624517600086456	run 1
9	clueweb12-0712wb-10-08622	36	4.611182624729868	run 1
9	clueweb12-1810wb-24-22652	37	4.6012772058769045	run 1
9	clueweb12-0611wb-48-04578	38	4.600666316907349	run 1
9	clueweb12-0501wb-63-09360	39	4.597559613872921	run 1
9	clueweb12-1108wb-51-10234	40	4.566699330950383	run 1
9	clueweb12-1413wb-73-26852	41	4.553206255574658	run 1
9	clueweb12-0705wb-39-02251	42	4.5503984776070565	run 1
9	clueweb12

# Calculating the total number of words in the corpus

In [44]:
# Add up the second field of every term record (used as the term's
# occurrence count in the collection).
total_words = sum(ocr for _, ocr, _ in term_info.values())

# Dirichlet Smoothing

In [47]:
def dirichlet_score(d, q, mu=None):
    """Return the Dirichlet-smoothed query likelihood of ``q`` for document ``d``.

    For every query term the smoothed probability is

        N / (N + mu) * tf_doc / N  +  mu / (N + mu) * tf_collection / total_words

    and the score is the product of these probabilities over the query terms.

    Parameters
    ----------
    d : key into ``docs_words`` selecting the document to score.
    q : list of query terms, already normalised the same way the document
        words are normalised here (lower-cased, apostrophes removed, stemmed
        with ``ps``).
    mu : smoothing parameter. When ``None`` it is computed as
        ``statistics.mean(doc_lens)``; pass it explicitly to avoid
        recomputing the mean on every call.

    Returns
    -------
    The product of the per-term probabilities (``1`` when no query term is
    scored). Documents with fewer than two words get probability 0 for every
    scored term.

    Query terms that are absent from ``term_info`` are skipped instead of
    raising ``KeyError``.
    """
    if mu is None:
        mu = statistics.mean(doc_lens)

    doc_words = docs_words[d]
    N = len(doc_words)

    # Normalise the document once rather than once per query term.
    doc_terms = [ps.stem(w.lower().replace("'", "")) for w in doc_words]

    likelihood = 1
    for term in q:
        if term not in term_info:
            continue
        tf_doc = doc_terms.count(term)
        # Maximum-likelihood estimate within the document (0 for empty docs)
        prob_d = tf_doc / N if N != 0 else 0
        # Second field of the term record is used as the collection frequency
        _, tf_c, _ = term_info[term]
        prob_c = tf_c / total_words

        x = N / (N + mu) * prob_d
        y = mu / (N + mu) * prob_c
        ans = x + y
        if N < 2:
            # Near-empty documents are never allowed to match
            ans = 0
        likelihood = likelihood * ans
    return likelihood

# Sorting the Scores

In [48]:
# Rank all 3495 documents for every topic by Dirichlet-smoothed likelihood,
# highest score first.
topics_scores = {}
for q in queries:
    scores = {i: dirichlet_score(i, queries[q]) for i in range(3495)}
    topics_scores[q] = sorted(
        scores.items(),
        key=operator.itemgetter(1),
        reverse=True,
    )

# Saving Scores as pickle

In [49]:
# Persist the per-topic Dirichlet rankings so they can be reloaded without rescoring.
with open('Dirichlet_Scores.pickle', mode='wb') as dirichlet_out:
    pickle.dump(topics_scores, dirichlet_out, protocol=pickle.HIGHEST_PROTOCOL)

# Printing the results

In [54]:
# Reload the stored Dirichlet rankings (trusted file written by this notebook).
with open('Dirichlet_Scores.pickle', mode='rb') as dirichlet_in:
    topics_scores = pickle.load(dirichlet_in)

# Print the top 100 documents of every topic, one tab-separated line each:
#   topic id, document name, 1-based rank, score, run label
for q, ranking in topics_scores.items():
    for rank in range(100):
        doc_key, score = ranking[rank]
        fields = [str(q), str(docs[doc_key]), str(rank + 1), str(score), "run 1"]
        print("\t".join(fields))
    print("\n")

0	clueweb12-0010wb-21-24657	1	2.8292947726494674e-06	run 1
0	clueweb12-0010wb-36-02425	2	1.1142132286281708e-06	run 1
0	clueweb12-0305wb-62-31165	3	4.751838676014495e-07	run 1
0	clueweb12-1118wb-60-01282	4	3.649716326008142e-07	run 1
0	clueweb12-0602wb-40-19720	5	3.204419196710819e-07	run 1
0	clueweb12-0602wb-37-16519	6	3.178718064502762e-07	run 1
0	clueweb12-0602wb-37-16520	7	3.178718064502762e-07	run 1
0	clueweb12-0602wb-40-19722	8	3.178718064502762e-07	run 1
0	clueweb12-0602wb-40-19725	9	3.178718064502762e-07	run 1
0	clueweb12-0602wb-40-19726	10	3.178718064502762e-07	run 1
0	clueweb12-0602wb-45-15703	11	3.178718064502762e-07	run 1
0	clueweb12-0602wb-45-15704	12	3.178718064502762e-07	run 1
0	clueweb12-0602wb-40-19721	13	3.153291047967619e-07	run 1
0	clueweb12-0602wb-40-19723	14	3.153291047967619e-07	run 1
0	clueweb12-0602wb-40-19724	15	3.153291047967619e-07	run 1
0	clueweb12-0602wb-34-08563	16	2.946799126512459e-07	run 1
0	clueweb12-0410wb-82-17853	17	2.8475855932428644e-07	run 1
0	c