In [43]:
import re
import math
import pandas as pd

# Reading Index Files

In [44]:
# Read docids.txt once. Every non-blank line contributes one document:
# D is the number of such lines and doc_ids holds the first field of each.
with open("docids.txt", "r", encoding="utf-8") as f:
    doc_ids = [line.split()[0] for line in f if line.strip()]
D = len(doc_ids)

# doc_dict maps int(first field) -> {int(second field) -> remaining fields}.
# len_D accumulates the number of remaining fields over every line
# (presumably the total number of indexed token positions -- confirm against
# the indexer that produced doc_index.txt).
doc_dict = {}
len_D = 0.0
with open("doc_index.txt", 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: nothing to index.
            continue
        doc_key, term_key, postings = int(fields[0]), int(fields[1]), fields[2:]
        len_D += len(postings)
        doc_dict.setdefault(doc_key, {})[term_key] = postings

# Average document length; guarded so an empty collection does not raise.
avgD = float(len_D / D) if D else 0.0

# term_dict maps the second field of each termids.txt line to the first field
# (getTermID looks a term string up here and gets its id back).
term_dict = {}
with open("termids.txt", 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            continue
        term_dict[fields[1]] = fields[0]

# info_dict maps the first field of each term_info.txt line to a tuple of the
# next three fields, kept as strings (score() unpacks them as offset, ttf, df).
info_dict = {}
with open("term_info.txt", 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if len(fields) < 4:
            continue
        info_dict[fields[0]] = (fields[1], fields[2], fields[3])

# Functions for Calculating Relevance Scores

In [45]:
def TF_IDF(tf, df):
    """Return tf weighted by the inverse document frequency log(D / df).

    tf -- numeric term-frequency weight.
    df -- document frequency; converted with int() before use.

    Reads the module-level collection size D.
    """
    inverse_doc_freq = math.log(D / int(df))
    weighted = tf * inverse_doc_freq
    return float(weighted)

def DirichletSmoothing(tf, ttf, length, mu=None, collection_length=None):
    """Smoothed probability of a term in a document.

    Computes (tf + mu * ttf / collection_length) / (length + mu). This is
    algebraically the same as interpolating tf/length and
    ttf/collection_length with weights length/(length+mu) and mu/(length+mu),
    but it never divides by the document length, so a zero-length document
    no longer raises ZeroDivisionError.

    tf                -- occurrences of the term in the document.
    ttf               -- occurrences of the term in the whole collection.
    length            -- document length.
    mu                -- smoothing weight; defaults to the global avgD.
    collection_length -- total collection length; defaults to the global len_D.

    Returns 0.0 when the collection is empty or length + mu is zero.
    """
    tf = float(tf)
    ttf = float(ttf)
    length = float(length)
    mu = float(avgD if mu is None else mu)
    collection_length = float(len_D if collection_length is None else collection_length)

    denominator = length + mu
    if denominator == 0 or collection_length == 0:
        # No evidence at all (empty document and empty collection).
        return 0.0

    collection_prob = ttf / collection_length
    return (tf + mu * collection_prob) / denominator

def OkapiBM25(tf, df, length, qtf=1, k1=1.2, k2=1000, b=0.75,
              num_docs=None, avg_len=None):
    """Okapi BM25 contribution of a single query term for one document.

    score = log((N + 0.5) / (df + 0.5))
            * ((1 + k1) * tf)  / (K + tf)
            * ((1 + k2) * qtf) / (k2 + qtf)
    with K = k1 * ((1 - b) + b * length / avg_len).

    The last factor depends on the *query* term frequency. It previously
    reused the document tf, which multiplied the score by roughly tf and
    cancelled the saturation provided by the k1 factor. With the default
    qtf=1 the factor is exactly 1, which matches a caller that adds one
    contribution per query-word occurrence.

    tf       -- term-frequency weight of the term in the document.
    df       -- document frequency of the term.
    length   -- document length.
    qtf      -- frequency of the term in the query (default 1).
    k1, k2, b -- BM25 tuning constants (defaults match the former literals).
    num_docs -- collection size N; defaults to the global D.
    avg_len  -- average document length; defaults to the global avgD.
    """
    tf = float(tf)
    df = float(df)
    qtf = float(qtf)
    if num_docs is None:
        num_docs = D
    if avg_len is None:
        avg_len = avgD

    # Guard the length normalisation against an empty collection.
    length_ratio = float(length) / avg_len if avg_len else 0.0
    K = k1 * ((1 - b) + (b * length_ratio))

    idf = math.log((num_docs + 0.5) / (df + 0.5))
    doc_part = ((1 + k1) * tf) / (K + tf)
    query_part = ((1 + k2) * qtf) / (k2 + qtf)
    return idf * doc_part * query_part

In [46]:
def getTermID(term):
    """Return the id stored for `term` in term_dict, or None if it is absent."""
    return term_dict.get(term)

def DocLength(doc_id):
    """Return the length of a document as recorded in doc_dict.

    The length is the sum of len(posting) over every term entry stored for
    int(doc_id). A document that has no entry in doc_dict (for example one
    listed in docids.txt with no line in doc_index.txt) has length 0 instead
    of raising KeyError.
    """
    term_postings = doc_dict.get(int(doc_id), {})
    return sum(len(posting) for posting in term_postings.values())

def TermInfo(term_id):
    """Return the tuple stored for `term_id` in info_dict.

    Raises KeyError when the id has no entry.
    """
    term_record = info_dict[term_id]
    return term_record

# Parsed postings, keyed by (offset, term). Each value maps doc id -> number of
# postings for that document. Entries are never invalidated: re-run this cell
# if term_index.txt is regenerated while the kernel is alive.
_POSTINGS_TF_CACHE = {}


def _load_term_frequencies(offset, term):
    """Parse the term_index.txt line starting at `offset` into {doc_id: tf}.

    Returns an empty dict when the line is empty or its first field is not
    `term`.
    """
    with open('term_index.txt', 'r', encoding='utf-8') as f:
        f.seek(int(offset))
        fields = f.readline().split()

    counts = {}
    if not fields or fields[0] != term:
        return counts

    doc_id = 0
    for posting in fields[1:]:
        # Each posting is "<doc delta>:<position>"; doc ids are delta-encoded,
        # so the running sum recovers the absolute document id.
        doc_id += int(posting.split(":")[0])
        counts[doc_id] = counts.get(doc_id, 0) + 1
    return counts


def Term_Freq(offset, term, doc):
    """Return how many postings `term` has for document `doc`.

    offset -- position of the term's line in term_index.txt (int or str).
    term   -- term id as it appears in the first field of that line.
    doc    -- document id (int or numeric str).

    The postings line is read and parsed once per (offset, term) and reused
    for later calls, instead of reopening the file for every document.
    Returns 0 when the line at `offset` is empty or belongs to another term.
    """
    key = (int(offset), term)
    if key not in _POSTINGS_TF_CACHE:
        _POSTINGS_TF_CACHE[key] = _load_term_frequencies(offset, term)
    return _POSTINGS_TF_CACHE[key].get(int(doc), 0)

def Normalize(num):
    """Log-scale a count: 1 + ln(num), with zero passed through unchanged."""
    if num != 0:
        return 1 + math.log(num)
    return num

def score(queries, method):
    """Score every document in doc_ids against every query.

    queries -- iterable of queries, each a list of words.
    method  -- 'TF-IDF', 'Okapi BM25' or 'Dirichlet Smoothing'.

    Returns {query number (1-based): {doc_id: score}}.

    TF-IDF and Okapi BM25 add one contribution per known query word, using
    the log-normalised term frequency. Dirichlet Smoothing multiplies the
    per-word probabilities, starting from 1. Words without a term id are
    ignored.

    Raises ValueError for an unrecognised method; previously a mistyped
    method name silently produced a score of 0 for every document.
    """
    known_methods = ('TF-IDF', 'Okapi BM25', 'Dirichlet Smoothing')
    if method not in known_methods:
        raise ValueError(
            f"Unknown scoring method {method!r}; expected one of {known_methods}"
        )
    multiplicative = method == 'Dirichlet Smoothing'

    scores = {}
    for query_number, query in enumerate(queries, start=1):
        # Term id and term statistics do not depend on the document, so they
        # are resolved once per query rather than once per (query, document).
        query_terms = []
        for word in query:
            term_id = getTermID(word)
            if term_id:
                offset, ttf, df = TermInfo(term_id)
                query_terms.append((term_id, offset, ttf, df))

        doc_scores = {}
        for doc_id in doc_ids:
            length = DocLength(doc_id)
            doc_score = 1 if multiplicative else 0
            for term_id, offset, ttf, df in query_terms:
                tf = Term_Freq(offset, term_id, doc_id)
                if multiplicative:
                    doc_score *= DirichletSmoothing(tf, ttf, length)
                elif method == 'TF-IDF':
                    doc_score += TF_IDF(Normalize(tf), df)
                else:
                    doc_score += OkapiBM25(Normalize(tf), df, length)
            doc_scores[doc_id] = doc_score
        scores[query_number] = doc_scores
    return scores

In [47]:
# Stop words, one per line. The file is opened in a `with` block so the handle
# is closed deterministically (it was previously left open).
with open('Urdu stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = set(f.read().splitlines())

# One query per line of queries.txt. Characters that are neither word
# characters, whitespace, nor in the listed character range are removed before
# whitespace tokenisation; stop words are then dropped. Blank lines still
# produce an (empty) query, so query numbers keep following line numbers.
queries = []
with open("queries.txt", "r", encoding='utf-8') as f:
    for line in f:
        line = re.sub(r'[^\w\s؀-ۿ]', '', line)
        queries.append([word for word in line.split() if word not in stop_words])
queries_num = len(queries)

method = input("Enter Scoring Method: ")
score_dict = score(queries, method)

In [48]:
# One independent list per query. `[[]] * n` would put the *same* list object
# in every slot, which is only harmless while every slot gets reassigned.
Ranked_Docs = [[] for _ in range(queries_num)]

# NOTE(review): the output name is fixed even though `method` is chosen at run
# time, so TF-IDF / BM25 runs also land in this file -- confirm that is intended.
with open('score_DirichletSmoothing.txt', 'w', encoding='utf-8') as f:
    for q_id, doc_score in score_dict.items():
        # Document ids ordered by descending score.
        ranked_ids = sorted(doc_score, key=doc_score.get, reverse=True)
        # `ranked_id` rather than `id`: a module-level `id` would shadow the
        # builtin for every later cell in the kernel.
        for rank, ranked_id in enumerate(ranked_ids, start=1):
            f.write(f"{q_id}\t{ranked_id}\t{rank}\t{doc_score[ranked_id]}\n")
        Ranked_Docs[q_id-1] = ranked_ids

# Precision @ k

In [49]:
# Every sheet of the workbook is read; sheet_name=None returns {sheet name: frame}.
dfs = pd.read_excel('qrels.xlsx', sheet_name=None)

qrels_columns = ['Topic Id', 'Doc Id', 'Doc Relevancy']

# The columns are picked by position (2nd to 4th), so they are also *named* by
# position. Relying on each sheet's own header text would misalign the rows of
# any sheet whose headers differ from the names used downstream.
sheet_frames = []
for sheet_name, sheet_df in dfs.items():
    selected_columns = sheet_df.iloc[:, [1, 2, 3]].copy()
    selected_columns.columns = qrels_columns
    sheet_frames.append(selected_columns)

# Concatenate once instead of growing a frame inside the loop.
if sheet_frames:
    result = pd.concat(sheet_frames, axis=0, ignore_index=True)
else:
    result = pd.DataFrame(columns=qrels_columns)

In [50]:
# Actual_Ranks: {topic id: {doc id: 1 if the relevancy grade exceeds 2, else 0}}
Actual_Ranks = {}
for _, qrel in result.iterrows():
    topic_judgments = Actual_Ranks.setdefault(qrel['Topic Id'], {})
    topic_judgments[qrel['Doc Id']] = 1 if qrel['Doc Relevancy'] > 2 else 0

In [60]:
def precision_at_k(k, ranked_docs=None, actual_ranks=None):
    """Print precision@k for every query.

    k            -- cut-off; must be a positive integer.
    ranked_docs  -- list of ranked doc-id lists, query 1 first; defaults to
                    the global Ranked_Docs.
    actual_ranks -- {topic: {doc_id: 0/1}} judgments; defaults to the global
                    Actual_Ranks.

    For each query, counts the documents among the first k ranked ids whose
    judgment equals 1 and divides by k (also when fewer than k documents are
    ranked). A query without any judgments gets 0.0 instead of raising
    KeyError. Prints a header line and the list of precisions; returns None.

    Raises ValueError if k is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    if ranked_docs is None:
        ranked_docs = Ranked_Docs
    if actual_ranks is None:
        actual_ranks = Actual_Ranks

    print("Precision @ ",k)
    pre = []
    for topic, rank_list in enumerate(ranked_docs, start=1):
        judgments = actual_ranks.get(topic, {})
        hits = sum(1 for doc_id in rank_list[:k] if judgments.get(int(doc_id)) == 1)
        pre.append(hits / k)
    print(pre)

# Report precision at each cut-off in turn.
for cutoff in (5, 10, 20, 30):
    precision_at_k(cutoff)

Precision @  5
[0.2, 0.0, 0.0, 0.6, 0.2, 0.4, 0.0, 0.0, 0.6, 0.0]
Precision @  10
[0.1, 0.0, 0.0, 0.3, 0.1, 0.2, 0.0, 0.0, 0.3, 0.0]
Precision @  20
[0.05, 0.0, 0.0, 0.15, 0.05, 0.1, 0.0, 0.0, 0.15, 0.0]
Precision @  30
[0.03333333333333333, 0.0, 0.0, 0.1, 0.03333333333333333, 0.06666666666666667, 0.0, 0.0, 0.1, 0.0]


# Mean Average Precision

In [53]:
def mean_average_precision(num_docs, output_file, ranked_docs=None, actual_ranks=None):
    """Write per-query average precision to a file and return their mean.

    num_docs     -- how many top-ranked documents to examine per query.
    output_file  -- path of the text file to (over)write, one
                    "Query <n>: <AP>" line per evaluated query.
    ranked_docs  -- list of ranked doc-id lists, query 1 first; defaults to
                    the global Ranked_Docs.
    actual_ranks -- {topic: {doc_id: 0/1}} judgments; defaults to the global
                    Actual_Ranks.

    Average precision of a query is the sum of precision@rank over the ranks
    (within the first num_docs) holding a relevant document, divided by the
    number of relevant documents judged for that query. Queries with no
    relevant judgment are skipped.

    Returns the mean of the written values, or 0.0 if no query was evaluated.

    Fixes over the previous version: the accumulator is reset for every
    query (it used to carry over, inflating every query after the first),
    rankings shorter than num_docs no longer raise IndexError, topics
    missing from the judgments no longer raise KeyError, and the mean is
    actually computed.
    """
    if ranked_docs is None:
        ranked_docs = Ranked_Docs
    if actual_ranks is None:
        actual_ranks = Actual_Ranks

    average_precisions = []
    with open(output_file, 'w') as f:
        for i, rank_list in enumerate(ranked_docs, 1):
            relevant_docs = {
                doc for doc, relevance in actual_ranks.get(i, {}).items()
                if relevance == 1
            }
            if not relevant_docs:
                continue

            hits = 0
            precision_sum = 0.0
            # Slicing tolerates rankings shorter than num_docs.
            for rank, doc_id in enumerate(rank_list[:num_docs], start=1):
                if int(doc_id) in relevant_docs:
                    hits += 1
                    precision_sum += hits / rank

            avg_precision = precision_sum / len(relevant_docs)
            average_precisions.append(avg_precision)
            f.write("Query "+str(i)+": "+str(avg_precision) + '\n')

    if not average_precisions:
        return 0.0
    return sum(average_precisions) / len(average_precisions)

# Evaluate the first 500 ranked documents of every query; the per-query values
# are written to DirichletSmoothing_MAP.txt.
mean_average_precision(500,"DirichletSmoothing_MAP.txt")