In [1]:
# Dependencies
!pip install pandas
import glob
import os

import numpy as np
import pandas as pd

zsh:1: /opt/homebrew/bin/pip: bad interpreter: /opt/homebrew/opt/python@3.9/bin/python3.9: no such file or directory


In [2]:
# Read the relevance judgements (qrels): space-separated, no header row
qrels_file_path = '../qrels.trec8.adhoc.csv'
qrels_columns = ['query_id', 'ignore', 'doc_id', 'is_relevant']
qrels_df = pd.read_csv(
    qrels_file_path,
    sep=' ',
    header=None,
    names=qrels_columns,
)

In [3]:
# Load input files
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the order of input_dfs (and everything derived from it) stable across runs.
input_files = sorted(glob.glob('../inputs/*.csv'))

# Fail early with a clear message instead of pd.concat's
# "No objects to concatenate" when nothing matched.
if not input_files:
    raise FileNotFoundError("No input files matched '../inputs/*.csv'")

# One dataframe per input file (tab-separated, no header row)
input_dfs = [pd.read_csv(filename, sep='\t', header=None) for filename in input_files]

# All input files stacked into a single dataframe with a fresh 0..n-1 index
raw_data = pd.concat(input_dfs, ignore_index=True)

In [4]:
# Column labels for the header-less input dataframes, in positional order
columns = ['query_id', 'ignore', 'doc_id', 'rank', 'score', 'system_name']

# Map each positional column index (0, 1, 2, ...) to its label
position_to_name = {position: name for position, name in enumerate(columns)}

# Renamed copy of every dataframe in input_dfs; rename() returns a new frame,
# so the entries of input_dfs keep their positional column labels
cleaned_dfs_with_names = [
    frame.rename(columns=position_to_name) for frame in input_dfs
]

In [5]:
# Merge qrels and input dataframe
merge_keys = ['query_id', 'doc_id']


def _attach_relevance(run_df):
    """Left-join qrels onto one run; rows without a qrels match get is_relevant = 0."""
    joined = pd.merge(run_df, qrels_df, how='left', on=merge_keys)
    joined['is_relevant'] = joined['is_relevant'].fillna(0)
    return joined


# One merged dataframe per run, in the same order as cleaned_dfs_with_names
merged_df = [_attach_relevance(run_df) for run_df in cleaned_dfs_with_names]

In [6]:
# Top Precision Calculation
# Accumulators for the per-run precision@10 and precision@100 results
top_10_precisions, top_100_precisions = [], []

def calculate_precision_k(x, k):
    """Return precision@k for the rows retrieved for one query.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single query. Must contain an ``is_relevant`` column. When a
        ``rank`` column is present the rows are ordered by it (ascending)
        before the cutoff is applied; otherwise the existing row order is used.
    k : int
        Cutoff; must be at least 1.

    Returns
    -------
    float
        Count of the first ``k`` rows whose ``is_relevant`` equals 1, divided
        by ``k``. The divisor is always ``k``, even if fewer than ``k`` rows
        are available.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if 'rank' in x.columns:
        # Do not rely on file order being rank order. mergesort is stable, so
        # rows with tied ranks keep their original relative order.
        x = x.sort_values('rank', kind='mergesort')
    return np.sum(x['is_relevant'].head(k) == 1) / k

for run_df in merged_df:
    # Evaluate each query of this run separately
    per_query = run_df.groupby('query_id')

    # Extra keyword arguments to apply() are forwarded to calculate_precision_k,
    # giving one precision value per query_id at each cutoff
    top_10_precisions.append(per_query.apply(calculate_precision_k, k=10))
    top_100_precisions.append(per_query.apply(calculate_precision_k, k=100))


In [7]:
# System name of each run, read from the first row of its merged dataframe
column_names = [run_df['system_name'].iloc[0] for run_df in merged_df]


def _per_system_table(per_system_precisions, system_names):
    """Build a table with one column per system and one row per query_id."""
    # Each Series becomes a row of the intermediate frame; transpose so that
    # every system ends up as a column instead
    table = pd.DataFrame(per_system_precisions).T
    table.columns = list(system_names)
    return table


precision10_results_df = _per_system_table(top_10_precisions, column_names)
precision100_results_df = _per_system_table(top_100_precisions, column_names)

In [8]:
### Calculate overall value

# Per-topic mean across all system columns, stored as an extra column
overall_precision_by_topic_10 = precision10_results_df.mean(axis=1)
precision10_results_df['Overall (Topic)'] = overall_precision_by_topic_10

overall_precision_by_topic_100 = precision100_results_df.mean(axis=1)
precision100_results_df['Overall (Topic)'] = overall_precision_by_topic_100

# Per-system mean across all topic rows, stored as an extra row. It is taken
# after 'Overall (Topic)' was added, so that column gets a mean as well.
overall_precision_by_system_10 = precision10_results_df.mean(axis=0)
precision10_results_df.loc["Overall (System)"] = overall_precision_by_system_10

overall_precision_by_system_100 = precision100_results_df.mean(axis=0)
precision100_results_df.loc["Overall (System)"] = overall_precision_by_system_100

# Store
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs('../outputs', exist_ok=True)
precision10_results_df.to_csv('../outputs/topPrecision@10.csv')
precision100_results_df.to_csv('../outputs/topPrecision@100.csv')