In [1]:
# Dependencies
!pip install pandas
import pandas as pd
import numpy as np
import glob

zsh:1: /opt/homebrew/bin/pip: bad interpreter: /opt/homebrew/opt/python@3.9/bin/python3.9: no such file or directory


In [2]:
# Load the qrels file
qrels_file_path = '../qrels.trec8.adhoc.csv'
qrels_df = pd.read_csv(qrels_file_path, sep=' ', header=None, names=['query_id', 'ignore', 'doc_id', 'is_relevant'])

In [3]:
# Load input files
input_files = glob.glob('../inputs/*.csv')

# Create an empty list to store the input files dataframes
input_dfs = []

# Loop through each input CSV file
for filename in input_files:
    # Read the input CSV file into a dataframe
    df = pd.read_csv(filename, sep='\t', header=None)
    # Append the dataframe to the list of dataframes
    input_dfs.append(df)
    
raw_data = pd.concat(input_dfs, ignore_index=True)

In [4]:
# Name input dataframe column name
columns = ['query_id', 'ignore', 'doc_id', 'rank', 'score', 'system_name']

# Create an empty list to store the cleaned dataframes
cleaned_dfs_with_names = []

# Iterate over each cleaned dataframe in cleaned_dfs
for df in input_dfs:
    # Rename the columns
    df = df.rename(columns=dict(enumerate(columns)))
    # Append the renamed dataframe to the new list
    cleaned_dfs_with_names.append(df)

In [5]:
# Merge qrels and input dataframe
merged_df = []
for df in cleaned_dfs_with_names:
    cleaned_df_with_qrel = pd.merge(df, qrels_df, on=['query_id','doc_id'], how='left')
    cleaned_df_with_qrel['is_relevant'] = cleaned_df_with_qrel['is_relevant'].fillna(0)
    merged_df.append(cleaned_df_with_qrel)

In [6]:
# MAP Calculation
ap_depth_5 = []
ap_depth_10 = []
ap_depth_20 = []

def average_precision_at_k(x, depth):
    is_relevant = x['is_relevant'].head(depth) == 1
    precision = np.cumsum(is_relevant) / np.arange(1, len(is_relevant) + 1)
    average_precision = np.sum(precision * is_relevant) / np.minimum(depth, np.sum(is_relevant))
    return  np.nan_to_num(average_precision, nan=0.00000)

for index, df in enumerate(merged_df):
    # group by query_id
    grouped = df.groupby('query_id')

    # calculate average precision
    avg_precisions_5 = grouped.apply(lambda x: average_precision_at_k(x, 5))
    ap_depth_5.append(avg_precisions_5)
    
    avg_precisions_10 = grouped.apply(lambda x: average_precision_at_k(x, 10))
    ap_depth_10.append(avg_precisions_10)
    
    avg_precisions_20 = grouped.apply(lambda x: average_precision_at_k(x, 20))
    ap_depth_20.append(avg_precisions_20)

  average_precision = np.sum(precision * is_relevant) / np.minimum(k, np.sum(is_relevant))


In [7]:
# Define system name
column_names = [df['system_name'].iloc[0] for df in merged_df]

ap_5_results_df = pd.concat([pd.DataFrame(ap_depth_5).T],  axis=1)
ap_5_results_df.columns = [name for i, name in enumerate(column_names)]

ap_10_results_df = pd.concat([pd.DataFrame(ap_depth_10).T],  axis=1)
ap_10_results_df.columns = [name for i, name in enumerate(column_names)]

ap_20_results_df = pd.concat([pd.DataFrame(ap_depth_20).T],  axis=1)
ap_20_results_df.columns = [name for i, name in enumerate(column_names)]

In [8]:
### Calculate overall value

# Calculate the overall MAP by topic
overall_ap_5_by_topic = ap_5_results_df.mean(axis=1)
ap_5_results_df['Overall (Topic)'] = overall_ap_5_by_topic

overall_ap_10_by_topic = ap_10_results_df.mean(axis=1)
ap_10_results_df['Overall (Topic)'] = overall_ap_10_by_topic

overall_ap_20_by_topic = ap_20_results_df.mean(axis=1)
ap_20_results_df['Overall (Topic)'] = overall_ap_20_by_topic

# Calculate the overall MAP by system
overall_ap_5_by_system = ap_5_results_df.mean(axis=0)
ap_5_results_df.loc["Overall (System)"] = overall_ap_5_by_system

overall_ap_10_by_system = ap_10_results_df.mean(axis=0)
ap_10_results_df.loc["Overall (System)"] = overall_ap_10_by_system

overall_ap_20_by_system = ap_20_results_df.mean(axis=0)
ap_20_results_df.loc["Overall (System)"] = overall_ap_20_by_system

# Store
ap_5_results_df.to_csv('../outputs/map@5.csv')
ap_10_results_df.to_csv('../outputs/map@10.csv')
ap_20_results_df.to_csv('../outputs/map@20.csv')