In [1]:
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
import json
import zipfile

In [23]:
dataset_fname = 'marco-train-100k-gpt3.5.json'
original_dataset_dir = "./../../data/runs/training/rankgpt-sun/"
eval_dataset_dir = "./../../data/runs/training/rankgpt-sun-eval/"
train_dataset_dir = "./../../data/runs/training/rankgpt-sun-train/"
processed_dataset_path = "./../../data/runs/training/rankgpt-sun/__rankgptsun__msmarco-passage-train-judged.run"
train_dataset_fpath = "./../../data/runs/training/rankgpt-sun-train/__rankgptsun__msmarco-passage-train-judged.run"
eval_dataset_fpath = "./../../data/runs/training/rankgpt-sun-eval/__rankgptsun__msmarco-passage-train-judged.run"

eval_size = 0.1
seed = 42

In [3]:
def load_rankgpt_outputs(fpath: str):
    with open(fpath, 'r') as file:
        data = json.load(file)
    return data

In [4]:
retrieved_docs_per_query = {}

def load_rankgpt_dataset(zip_fpath, jsonl_fname):
    with zipfile.ZipFile(zip_fpath, 'r') as z: # open zip
        with z.open(jsonl_fname) as jsonl_file: # open jsonl
            q_num = 0
            for line in jsonl_file:
                q_num = q_num + 1
                json_obj = json.loads(line.decode('utf-8'))
                for doc in json_obj["retrieved_passages"]:
                    if q_num not in retrieved_docs_per_query:
                        retrieved_docs_per_query[q_num] = {"qid": json_obj["query_id"], "retrieved_docs": {}}
                    
                    rank = doc["rank"]
                    docid = doc["docid"]
                    retrieved_docs_per_query[q_num]["retrieved_docs"][rank] = docid

In [5]:
def from_rankgpt_output_to_list(formatted_string: str, q_num: int):
    list_of_strings = formatted_string.replace('[', '').replace(']', '').split(' > ')
    list_of_integers = []
    for num in list_of_strings: # form list of strings, to list of int
        try:
            list_of_integers.append(int(num))
        except ValueError:
            print(f"Invalid entry for {q_num}-th query: ", formatted_string)
            return [] # skip the invalid entry
        
    if len(list_of_integers) != 20:
        print(f"Invalid entry for {q_num}-th query: ", formatted_string)
        return [] # skip the invalid entry
    return list_of_integers

In [6]:
zip_file_path = original_dataset_dir+ "marco-train-100k.jsonl.zip"
jsonl_file_name = "marco-train-100k.jsonl"
load_rankgpt_dataset(zip_file_path, jsonl_file_name)

In [7]:
# load outputs from rankgpt
rankgpt_outputs_fpath = original_dataset_dir+dataset_fname
rankgpt_outputs = load_rankgpt_outputs(rankgpt_outputs_fpath)
# an example of output is:
# '[5] > [18] > [14] > [1] > [15] > [3] > [4] > [13] > [7] > [6] > [12] > [9] > [10] > [11] > [16] > [2] > [19] > [20] > [8] > [17]'

In [8]:
all_run_entries = []

In [9]:
for i, rankgpt_output in enumerate(rankgpt_outputs):
    q_num = i +1
    
    #print(f"processing data[{q_num}]=", rankgpt_output)
    run = from_rankgpt_output_to_list(rankgpt_output, q_num)
    #print("parsed run= ", run)
    
    for rank, retrieved_docid in enumerate(run):
        rank = rank +1
        score = 1/rank
        true_docid = retrieved_docs_per_query[q_num]["retrieved_docs"][retrieved_docid]
        true_qid = retrieved_docs_per_query[q_num]["qid"]
        csv_entry =  [true_qid, 0, true_docid, rank, 1/rank, "RankGPT3.5"]
        all_run_entries.append(csv_entry)
        
    if q_num % 10000 == 0:
        print(f"processed {q_num} queries")
        #print("qid: ", true_qid, "\tret_id: ", retrieved_docid, "\trank: ", rank, "\tscore: ", score, "\ttrue_docid: ", true_docid)

Invalid entry for 9-th query:  [2] > [4] > [1] > [6] > [7] > [8] > [9] > [15] > [18] > [16] > [11] > [10] > [13] > [5] > [19] > [12] > [20] > [14] > [3]
Invalid entry for 27-th query:  [1] > [2] > [4] > [5] > [6] > [3] > [7] > [8] > [9] > [11] > [12] > [13] > [15] > [17] > [18] > [19] > [14] > [20] > [10]
Invalid entry for 33-th query:  [1] > [2] > [3] > [4] > [5] > [6] > [7] > [9] > [10] > [11] > [12] > [13] > [14] > [16] > [17] > [15] > [18] > [19] > [20]
Invalid entry for 37-th query:  [7] > [13] > [15] > [20] > [2] > [3] > [1] > [5] > [6] > [9] > [8] > [14] > [11] > [12] > [18] > [17] > [10] > [4] > [19]
Invalid entry for 48-th query:  [1] > [5] > [2] > [3] > [11] > [20] > [4] > [6] > [7] > [13] > [18] > [19] > [9] > [14] > [12] > [10] > [17] > [15] > [16]
Invalid entry for 50-th query:  [16] > [19] > [5] > [3] > [7] > [8] > [14] > [12] > [9] > [6] > [2] > [1] > [10] > [17] > [15] > [4] > [18] > [11] > [20]
Invalid entry for 71-th query:  [2] > [3] > [9] > [10] > [12] > [13] > [15]

In [10]:
len(all_run_entries)/20

90699.0

In [11]:
def save_dataset_to_file(dataset_df, fpath):
    dataset_df.to_csv(fpath, index=False, sep='\t', header=False)

In [12]:
df = pd.DataFrame(all_run_entries)
df.head

<bound method NDFrame.head of               0  1        2   3         4           5
0        653897  0  8600899   1  1.000000  RankGPT3.5
1        653897  0  4008463   2  0.500000  RankGPT3.5
2        653897  0  2234858   3  0.333333  RankGPT3.5
3        653897  0  2028777   4  0.250000  RankGPT3.5
4        653897  0  8722220   5  0.200000  RankGPT3.5
...         ... ..      ...  ..       ...         ...
1813975  802131  0  7266764  16  0.062500  RankGPT3.5
1813976  802131  0  8622644  17  0.058824  RankGPT3.5
1813977  802131  0  3712400  18  0.055556  RankGPT3.5
1813978  802131  0  7266762  19  0.052632  RankGPT3.5
1813979  802131  0  4569218  20  0.050000  RankGPT3.5

[1813980 rows x 6 columns]>

In [13]:
grouped_counts = df.groupby([0]).size().reset_index(name='Count')

In [14]:
filtered_groups = grouped_counts[grouped_counts['Count'] < 20] # search queries with less than 20 scored documents
filtered_groups

Unnamed: 0,0,Count


In [22]:
save_dataset_to_file(df, processed_dataset_path)

In [18]:
def split_train_eval(to_split_df, eval_size=0.01, seed=42):
    qids = to_split_df.iloc[:, 0].unique()
    print(f"Number of queries in the dataset: {len(qids)}")
    # split the dataset in 99%/1% train/eval splits
    train_df, eval_df = train_test_split(qids, test_size=eval_size, random_state=seed)

    print(f"Training set shape: {train_df.shape[0]}")
    print(f"Evaluation set shape: {eval_df.shape[0]}")
    
    train_dataset = to_split_df[to_split_df.iloc[:, 0].isin(train_df)]
    print("Queries selected in the training dataset: ", int(train_dataset.shape[0]/20))
    
    eval_dataset = to_split_df[to_split_df.iloc[:, 0].isin(eval_df)]
    print("Queries selected in the evaluation dataset: ", int(eval_dataset.shape[0]/20))
    
    return train_dataset, eval_dataset

In [19]:
train_dataset, eval_dataset = split_train_eval(df, eval_size, seed)

Number of queries in the dataset: 90699
Training set shape: 89792
Evaluation set shape: 907
Queries selected in the training dataset:  89792
Queries selected in the evaluation dataset:  907


In [24]:
save_dataset_to_file(eval_dataset, eval_dataset_fpath)

In [25]:
save_dataset_to_file(train_dataset, train_dataset_fpath)