# Data Preparation

## Converting Summaries to the `inputs.jsonl` Format
This section creates a new JSONL based on `inputs.jsonl` (fields: `query_id`, `doc_id`, `query`, `text`). It preserves `query_id`, `doc_id`, `query`, and the original `text` from the inputs, and adds a `text_summary` field holding the summary from the summaries JSONL (`summarisation_result`), matched on `doc_id`. Each output row therefore has the fields `query_id`, `doc_id`, `query`, `text_summary`, `text`.

In [None]:
from pathlib import Path
import json
import pandas as pd
import os

# Paths (edit if needed)
dataset_name = 'dl-2020'
tokens_file = '80tokens'
inputs_path = Path(f'../data/msmarco-passage-trec-{dataset_name}-judged/inputs.jsonl')
summaries_path = Path(f'summarisation_outputs/gpt_summaries_{dataset_name}_gpt-4o_{tokens_file}.jsonl')
output_path = Path(f'../data/msmarco-passage-trec-{dataset_name}-judged/inputs_from_summaries_{dataset_name}_{tokens_file}.jsonl')

# Column of the summaries JSONL that holds the summary text
text_col = "summarisation_result"


def dedupe_summaries(sums, text_col):
    """Reduce a summaries frame to a single row per ``doc_id``.

    Rows whose ``text_col`` is missing are dropped. When several rows share a
    ``doc_id``, the one with the longest summary (by character count) is kept.

    Args:
        sums: DataFrame containing at least ``doc_id`` and ``text_col``.
        text_col: Name of the column holding the summary text.

    Returns:
        DataFrame with exactly the columns ``doc_id`` and ``text_col`` and
        unique ``doc_id`` values.
    """
    # Compute the length on the already-filtered frame so the helper column
    # does not depend on index alignment with the unfiltered input.
    non_null = sums.dropna(subset=[text_col])
    return (
        non_null.assign(_len=non_null[text_col].astype(str).str.len())
                .sort_values(['doc_id', '_len'])
                # Rows are sorted by ascending length, so 'last' is the longest.
                .drop_duplicates(subset=['doc_id'], keep='last')
                .drop(columns=['_len'])
                [['doc_id', text_col]]
    )


def null_safe_records(df):
    """Convert ``df`` to a list of row dicts with missing values set to ``None``.

    ``json.dumps`` serialises a float NaN as the bare token ``NaN``, which is
    not valid JSON. Mapping missing scalars to ``None`` makes them serialise
    as ``null`` instead.

    Args:
        df: DataFrame whose rows should become JSON-serialisable dicts.

    Returns:
        A list with one dict per row, keyed by column name.
    """
    return [
        {
            key: None if (pd.api.types.is_scalar(value) and pd.isna(value)) else value
            for key, value in row.items()
        }
        for row in df.to_dict(orient='records')
    ]


# The guard is true when this cell runs in a notebook kernel or as a script,
# so the pipeline executes as before and every name below stays a global.
# It only keeps the file I/O from running if the code is imported.
if __name__ == "__main__":
    # Load inputs.jsonl
    inputs = pd.read_json(inputs_path, lines=True, dtype={'doc_id': str, 'query_id': str})
    inputs['doc_id'] = inputs['doc_id'].astype(str)

    # Load summaries JSONL (expects doc_id and summarisation_result)
    sums = pd.read_json(summaries_path, lines=True, dtype={'doc_id': str})
    sums['doc_id'] = sums['doc_id'].astype(str)

    # Reduce summaries to a single row per doc_id to avoid many-to-many expansion
    sums_clean = dedupe_summaries(sums, text_col)

    # Perform a validated left merge (inputs may have many rows per doc_id, summaries must be one).
    # Only the validation error is caught; any other failure propagates unchanged.
    try:
        merged = inputs.merge(sums_clean, on='doc_id', how='left', validate='many_to_one')
    except pd.errors.MergeError as e:
        print(f"Warning: many-to-one validation failed ({e}); proceeding with de-duplicated merge.")
        merged = inputs.merge(sums_clean, on='doc_id', how='left')

    # Sanity check: row count should not increase
    print(f"Row count before: {len(inputs)}, after merge: {len(merged)}")

    missing = merged[text_col].isna().sum()
    print(f"Summaries matched: {len(merged) - missing} / {len(merged)}; missing: {missing}")

    # Build records: the original input fields plus the summary as `text_summary`
    records_df = merged.assign(text_summary=merged[text_col])[['query_id','doc_id','query','text_summary','text']]

    # Optional: keep only rows with a summary
    # records_df = records_df.dropna(subset=['text_summary'])

    # Rows without a summary get text_summary = None (JSON null) rather than NaN
    records = null_safe_records(records_df)

    # Write JSONL
    with output_path.open('w', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')

    print(f"Wrote: {output_path}")

Row count before: 11386, after merge: 11386
Summaries matched: 11386 / 11386; missing: 0
Wrote: ../data/msmarco-passage-trec-dl-2020-judged/inputs_from_summaries_dl-2020_80tokens.jsonl
