# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and NEH. The authors used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

In [None]:
%pip install -r requirements.txt


In [None]:
# Imports
import os
import pandas as pd
import pyterrier as pt
import transformers
import torch
import nltk
import spacy

In [None]:
import os
os.makedirs("data", exist_ok=True)

!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o data/test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o data/train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o data/validation.json

import json

files = ["data/train.json", "data/validation.json", "data/test.json"]


def _preview_record(record, keys_label):
    """Print the keys of one sample record (if it is a dict) and a truncated JSON dump of it."""
    if isinstance(record, dict):
        print(keys_label, list(record.keys()))
    print(json.dumps(record, indent=2)[:600])


# Sanity-check every downloaded split: show the first characters of the file,
# then parse it and describe its top-level structure.
for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Peek at the beginning of the file to see what kind of JSON it is.
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head)
            # json.load cannot parse a fragment, so rewind and parse the whole file.
            f.seek(0)
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: invalid JSON or bad encoding.
        print(f"Could not parse {path} as JSON: {e}")
        continue

    # Structure reporting lives outside the try block so that an unexpected
    # shape is never misreported as a JSON parsing failure.
    if isinstance(data, list):
        print(f"\nLoaded {len(data)} items (list).")
        if data:  # an empty list has no first record to preview
            _preview_record(data[0], "Dictionary keys:")
    elif isinstance(data, dict):
        print("\nTop-level is a dictionary. Keys:", list(data.keys()))
        for k, v in data.items():
            if isinstance(v, list):
                print(f"Key '{k}' contains a list of {len(v)} items.")
                if v:
                    _preview_record(v[0], "First item keys:")
                    # One non-empty list is enough to understand the layout.
                    break
    else:
        print(f"Unexpected top-level type: {type(data)}")

# Create the Document Collection

To build the document collection, we create a new JSON file that contains the `para_id`, `context`, `raw_ocr` and `publication_date` keys for every unique `para_id` in the collection.

`para_id` is the ID of a paragraph of a newspaper page.

In [None]:
import json
import os

# The three dataset splits that are merged into a single document collection.
inputs = ["data/train.json", "data/validation.json", "data/test.json"]
# Destination file for the merged, deduplicated collection.
output = "data/document_collection.json"

def load_list_or_empty(path):
    """Return the JSON list stored at *path*, or ``[]`` when it cannot be used.

    A file is skipped (with a printed explanation) when it does not exist,
    is zero bytes long, is not valid JSON, or does not hold a list at the
    top level.
    """
    has_content = os.path.exists(path) and os.path.getsize(path) > 0
    if not has_content:
        print(f"Skipping {path} because it is missing or empty")
        return []

    try:
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

    if not isinstance(payload, list):
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    return payload

def project(recs):
    """Reduce every record to the four document-collection fields.

    Each output dict holds ``para_id``, ``context``, ``raw_ocr`` and
    ``publication_date`` (in that order); a field missing from the input
    record is filled with an empty string.
    """
    kept_fields = ("para_id", "context", "raw_ocr", "publication_date")
    return [{name: r.get(name, "") for name in kept_fields} for r in recs]

# Gather the projected records of every split into one list.
all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs += project(recs)

# Deduplicate by para_id: the first record seen for an id wins, and records
# without a para_id are dropped.
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if pid:
        uniq.setdefault(pid, rec)

result = list(uniq.values())

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
# Show the first three records as a quick visual check.
print(json.dumps(result[:3], indent=2))

## You should check that the collection you have matches that of the paper!

In [None]:
import pandas as pd
# Print the (rows, columns) shape of every split so it can be compared with
# the numbers reported for the dataset.
for path in inputs:
    # pd.read_json opens and parses the file itself; a separate json.load
    # would only parse each (large) file a second time for nothing.
    df_check = pd.read_json(path)
    print(f'Shape of {path}: {df_check.shape}')

The dimensions match the ones of the paper at https://github.com/DataScienceUIBK/ChroniclingAmericaQA

# Create the Test Queries Data Structure

We keep only the first 10,000 queries (sorted by `query_id`) because the full set causes memory errors on the free Colab tier.

To keep results comparable, please also use these first 10,000 queries for evaluation.

In [None]:
import json
import re
import unicodedata
import string

# Source split and destination file for the cleaned test queries.
input_file = "data/test.json"
output_file = "data/test_queries.json"

# Load the data
# The whole test split is parsed into memory; the query extraction below iterates over it.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    """Return a punctuation-free, whitespace-normalized version of a question.

    The text is NFKC-normalized, every ASCII punctuation character is
    replaced by a space, runs of whitespace are collapsed to one space and
    the result is stripped. Anything that is not a string yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    normalized = unicodedata.normalize("NFKC", text)
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    # Replace (rather than delete) punctuation so adjacent words stay separated.
    spaced = re.sub(punctuation_class, " ", normalized)
    return re.sub(r"\s+", " ", spaced).strip()

# Extract and clean: keep only the id and the cleaned question text.
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]


def _query_sort_key(entry):
    """Sort key: numeric ids by value first, every other id by its text afterwards.

    Returning tuples of one fixed shape keeps the comparison well-defined when
    numeric and non-numeric ids are mixed (a bare int-or-str key raises
    TypeError in that case). When the ids are all numeric or all non-numeric
    strings, the resulting order is the same as with the bare key.
    """
    qid = str(entry["query_id"])
    if qid.isdigit():
        return (0, int(qid), "")
    return (1, 0, qid)


# Sort by query_id so that the subset selected below is deterministic.
queries = sorted(queries, key=_query_sort_key)

# Keep only the first 10,000
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

# Create the Qrels for the test set

In [None]:
input_file = "data/test.json"
qrels_file = "data/test_qrels.json"
answers_file = "data/test_query_answers.json"

# Load the test split.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# One pass over the split builds both outputs:
#   qrels         -> query_id, iteration=0, para_id, relevance=1
#   query_answers -> the same fields plus answer and org_answer
qrels = []
query_answers = []
for item in data:
    judgment = {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
    }
    qrels.append(judgment)
    query_answers.append({
        **judgment,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", ""),
    })

# Write both files with identical JSON settings.
for target, payload in ((qrels_file, qrels), (answers_file, query_answers)):
    with open(target, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

# Retrieval

### Extract data from json files

In [None]:
input_files = ['data/document_collection.json', 'data/test.json', 'data/test_qrels.json', 'data/test_queries.json', 'data/test_query_answers.json', 'data/train.json', 'data/validation.json']

# Map each file path to a DataFrame with its contents.
dataframes = {}
for input_file in input_files:
    # pd.read_json reads the path directly; opening the file and running
    # json.load first only parsed every file twice and discarded the result.
    dataframes[input_file] = pd.read_json(input_file)

In [None]:
dataframes['data/document_collection.json']

In [None]:
dataframes['data/train.json']

**NOTE: in `data/document_collection.json` the rows are already deduplicated**

### _Preprocessing_

#### **Linguistic Processing**

##### Normalization
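We lowercase everything and remove all special characters/tags. This happens in two steps: the first step (before NER) only applies Unicode normalization, strips HTML tags and collapses whitespace; lowercasing and special-character removal are done in the second step, after NER.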

--> 1st step normalization

In [None]:
def normalize_text1(text):
    """First normalization pass: light clean-up of a document string.

    Applies NFKC Unicode normalization, replaces HTML-like tags with a space
    and collapses whitespace runs into single spaces (stripping the ends).
    Values that are not strings are returned untouched.
    """
    if not isinstance(text, str):
        return text
    cleaned = unicodedata.normalize('NFKC', text)
    # Lowercasing and special-character removal are deliberately disabled in
    # this pass; the two lines below are kept so they can be re-enabled:
    #cleaned = cleaned.lower()
    # cleaned = re.sub(r'[^a-z0-9\s]', '', cleaned)
    without_tags = re.sub(r'<[^>]+>', ' ', cleaned)  # HTML tags
    return re.sub(r'\s+', ' ', without_tags).strip()  # whitespace runs

# If the NER step is ever removed, un-comment the disabled lines in the function above.

# Apply the first normalization pass to both text columns; the results are kept
# as separate Series and docColl_Norm1 starts out as a plain copy of docColl.
docColl = dataframes['data/document_collection.json']
docColl_contNorm1 = docColl['context'].apply(normalize_text1)
docColl_ocrNorm1 = docColl['raw_ocr'].apply(normalize_text1)
docColl_Norm1 = docColl.copy()

In [None]:
docColl_Norm1['context'] = docColl_contNorm1
docColl_Norm1['raw_ocr'] = docColl_ocrNorm1
docColl_Norm1.head(25)

In [None]:
docColl['context'].compare(docColl_Norm1['context'])

In [None]:
print(docColl['context'].iloc[2])
print(docColl_Norm1['context'].iloc[2])

##### NER
We want to identify named-entities before lemmatizing the text, so that we do not lose any entity by "shrinking" words to their base forms.

In [None]:
from transformers import AutoTokenizer, pipeline

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): trust_remote_code=True lets the pipeline run code shipped with
# the model repository — confirm the model source is trusted.
ner_pipeline = pipeline(model=MODEL_NAME, tokenizer=tokenizer, trust_remote_code=True, device="cpu")

# Run NER on the corrected text and on the raw OCR of every document, one
# document at a time, passing the whitespace-split tokens alongside the text.
results_cont, results_ocr = [], []
for context_value, ocr_value in zip(docColl_Norm1['context'], docColl_Norm1['raw_ocr']):
    sentence_cont = str(context_value)
    sentence_ocr = str(ocr_value)
    results_cont.append(ner_pipeline(sentence_cont, tokens=sentence_cont.split()))
    results_ocr.append(ner_pipeline(sentence_ocr, tokens=sentence_ocr.split()))

docColl_Norm1['ner_entities_context'] = results_cont
docColl_Norm1['ner_entities_ocr'] = results_ocr

In [None]:
docColl_ner = docColl_Norm1.copy()
docColl_Norm1[['context', 'raw_ocr', 'ner_entities_context', 'ner_entities_ocr']]

--> 2nd step normalization

In [None]:
def normalize_text2(text):
    """Second normalization pass: aggressive clean-up applied after NER.

    Lowercases the string, deletes every character that is not an ASCII
    lowercase letter, a digit or whitespace (so punctuation and non-ASCII
    letters are removed), then collapses whitespace. Values that are not
    strings are returned untouched.
    """
    if not isinstance(text, str):
        return text
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)  # punctuation & symbols
    return re.sub(r'\s+', ' ', alnum_only).strip()  # whitespace again

# DataFrame.apply hands each *column* (a Series) to the function, and
# normalize_text2 returns anything that is not a str unchanged, so applying it
# directly would leave both columns untouched. Map it over the cells of each
# column instead, so every string is actually normalized.
docColl_Norm2 = docColl_ner[['context', 'raw_ocr']].apply(
    lambda column: column.map(normalize_text2)
)

##### Lemmatization
Lemmatization is placed here to semantically standardize the sentences in the documents.

In [None]:
import spacy

# Load the small English spaCy model once per session; the parser and NER
# components are disabled. If the model is not installed (spacy.load raises
# OSError), download it and load it again.
if 'nlp' not in locals():
    try:
        nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    except OSError:
        from spacy.cli import download
        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

target_key = 'data/document_collection.json'
target_column = 'context'

# Lemmatize one text column of the document collection and store the result
# in a new "<column>_lemma" column of the same DataFrame.
if target_key not in dataframes:
    print("DataFrame non trovato.")
else:
    print(f"üöÄ Avvio lemmatizzazione ottimizzata (Batch Processing) su: {target_key}")
    df = dataframes[target_key]

    if target_column not in df.columns:
        print(f"Errore: Colonna '{target_column}' non trovata. Controlla il nome esatto.")
    else:
        texts = df[target_column].astype(str).tolist()

        print(f"Elaborazione di {len(texts)} documenti...")

        # Stream the documents through the spaCy pipeline in batches and join
        # the lemmas of all non-whitespace tokens back into one string.
        processed_texts = [
            " ".join([token.lemma_ for token in doc if not token.is_space])
            for doc in nlp.pipe(texts, batch_size=2000, n_process=-1)
        ]

        new_col_name = f"{target_column}_lemma"
        df[new_col_name] = processed_texts

        dataframes[target_key] = df

        print(f"‚úÖ Finito! Creata colonna: {new_col_name}")
        print(df[[target_column, new_col_name]].head())

##### N-gram based tokenization
It is important to run this step after normalization. A NER-aware part can be integrated into this tokenization, so that "the tokenization is also entity-guided".

In [None]:

# TODO: extend this so that it also processes the raw_ocr column.
def ner_aware_ngram_tokenizer(row, text_col='lemmatized_context', ner_col='ner_entities', n=2):
    """Build word n-grams from a row's text, keeping multi-word entities together.

    1. Read the text stored under ``text_col``.
    2. Glue multi-word NER entities with underscores ("new york" -> "new_york").
    3. Generate n-grams from the modified text.

    Args:
        row: Mapping-like object exposing ``.get`` (a dict or a pandas Series).
        text_col: Key of the text to tokenize.
        ner_col: Key of the entity list; each entity is expected to be a dict
            whose text is read from the first of the keys ``'word'``,
            ``'entity_group'`` or ``'entity'`` that is present.
        n: Size of the n-grams.

    Returns:
        A list of n-gram strings (tokens joined by a single space). The list
        is empty when the text is missing, blank, or has fewer than ``n``
        tokens.
    """
    text = row.get(text_col, "")
    entities = row.get(ner_col, [])

    if not isinstance(text, str) or not text.strip():
        return []

    # --- Entity gluing ---
    if isinstance(entities, list):
        entity_texts = []
        for ent in entities:
            if not isinstance(ent, dict):
                continue  # ignore malformed entries instead of aborting the gluing
            for key in ('word', 'entity_group', 'entity'):
                if key in ent:
                    if isinstance(ent[key], str):
                        entity_texts.append(ent[key])
                    break

        # Longest entities first, so a shorter entity cannot partially replace
        # a longer one that contains it.
        for ent_text in sorted(entity_texts, key=len, reverse=True):
            clean_ent = ent_text.lower().strip()
            if " " in clean_ent:
                merged_ent = clean_ent.replace(" ", "_")
                # The match is case-sensitive: the text is expected to be
                # lowercased already for the entity to be found.
                text = text.replace(clean_ent, merged_ent)

    # --- Standard whitespace tokenization ---
    tokens = text.split()

    # --- N-gram generation ---
    if len(tokens) < n:
        return []

    # For n=2 this is zip(tokens, tokens[1:]); each tuple is then joined into
    # a string: ("new_york", "is") -> "new_york is".
    return [" ".join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

target_key = 'data/document_collection.json'
# NOTE(review): the lemmatization cell stores its output in f"{target_column}_lemma"
# (i.e. 'context_lemma'), and the NER cell writes 'ner_entities_context' /
# 'ner_entities_ocr' on docColl_Norm1 — confirm that the two column names below
# really exist on this dataframe, otherwise the error branch is always taken.
text_column = 'lemmatized_context'
ner_column = 'ner_entities'

if target_key in dataframes:
    print(f"Initiating N-gram Tokenization (Entity-Aware) on: {target_key}...")
    df = dataframes[target_key]

    if text_column in df.columns and ner_column in df.columns:

        N_VALUE = 2

        print(f"Generating {N_VALUE}-grams...")

        # Row-wise apply: the tokenizer needs both the text and the entity column.
        df['ngrams'] = df.apply(
            lambda row: ner_aware_ngram_tokenizer(row, text_col=text_column, ner_col=ner_column, n=N_VALUE),
            axis=1
        )

        dataframes[target_key] = df

        # Show the configured text column, not a hard-coded name, so the
        # preview keeps working when text_column is changed.
        print(df[[text_column, 'ngrams']].head())

    else:
        print(f"Error: Columns '{text_column}' or '{ner_column}' missing. Check names.")
else:
    print(f"Error: {target_key} not found.")

This step should produce the DataFrame called `docColl_tok`.

### _Multi-field Indexing_

In [None]:
from collections import defaultdict

def create_multi_field_index(df):
    """Build an in-memory inverted index with one posting table per field.

    Fields:
        * ``"raw"``      - lowercased whitespace tokens of the ``raw_ocr`` column
        * ``"clean"``    - lowercased whitespace tokens of the ``context`` column
        * ``"entities"`` - entity texts from the ``ner_entities`` column,
          lowercased, stripped, with inner spaces replaced by underscores

    Args:
        df: DataFrame with one document per row; the dataframe index value of
            a row is used as its document id.

    Returns:
        ``(inverted_index, num_docs)`` where ``inverted_index`` has the shape
        ``{field_name: {term: {doc_id: frequency}}}`` and ``num_docs`` is
        ``len(df)``. The document frequency of a term is the size of its
        posting dict, ``len(inverted_index[field][term])``.
    """
    # The index structure: { field_name: { term: { doc_id: frequency } } }
    inverted_index = {
        "raw": defaultdict(lambda: defaultdict(int)),
        "clean": defaultdict(lambda: defaultdict(int)),
        "entities": defaultdict(lambda: defaultdict(int))
    }

    num_docs = len(df)

    for idx, row in df.iterrows():
        doc_id = idx  # Using dataframe index as Document ID

        # --- Field 1: Raw (from raw_ocr) ---
        for token in str(row.get('raw_ocr', '')).lower().split():
            inverted_index["raw"][token][doc_id] += 1

        # --- Field 2: Clean (from context) ---
        for token in str(row.get('context', '')).lower().split():
            inverted_index["clean"][token][doc_id] += 1

        # --- Field 3: Entities (from ner_entities) ---
        entities_list = row.get('ner_entities', [])
        if not isinstance(entities_list, list):
            continue
        for ent in entities_list:
            if not isinstance(ent, dict):
                continue  # skip malformed entries instead of failing on .get
            # The entity text may be stored under any of these keys.
            ent_text = ent.get('word') or ent.get('entity_group') or ent.get('entity')
            if isinstance(ent_text, str) and ent_text:
                term = ent_text.lower().strip().replace(" ", "_")
                inverted_index["entities"][term][doc_id] += 1

    return inverted_index, num_docs

# Execute Indexing
# NOTE(review): create_multi_field_index reads a 'ner_entities' column, while the NER
# cell stores its output as 'ner_entities_context' / 'ner_entities_ocr' on
# docColl_Norm1 — confirm this dataframe carries the entities, otherwise the
# "entities" field of the index stays empty.
df_target = dataframes['data/document_collection.json']
my_index, total_docs = create_multi_field_index(df_target)

--> Indexing with PyTerrier using a generator

In [None]:
# Here we assume that the cells produced by the NER step hold dict objects.
def createGenerator(df):
    """Yield one indexable document dict per row of *df*.

    Every yielded dict has four string-valued entries:
        * ``docno``       - ``str(row['docno'])``
        * ``text``        - ``row['text']``
        * ``entities``    - the non-empty ``'surface'`` values of the row's
          entities, joined by single spaces (the searchable entity text)
        * ``entity_json`` - the entity dicts, without ``lOffset`` / ``rOffset``,
          serialized as one JSON string
    """
    offset_keys = ('lOffset', 'rOffset')

    for _, row in df.iterrows():
        # Drop the character offsets from every entity dict.
        clean_ents = [
            {key: value for key, value in ent.items() if key not in offset_keys}
            for ent in row['entity_col']
        ]

        # Only 'surface' is made searchable for now. 'name' and 'title' could be
        # added as well; it is still to be decided whether they should be
        # searchable, since 'surface' already holds the text the entity refers to.
        surfaces = [ent.get('surface', '') for ent in clean_ents]

        # Filtering out empty values mainly matters once 'name' and 'title'
        # are collected too.
        ent_text = " ".join(surface for surface in surfaces if surface)

        # Turn all entity metadata into a JSON *string* (not a dict object).
        meta_json = json.dumps(clean_ents)

        # Yielding lets the rows be streamed when the generator is passed to
        # indexer.index(...).
        yield {
            "docno": str(row['docno']),
            "text": row['text'],
            "entities": ent_text,  # searchable entities
            "entity_json": meta_json,
        }

# PyTerrier indexer fed by an iterator of dicts. Presumably "entity_index" is the
# on-disk index location, `fields` are the dict keys whose text gets indexed and
# `meta` the keys stored as retrievable metadata — TODO confirm against the
# installed PyTerrier version.
indexer = pt.IterDictIndexer(
    "entity_index",
    fields=["text", "entities"],
    meta=["docno", "entity_json"])

# NOTE(review): docColl_tok is not defined in any cell shown here; it is expected to
# come out of the tokenization step and must provide the 'docno', 'text' and
# 'entity_col' columns that createGenerator reads.
index_ref = indexer.index(createGenerator(docColl_tok))


## Phase I

### **BM25 Retrieval from raw OCR (baseline 1)**

### **BM25 Retrieval from corrected OCR (baseline 2)**

### **BM25 Retrieval from both raw and corrected OCR using RRF formula (baseline 3)**