# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and the NEH. The dataset authors used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

In [1]:
%pip install -r requirements.txt


Error processing line 1 of /Users/gabrielepinelli/miniconda3/lib/python3.10/site-packages/distutils-precedence.pth:

  Traceback (most recent call last):
    File "/Users/gabrielepinelli/miniconda3/lib/python3.10/site.py", line 195, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named '_distutils_hack'

Remainder of file ignored

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [36]:
# Imports
import os
import pandas as pd
import pyterrier as pt
import transformers
import torch
import nltk
import spacy
import shutil
import matplotlib

ModuleNotFoundError: No module named 'matplotlib'

In [3]:
import os
os.makedirs("data", exist_ok=True)

!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o data/test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o data/train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o data/validation.json

import json

files = ["data/train.json", "data/validation.json", "data/test.json"]

# For every downloaded split: show a short raw preview, then parse the file and
# report its top-level structure together with one sample record.
for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Peek at the first 500 characters to see what kind of JSON it is.
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head)
            # Rewind and parse the WHOLE file: json.load cannot read a JSON
            # document partially.
            f.seek(0)
            data = json.load(f)
        if isinstance(data, list):
            print(f"\nLoaded {len(data)} items (list).")
            # Guard against an empty list / non-dict records, which would
            # otherwise be misreported below as a JSON parsing failure.
            if data:
                if isinstance(data[0], dict):
                    print("Dictionary keys:", list(data[0].keys()))
                print(json.dumps(data[0], indent=2)[:600])
        elif isinstance(data, dict):
            print("\nTop-level is a dictionary. Keys:", list(data.keys()))
            for k, v in data.items():
                if isinstance(v, list):
                    print(f"Key '{k}' contains a list of {len(v)} items.")
                    # Only the first non-empty list of dict records is detailed.
                    if v and isinstance(v[0], dict):
                        print("First item keys:", list(v[0].keys()))
                        print(json.dumps(v[0], indent=2)[:600])
                        break
        else:
            print(f"Unexpected top-level type: {type(data)}")
    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Could not parse {path} as JSON: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1350  100  1350    0     0   4593      0 --:--:-- --:--:-- --:--:--  45910      0 --:--:-- --:--:-- --:--:--     0
100 71.5M  100 71.5M    0     0  48.2M      0  0:00:01  0:00:01 --:--:-- 37.1M0:00:01 --:--:--  101M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1358  100  1358    0     0   9738      0 --:--:-- --:--:-- --:--:--  9769
100 1315M  100 1315M    0     0   104M      0  0:00:12  0:00:12 --:--:--  107M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1350  100  1350    0     0   9903      0 --:--:-- --:--:-- --:--:--  9926
100 71.8M  100 71.8M    0     0  86.2M      0 --:--:-- --

# Create the Document Collection

To do that, we create a new json file that contains the 'para_id', 'context', 'raw_ocr', 'publication_date' keys, for all para_id in the collection.

`para_id` is the ID of a paragraph on a newspaper page.

In [4]:
import json
import os

inputs = ["data/train.json", "data/validation.json", "data/test.json"]
output = "data/document_collection.json"

def load_list_or_empty(path):
    """Return the JSON array stored at ``path``, or ``[]`` if it cannot be used.

    An empty list is returned (and a "Skipping ..." message is printed) when
    the file does not exist, has size zero, is not valid JSON, or holds
    anything other than a list at the top level.
    """
    usable = os.path.exists(path) and os.path.getsize(path) != 0
    if not usable:
        print(f"Skipping {path} because it is missing or empty")
        return []

    try:
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

    if not isinstance(payload, list):
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    return payload

def project(recs):
    """Reduce each record to the four fields kept in the document collection.

    Every returned dict has exactly the keys ``para_id``, ``context``,
    ``raw_ocr`` and ``publication_date`` (in that order); a field missing
    from the input record is filled with an empty string.
    """
    kept_fields = ("para_id", "context", "raw_ocr", "publication_date")
    return [
        {field: record.get(field, "") for field in kept_fields}
        for record in recs
    ]

# Gather the projected records of every split, in the order given by `inputs`.
all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs += project(recs)

# Deduplicate on para_id: the first record seen for an id wins, and records
# whose para_id is empty are dropped.
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if not pid:
        continue
    uniq.setdefault(pid, rec)

result = [*uniq.values()]

# Persist the collection and show the first three records as a sanity check.
with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

Loaded 439302 records from data/train.json
Loaded 24111 records from data/validation.json
Loaded 24084 records from data/test.json
Wrote 131921 records to data/document_collection.json
[
  {
    "para_id": "New_Hampshire_18070804_1",
    "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, or the American Turf in Tripoli,\u201d VOTARIES of Freedom, arm! The British Lion roars! Legions of Valor, take th\u2019 alarm\u2014; Rash, rush to guard our shores! Behold the horrid deed\u2014 Your brethren gasping lie! Beneath a tyrant\u2019s hand they bleed\u2014 They groan\u2014they faint\u2014they die. Veterans of seventy-six, Awake the slumbering sword;\u2014 Hearts of your murderous foes transfix\u2014 'Tis vengeance gives the word. Remember Lexington, And Bunker\u2019s tragic hill; \u201cThe same who spilt your blood there

## You should check that the collection you have matches that of the paper!

In [5]:
import pandas as pd
# Sanity check: print the (rows, columns) shape of every split so it can be
# compared with the figures reported for the dataset.
for path in inputs:
    # pd.read_json parses the file on its own. The extra json.load pass that
    # used to precede it parsed every split a second time (including the
    # 1.3 GB training split) and its result was never used.
    df_check = pd.read_json(path)
    print(f'Shape of {path}: {df_check.shape}')

Shape of data/train.json: (439302, 11)
Shape of data/validation.json: (24111, 11)
Shape of data/test.json: (24084, 11)


The dimensions match the ones of the paper at https://github.com/DataScienceUIBK/ChroniclingAmericaQA

# Create the Test Queries Data Structure

We keep only the first 10,000 queries, because processing the full test set causes memory errors in the free Colab version.

To be comparable, please keep the top 10,000 queries for evaluation.

In [6]:
import json
import re
import unicodedata
import string

input_file = "data/test.json"
output_file = "data/test_queries.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    """Normalize a question string for use as a retrieval query.

    The text is NFKC-normalized, every ASCII punctuation character is turned
    into a space, runs of whitespace are collapsed to one space and the
    result is stripped. Anything that is not a ``str`` yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    normalized = unicodedata.normalize("NFKC", text)
    # Replace each ASCII punctuation mark by a single space.
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    spaced = normalized.translate(punct_to_space)
    # Collapse multiple whitespace characters and trim both ends.
    return re.sub(r"\s+", " ", spaced).strip()

# Extract the id and the cleaned question text of every test item.
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]

# Sort by query_id. Ids made only of digits are compared as integers; any other
# id is compared as a string. The ids in this dataset look like "test_1", so
# they fall in the second case and end up in LEXICOGRAPHIC order
# (test_1, test_10, test_100, ...), as the sample output below shows.
# NOTE(review): a mix of all-digit and other ids would make the key return both
# int and str values, which sorted() cannot compare.
queries = sorted(queries, key=lambda x: int(x["query_id"]) if str(x["query_id"]).isdigit() else x["query_id"])

# Keep only the first 10,000 queries of the ordering produced above.
queries = queries[:10000]

# Save the selected queries as a new JSON file.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

Saved 10000 entries to data/test_queries.json
[
  {
    "query_id": "test_1",
    "question": "How many lots did Thomas Peirce have"
  },
  {
    "query_id": "test_10",
    "question": "Who gave Hamilton the substance of what he had proposed on the part of General Hamilton"
  },
  {
    "query_id": "test_100",
    "question": "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
  }
]


# Create the Qrels for the test set

In [7]:
input_file = "data/test.json"
qrels_file = "data/test_qrels.json"
answers_file = "data/test_query_answers.json"

# Read the full test split.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# One relevance judgement per test item (query_id, iteration=0, para_id,
# relevance=1). The query_answers record is the same judgement extended with
# the answer and org_answer fields.
qrels = []
query_answers = []
for item in data:
    judgement = {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
    }
    qrels.append(judgement)
    query_answers.append({
        **judgement,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", ""),
    })

# Write both files with identical JSON settings.
for target_path, records in ((qrels_file, qrels), (answers_file, query_answers)):
    with open(target_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

Saved 24084 entries to data/test_qrels.json
Saved 24084 entries to data/test_query_answers.json
Sample qrels entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1}
Sample query_answers entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1, 'answer': '183', 'org_answer': '183'}


# Retrieval

### Extract data from json files

In [8]:
input_files = ['data/document_collection.json', 'data/test.json', 'data/test_qrels.json', 'data/test_queries.json', 'data/test_query_answers.json', 'data/train.json', 'data/validation.json']

# Load every JSON file into a DataFrame, keyed by its path.
# pd.read_json opens and parses the file itself; the json.load pass that used
# to precede it parsed each file a second time and its result was never used.
dataframes = {}
for input_file in input_files:
    dataframes[input_file] = pd.read_json(input_file)

Let's visualize data and analyze them

In [9]:
dataframes['data/document_collection.json']

Unnamed: 0,para_id,context,raw_ocr,publication_date
0,New_Hampshire_18070804_1,Aiscellaneous Repository. From the Albany Regi...,fAiscellancous Bepogitory.\n. dvom the Albany ...,1807-08-04
1,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a\nbove the rest of his fellow morta...,1807-08-04
2,New_Hampshire_18070804_5,"At Westmoreland, Mrs. Sally Lincoln, wife of M...","At Weltmoreland, Mrs. Sally Liacoln, wife\n~of...",1807-08-04
3,New_Hampshire_18070804_8,Upon the correction of this remedy the stomach...,tion of this remedy the flomach is invariably\...,1807-08-04
4,New_Hampshire_18070804_9,"Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR...","*°\n, ALSO POR SALE AS ABOVE,\no NEW-GEODS, -\...",1807-08-04
...,...,...,...,...
131916,Nebraska_19130626_7,"""Did you?” said Fran politely. “So father grad...","""Did you?” said Fran politely. “So\nfather gra...",1913-06-26
131917,Indiana_19170719_6,"When a boy begins to learn a trade, the ""play ...","When a boy begins to learn a trade, the ""play\...",1917-07-19
131918,Kentucky_19110727_5,It is situated in the valley of the great many...,It is ftltuaiod In tho val\nley of Uio great n...,1911-07-27
131919,Rhode_Island_19140626_10,"A PRACTICAL LESSON IN AGRICULTURE, MAY 1708, T...","A PRACITICA, ZESSQN 2V AGRICTZ,\nLEFY 170 RIGH...",1914-06-26


In [10]:
dataframes['data/train.json']

Unnamed: 0,query_id,question,answer,org_answer,para_id,context,raw_ocr,publication_date,trans_que,trans_ans,url
0,train_1,"Who is the author of the book, ""Horrors of Sla...",WILLIAM RAY,WILLIAM RAY,New_Hampshire_18070804_1,Aiscellaneous Repository. From the Albany Regi...,fAiscellancous Bepogitory.\n. dvom the Albany ...,1807-08-04,0,0,https://chroniclingamerica.loc.gov/lccn/sn8302...
1,train_2,Who was the Grand Officer of the Legion of Honor?,de Rosemberg,de Rosemberg,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a\nbove the rest of his fellow morta...,1807-08-04,0,0,https://chroniclingamerica.loc.gov/lccn/sn8302...
2,train_3,What country was Gen. de Rosemberg formerly Ma...,France,France,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a\nbove the rest of his fellow morta...,1807-08-04,0,0,https://chroniclingamerica.loc.gov/lccn/sn8302...
3,train_4,What was the title of Rev. Joseph McKean?,de Rosemberg,de Rosemberg,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a\nbove the rest of his fellow morta...,1807-08-04,0,0,https://chroniclingamerica.loc.gov/lccn/sn8302...
4,train_5,Who was the wife of Mr. Spencer L. at Westmore...,Sally Lincoln,Sally Lincoln,New_Hampshire_18070804_5,"At Westmoreland, Mrs. Sally Lincoln, wife of M...","At Weltmoreland, Mrs. Sally Liacoln, wife\n~of...",1807-08-04,0,0,https://chroniclingamerica.loc.gov/lccn/sn8302...
...,...,...,...,...,...,...,...,...,...,...,...
439297,train_439298,Who is the pastor of First Baptist Church?,W. R. Bradshaw,W. R. Bradshaw,North_Carolina_19181130_6,"He went to school in France, received his comm...","He went\nto school in' France, received his\nc...",1918-11-30,0,0,https://chroniclingamerica.loc.gov/lccn/sn9106...
439298,train_439299,What day of the week is the school for Arthur ...,"November 24,1918",Sunday,North_Carolina_19181130_6,"He went to school in France, received his comm...","He went\nto school in' France, received his\nc...",1918-11-30,0,1,https://chroniclingamerica.loc.gov/lccn/sn9106...
439299,train_439300,How long has CHICHESTER'S DIAMOND BRAND PILLS ...,25 years,25 years,North_Carolina_19181130_7,There are plenty of such gifts; you'll find lo...,There are plenty of such gifts; yon'll find\nl...,1918-11-30,0,0,https://chroniclingamerica.loc.gov/lccn/sn9106...
439300,train_439301,On what day of the week is Trinity Lutheran sc...,"November 24,1918",Sunday,North_Carolina_19181130_7,There are plenty of such gifts; you'll find lo...,There are plenty of such gifts; yon'll find\nl...,1918-11-30,0,1,https://chroniclingamerica.loc.gov/lccn/sn9106...


**NOTE: in `data/document_collection.json` the rows are already deduplicated**

### _Preprocessing_

#### **Linguistic Processing**

##### Normalization
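In this first step we only apply Unicode (NFKC) normalization, strip HTML-like tags and collapse whitespace. Lowercasing and the removal of special characters are deferred to the 2nd normalization step, which runs after NER, so that the NER model still sees the original casing and punctuation.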

--> 1st step normalization

In [11]:
def normalize_text1(text):
    """First normalization pass: Unicode NFKC, tag stripping, whitespace collapse.

    Case and punctuation are left untouched in this pass (the corresponding
    lines are kept below, disabled). Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        unified = unicodedata.normalize('NFKC', text)
        #unified = unified.lower()
        # Replace anything that looks like an HTML/XML tag by a space.
        untagged = re.sub(r'<[^>]+>', ' ', unified)
        # untagged = re.sub(r'[^a-z0-9\s]', '', untagged)
        # Collapse runs of whitespace and trim both ends.
        text = re.sub(r'\s+', ' ', untagged).strip()
    return text

# If the NER step is ever dropped, uncomment the disabled lines in the
# normalize_text1 function above.

# Apply the first normalization pass to both text columns of the collection.
docColl = dataframes['data/document_collection.json']
docColl_contNorm1 = docColl['context'].apply(normalize_text1)
docColl_ocrNorm1 = docColl['raw_ocr'].apply(normalize_text1)
# Work on a copy so the original frame stays available for comparison.
docColl_Norm1 = docColl.copy()

In [12]:
docColl_Norm1['context'] = docColl_contNorm1
docColl_Norm1['raw_ocr'] = docColl_ocrNorm1
docColl_Norm1.head(25)

Unnamed: 0,para_id,context,raw_ocr,publication_date
0,New_Hampshire_18070804_1,Aiscellaneous Repository. From the Albany Regi...,fAiscellancous Bepogitory. . dvom the Albany R...,1807-08-04
1,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a bove the rest of his fellow mortal...,1807-08-04
2,New_Hampshire_18070804_5,"At Westmoreland, Mrs. Sally Lincoln, wife of M...","At Weltmoreland, Mrs. Sally Liacoln, wife ~of ...",1807-08-04
3,New_Hampshire_18070804_8,Upon the correction of this remedy the stomach...,tion of this remedy the flomach is invariably ...,1807-08-04
4,New_Hampshire_18070804_9,"Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR...","*° , ALSO POR SALE AS ABOVE, o NEW-GEODS, - ST...",1807-08-04
5,New_Hampshire_18070804_13,At a meeting of the committee of the : subscri...,At a meeting of the committee of the : fcfijcr...,1807-08-04
6,New_Hampshire_18070804_14,Notice is hereby given to the proprietors of t...,N OTICE is hereby given to these propri- X eto...,1807-08-04
7,New_Hampshire_18070804_16,"‘ . LO, L, George Frost, Esq. X 30 I 14 2 1 3 ...","‘ . LO, L, Gegrge Frofl, Efg. X 30 ‘I 14 2 1 3...",1807-08-04
8,New_Hampshire_18070804_18,Swedes do. 150 bbls. fresh FLOUR. MOLASSES and...,Swedes do. 150 bbls. freth FLOUR. MOI.ASSES an...,1807-08-04
9,New_Hampshire_18060715_1,The Portsmouthaathinna in the Miscellaneous Re...,P tsmnatahiniinni il z Miscellaneous Repositor...,1806-07-15


In [13]:
docColl['context'].compare(docColl_Norm1['context'])

Unnamed: 0,self,other
2,"At Westmoreland, Mrs. Sally Lincoln, wife of M...","At Westmoreland, Mrs. Sally Lincoln, wife of M..."
6,Notice is hereby given to the proprietors of t...,Notice is hereby given to the proprietors of t...
7,"‘ . LO, L, George Frost, Esq. X 30 I 14 2 1 3 ...","‘ . LO, L, George Frost, Esq. X 30 I 14 2 1 3 ..."
9,The Portsmouthaathinna in the Miscellaneous Re...,The Portsmouthaathinna in the Miscellaneous Re...
16,"ing, dated June 1st, 1892, agreed to sell to ...","ing, dated June 1st, 1892, agreed to sell to J..."
...,...,...
131862,"Hooray!"" Thus Jerome S. McWade, in an after-d...","Hooray!"" Thus Jerome S. McWade, in an after-di..."
131877,This Is what our Lord has provided for all his...,This Is what our Lord has provided for all his...
131880,And there are teachers who are not true teache...,And there are teachers who are not true teache...
131881,Pursuant to a decree entered in the above styl...,Pursuant to a decree entered in the above styl...


In [14]:
print(docColl['context'].iloc[2])
print(docColl_Norm1['context'].iloc[2])

At Westmoreland, Mrs. Sally Lincoln, wife of Mr. Spencer L. aged 28.  At Henrico, Mrs. Polly Adams, consort On Saturday, the 11th ult. Mr. Joseph Meyer, of Hampstead, was found dead in the road, (his horse standing by him) when oy e g e SMITH & RUST Pocket Book Lost.  "LOST last Wednesday between 7 and 8 o’clock in the afternoon, either in the Globe Tavern at the Plains, or on the road leading from thence to Portsmouth, a new Red Morocco Pocket Book ; containing some Money, Notes of hand payable to the Subscriber, also, New Hampshire Fire and Marine Certificates, and other papers valuable to none but to the owner—Whoever shall find said Pocket Book, and re- turn it with its contents, with or without the money shall be handsomely rewarded, and the thanks of their humble servant EDWARD. PARRY.  TO BE LET, That Fireproof Store lately improved by Mr. Benjamin Swett, which must be allowed to be the best stand for business, either for English or West- India Goods in this town—Inquire of EDWA

##### NER
We want to identify named-entities before lemmatizing the text, so that we do not lose any entity by "shrinking" words to their base forms.

In [25]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, pipeline
from tqdm import tqdm

# Settings to run on an accelerator when one is available: CUDA first (e.g. on
# Colab), then Apple MPS, otherwise the CPU.
# NOTE(review): some PyTorch versions may read PYTORCH_ENABLE_MPS_FALLBACK at
# import time - confirm it still takes effect when set after `import torch`.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; consider pinning a reviewed revision of the model.
ner_pipeline = pipeline(
    model=MODEL_NAME,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device=device)

def run_impresso_ner(text_series):
    """Run the NER pipeline on every text and return one result per input.

    Args:
        text_series: iterable of texts (typically a pandas Series).

    Returns:
        A list with exactly one entry per input, in input order: whatever
        ``ner_pipeline`` returned for that text, or ``[]`` when the value is
        missing (None/NaN), blank, or the pipeline raised an exception.
    """
    results = []
    for text in tqdm(text_series):  # tqdm shows progress during the hours-long run
        # Missing values must not reach the model: str() would turn them into
        # the literal texts "None" / "nan" and those would be tagged.
        if text is None or (isinstance(text, float) and text != text):
            results.append([])
            continue

        text_str = str(text)
        if not text_str.strip():  # empty / whitespace-only texts
            results.append([])
            continue

        words = text_str.split()

        try:
            entities = ner_pipeline(text_str, tokens=words)
            results.append(entities)
        except Exception as e:
            # Deliberate best-effort: one failing document must not abort a
            # run that takes hours; it is reported and recorded as "no entities".
            print(f"Errore su un documento: {e}")
            results.append([])
    return results

# Cache of the NER results: the run recorded in the output below took roughly
# four hours per column, so the results are computed once and reloaded from
# disk afterwards.
OUTPUT_FILE = "data/ner_results_cache.parquet"
if os.path.exists(OUTPUT_FILE):
    cached_data = pd.read_parquet(OUTPUT_FILE)

    # NOTE(review): nothing checks that the cached rows belong to the current
    # collection - confirm the cache was produced from the same frame.
    # NOTE(review): values read back from parquet may not be plain Python
    # lists - confirm that downstream isinstance(..., list) checks still pass.
    docColl_Norm1['ner_entities_context'] = cached_data['ner_entities_context']
    docColl_Norm1['ner_entities_ocr'] = cached_data['ner_entities_ocr']

else:
    # NER on the corrected text (context)
    docColl_Norm1['ner_entities_context'] = run_impresso_ner(docColl_Norm1['context'])
    # NER on the raw OCR text
    docColl_Norm1['ner_entities_ocr'] = run_impresso_ner(docColl_Norm1['raw_ocr'])
    # save both result columns to the external cache file
    docColl_Norm1[['ner_entities_context', 'ner_entities_ocr']].to_parquet(OUTPUT_FILE)

Device set to use mps


File 'data/ner_results_cache.parquet' non trovato. Inizio elaborazione NER (operazione lunga)...
Analisi 'context'...


100%|██████████| 131921/131921 [3:57:25<00:00,  9.26it/s]   


Analisi 'raw_ocr'...


100%|██████████| 131921/131921 [3:55:02<00:00,  9.35it/s]   


Fine! I risultati sono stati salvati in 'data/ner_results_cache.parquet'.


In [35]:
docColl_Norm1

Unnamed: 0,para_id,context,raw_ocr,publication_date,ner_entities_context,ner_entities_ocr
0,New_Hampshire_18070804_1,Aiscellaneous Repository. From the Albany Regi...,fAiscellancous Bepogitory. . dvom the Albany R...,1807-08-04,"[{'type': 'org', 'confidence_ner': 0.47, 'inde...","[{'type': 'pers', 'confidence_ner': 0.66, 'ind..."
1,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...,Surely he a bove the rest of his fellow mortal...,1807-08-04,"[{'type': 'pers', 'confidence_ner': 0.99, 'ind...","[{'type': 'pers', 'confidence_ner': 0.98, 'ind..."
2,New_Hampshire_18070804_5,"At Westmoreland, Mrs. Sally Lincoln, wife of M...","At Weltmoreland, Mrs. Sally Liacoln, wife ~of ...",1807-08-04,"[{'type': 'loc', 'confidence_ner': 0.96, 'inde...","[{'type': 'loc', 'confidence_ner': 0.9, 'index..."
3,New_Hampshire_18070804_8,Upon the correction of this remedy the stomach...,tion of this remedy the flomach is invariably ...,1807-08-04,"[{'type': 'loc', 'confidence_ner': 0.83, 'inde...","[{'type': 'loc', 'confidence_ner': 0.92, 'inde..."
4,New_Hampshire_18070804_9,"Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR...","*° , ALSO POR SALE AS ABOVE, o NEW-GEODS, - ST...",1807-08-04,"[{'type': 'org', 'confidence_ner': 0.47, 'inde...","[{'type': 'org', 'confidence_ner': 0.38, 'inde..."
...,...,...,...,...,...,...
131916,Nebraska_19130626_7,"""Did you?” said Fran politely. “So father grad...","""Did you?” said Fran politely. “So father grad...",1913-06-26,"[{'type': 'pers', 'confidence_ner': 0.94, 'ind...","[{'type': 'pers', 'confidence_ner': 0.94, 'ind..."
131917,Indiana_19170719_6,"When a boy begins to learn a trade, the ""play ...","When a boy begins to learn a trade, the ""play ...",1917-07-19,"[{'type': 'loc', 'confidence_ner': 0.96, 'inde...","[{'type': 'loc', 'confidence_ner': 0.95, 'inde..."
131918,Kentucky_19110727_5,It is situated in the valley of the great many...,It is ftltuaiod In tho val ley of Uio great ni...,1911-07-27,"[{'type': 'loc', 'confidence_ner': 0.67, 'inde...","[{'type': 'loc', 'confidence_ner': 0.83, 'inde..."
131919,Rhode_Island_19140626_10,"A PRACTICAL LESSON IN AGRICULTURE, MAY 1708, T...","A PRACITICA, ZESSQN 2V AGRICTZ, LEFY 170 RIGHT...",1914-06-26,[],"[{'type': 'org', 'confidence_ner': 0.29, 'inde..."


In [None]:
docColl_ner = docColl_Norm1.copy()
docColl_ner[['context', 'raw_ocr', 'ner_entities_context', 'ner_entities_ocr']]

--> 2nd step normalization

In [None]:
def normalize_text2(text):
    """Second normalization pass: lowercase and keep only ``a-z``, ``0-9`` and whitespace.

    All other characters are deleted (not replaced by a space), then runs of
    whitespace are collapsed and the result is stripped. Non-string input is
    returned unchanged.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Delete punctuation and every other character outside a-z / 0-9 / whitespace.
        alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
        # Collapse the whitespace again, since deletions can leave gaps.
        text = re.sub(r'\s+', ' ', alnum_only).strip()
    return text

# DataFrame.apply hands each *column* (a Series) to the function, and
# normalize_text2 returns any non-string argument unchanged, so calling
# .apply(normalize_text2) directly on the two-column frame normalized nothing.
# Map the function over the cells of each column instead; the result is still
# a DataFrame with the 'context' and 'raw_ocr' columns.
docColl_Norm2 = docColl_ner[['context', 'raw_ocr']].apply(
    lambda column: column.map(normalize_text2)
)

##### Lemmatization
Lemmatization is placed here to standardize the wording of the documents by reducing each word to its base form.

In [None]:
import spacy

# Load the spaCy English model once per kernel. The parser and NER components
# are disabled because only lemmas are read below.
try:
    if 'nlp' not in locals():
        nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
except OSError:
    # An OSError from spacy.load is treated as "model not installed":
    # download it and load again.
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Take the data from the last step (docColl_Norm2) and copy it into the new
# dataframe 'docColl_Lemm'.
if 'docColl_Norm2' not in locals():
    # Fail fast: carrying on would either hit a NameError further down or
    # silently reuse a stale 'docColl_Lemm' left over in the kernel.
    raise NameError("Errore: 'docColl_Norm2' non trovato in memoria.")
docColl_Lemm = docColl_Norm2.copy()
print(" DataFrame sorgente: 'docColl_Norm2' trovato e copiato in 'docColl_Lemm'.")

columns_to_process = ['context', 'raw_ocr']

print(f" Avvio lemmatizzazione sulle colonne: {columns_to_process}")

for col in columns_to_process:
    if col in docColl_Lemm.columns:
        print(f"\n--- Elaborazione colonna: '{col}' ---")

        # Missing values become empty strings; astype(str) alone would turn
        # them into the literal token "nan".
        texts = docColl_Lemm[col].fillna("").astype(str).tolist()

        print(f"Processando {len(texts)} documenti da 'docColl_Norm2'...")

        # nlp.pipe streams the texts in batches; n_process=-1 asks spaCy to
        # use all available CPU cores.
        processed_texts = [
            " ".join(token.lemma_ for token in doc if not token.is_space)
            for doc in nlp.pipe(texts, batch_size=2000, n_process=-1)
        ]

        # The lemmatized text is stored next to its source column.
        new_col_name = f"{col}_lemma"
        docColl_Lemm[new_col_name] = processed_texts

        print(f" Finito! Creata colonna: {new_col_name}")
    else:
        print(f" Errore: Colonna '{col}' non trovata nel dataframe.")

print("\nDataFrame finale: docColl_Lemm")
print(docColl_Lemm[['context', 'context_lemma', 'raw_ocr', 'raw_ocr_lemma']].head())

##### N-gram based tokenization
It is important to place this step after normalization. The tokenization can include a NER-aware part, so that it is also entity-guided.

In [None]:

# TODO: extend this so that the RAW_OCR column is processed as well.
def ner_aware_ngram_tokenizer(row, text_col='lemmatized_context', ner_col='ner_entities', n=2):
    """Build word n-grams from a row's text, gluing multi-word entities first.

    Steps:
      1. Take the (lemmatized) text from ``row[text_col]``.
      2. Use the NER entities in ``row[ner_col]`` to glue compound names into
         a single token (``New York -> new_york``).
      3. Generate the n-grams from the modified text.

    Args:
        row: mapping-like object with a ``get`` method (dict or pandas Series).
        text_col: key of the text to tokenize.
        ner_col: key of the entity container. Each entity is a dict whose text
            is read from the first of the keys ``word``, ``surface``,
            ``entity_group``, ``entity`` that holds a non-empty string.
        n: number of tokens per n-gram.

    Returns:
        A list of strings, each made of ``n`` consecutive tokens joined by a
        space. The list is empty when the text is missing or blank, or has
        fewer than ``n`` tokens.
    """
    text = row.get(text_col, "")
    entities = row.get(ner_col, [])

    if not isinstance(text, str) or not text.strip():
        return []

    # Accept any list-like container of entities (list, tuple, array, ...).
    # Missing values (None, NaN) and scalars are treated as "no entities".
    if isinstance(entities, (str, bytes, dict)) or not hasattr(entities, "__iter__"):
        entities = []

    # --- Entity glueing ---
    text_keys = ('word', 'surface', 'entity_group', 'entity')
    entity_texts = []
    for ent in entities:
        if not isinstance(ent, dict):
            continue
        for key in text_keys:
            value = ent.get(key)
            if isinstance(value, str) and value.strip():
                entity_texts.append(value)
                break

    # Longest entities first, to avoid partial substitutions (gluing
    # "new york" before "new york city" would block the longer match).
    for ent_text in sorted(entity_texts, key=len, reverse=True):
        clean_ent = ent_text.lower().strip()
        if " " in clean_ent:
            merged_ent = clean_ent.replace(" ", "_")
            text = text.replace(clean_ent, merged_ent)

    # --- Standard tokenization ---
    tokens = text.split()

    # --- N-gram generation ---
    if len(tokens) < n:
        return []

    # For n=2 (bigrams) this is zip(tokens, tokens[1:]).
    n_grams_tuples = zip(*[tokens[i:] for i in range(n)])

    # Join each tuple into one string: ("new_york", "is") -> "new_york is"
    return [" ".join(ngram) for ngram in n_grams_tuples]

# Driver for the entity-aware n-gram tokenization.
# NOTE(review): the frame stored under target_key is the one read from JSON,
# with the columns para_id, context, raw_ocr and publication_date. The
# lemmatized text is written to docColl_Lemm as '<column>_lemma' and the NER
# results to docColl_Norm1 as 'ner_entities_context' / 'ner_entities_ocr', so
# on a fresh run the column check below fails and only the error message is
# printed. Confirm which frame and which column names are intended here.
target_key = 'data/document_collection.json'
text_column = 'lemmatized_context'
ner_column = 'ner_entities'

if target_key in dataframes:
    print(f"Initiating N-gram Tokenization (Entity-Aware) on: {target_key}...")
    df = dataframes[target_key]

    if text_column in df.columns and ner_column in df.columns:

        # Size of the n-grams to generate (2 = bigrams).
        N_VALUE = 2

        print(f"Generating {N_VALUE}-grams...")

        # Row-wise apply: the tokenizer needs both the text and the entity
        # column of the same row.
        df['ngrams'] = df.apply(
            lambda row: ner_aware_ngram_tokenizer(row, text_col=text_column, ner_col=ner_column, n=N_VALUE),
            axis=1
        )

        # df is the object already stored under target_key, so the new column
        # is added to that shared frame.
        dataframes[target_key] = df

        print(df[['lemmatized_context', 'ngrams']].head())

    else:
        print(f"Error: Columns '{text_column}' or '{ner_column}' missing. Check names.")
else:
    print(f"Error: {target_key} not found.")

This step should produce the dataframe called `docColl_tok`.

### _Multi-field Indexing_

In [None]:
from collections import defaultdict

def create_multi_field_index(df, raw_col='raw_ocr', clean_col='context', entity_col='ner_entities'):
    """Build an in-memory inverted index with three fields.

    The returned index has the shape
    ``{field_name: {term: {doc_id: frequency}}}`` with the fields:

      * ``"raw"``      - lowercased whitespace tokens of ``raw_col``;
      * ``"clean"``    - lowercased whitespace tokens of ``clean_col``;
      * ``"entities"`` - one term per entity in ``entity_col``, lowercased,
        with inner spaces replaced by underscores.

    Args:
        df: DataFrame with one document per row; the row index is used as the
            document id.
        raw_col: column holding the raw OCR text.
        clean_col: column holding the corrected text.
        entity_col: column holding, per row, a list-like of entity dicts. The
            entity text is the first truthy value among the keys ``word``,
            ``surface``, ``entity_group`` and ``entity``.

    Returns:
        ``(inverted_index, num_docs)`` where ``num_docs`` is ``len(df)``.
    """
    inverted_index = {
        "raw": defaultdict(lambda: defaultdict(int)),
        "clean": defaultdict(lambda: defaultdict(int)),
        "entities": defaultdict(lambda: defaultdict(int))
    }

    num_docs = len(df)

    for idx, row in df.iterrows():
        doc_id = idx  # Using dataframe index as Document ID

        # --- Fields 1 and 2: plain text, tokenized on whitespace ---
        for field, column in (("raw", raw_col), ("clean", clean_col)):
            for token in str(row.get(column, '')).lower().split():
                inverted_index[field][token][doc_id] += 1

        # --- Field 3: entities ---
        entities_list = row.get(entity_col, [])
        # Accept any list-like container; missing values and scalars mean
        # "no entities" for this document.
        if isinstance(entities_list, (str, bytes, dict)) or not hasattr(entities_list, "__iter__"):
            entities_list = []
        for ent in entities_list:
            if not isinstance(ent, dict):
                continue
            ent_text = (ent.get('word') or ent.get('surface')
                        or ent.get('entity_group') or ent.get('entity'))
            if isinstance(ent_text, str) and ent_text.strip():
                term = ent_text.lower().strip().replace(" ", "_")
                inverted_index["entities"][term][doc_id] += 1

    return inverted_index, num_docs

# Execute Indexing
# NOTE(review): this frame is the collection as loaded from JSON; the NER
# results were added to docColl_Norm1 (as 'ner_entities_context' /
# 'ner_entities_ocr'), not to this frame, so the "entities" field of the index
# is expected to stay empty here - confirm which frame should be indexed.
df_target = dataframes['data/document_collection.json']
my_index, total_docs = create_multi_field_index(df_target)

--> Indexing with PyTerrier using a generator

In [None]:
def createGenerator(df, context=True):
    """Stream the document collection as dicts for ``indexer.index(...)``.

    Each NER cell is assumed to hold a list of dictionaries (one per entity).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``para_id`` plus, depending on ``context``, either
        ``context`` / ``ner_entities_context`` or ``raw_ocr`` / ``ner_entities_ocr``.
    context : bool, default True
        ``True`` streams the ``context`` text with its entities,
        ``False`` streams the ``raw_ocr`` text with its entities.

    Yields
    ------
    dict
        ``docno`` (``para_id`` as string), ``text``, ``entities`` (searchable,
        space-joined entity surfaces) and ``entity_json`` (all entity metadata
        serialised as a JSON string, not a dict object).
    """
    # Both variants share the same logic and only differ in the columns read.
    if context:
        text_col, entity_col = 'context', 'ner_entities_context'
    else:
        text_col, entity_col = 'raw_ocr', 'ner_entities_ocr'

    offset_keys = ('lOffset', 'rOffset')

    for _, row in df.iterrows():
        entities = row[entity_col]
        # A missing NER result (None / NaN) means "no entities", not a crash.
        if entities is None or (isinstance(entities, float) and entities != entities):
            entities = []

        # Drop the character offsets; keep every other entity attribute.
        clean_ents = [
            {k: v for k, v in ent.items() if k not in offset_keys}
            for ent in entities
            if isinstance(ent, dict)
        ]

        # Only 'surface' is searchable for now. 'name' and 'title' are left out
        # on purpose: still to be decided whether they should be searchable,
        # since 'surface' already holds the text the entity is attached to.
        surfaces = [str(e['surface']) for e in clean_ents if e.get('surface')]
        ent_text = " ".join(surfaces)

        # All entity metadata becomes one JSON string.
        meta_json = json.dumps(clean_ents)

        yield {
            "docno": str(row['para_id']),
            "text": row[text_col],
            "entities": ent_text,  # searchable entities
            "entity_json": meta_json}

# Directories for the two index variants; the indexing cells delete these
# directories (if present) before building a new index.
contextIndex_path = 'data/docColl_context-index'
ocrIndex_path = 'data/docColl_ocr-index'

In [None]:
# Start from a clean directory so re-running the cell rebuilds the index.
if os.path.exists(contextIndex_path):
    shutil.rmtree(contextIndex_path)

# Write the index into the directory that was just cleaned (and that the
# statistics cell reports), not into a separate hard-coded folder.
# NOTE(review): `meta` is passed as a set, so no explicit metadata lengths are
# given; confirm the default length is enough for `entity_json`.
indexerCont = pt.IterDictIndexer(
    contextIndex_path,
    fields=['text', 'entities'],
    meta={'docno', 'entity_json'})

# Index the corrected-text ("context") variant of the collection.
indexrefCont = indexerCont.index(createGenerator(docColl_tok, context=True))

In [None]:
# Start from a clean directory so re-running the cell rebuilds the index.
if os.path.exists(ocrIndex_path):
    shutil.rmtree(ocrIndex_path)

# Write the OCR index into its own directory; sharing one hard-coded folder
# with the context index would make the two indexes collide.
# NOTE(review): `meta` is passed as a set, so no explicit metadata lengths are
# given; confirm the default length is enough for `entity_json`.
indexerOCR = pt.IterDictIndexer(
    ocrIndex_path,
    fields=['text', 'entities'],
    meta={'docno', 'entity_json'})

# Index the raw-OCR variant of the collection.
indexrefOCR = indexerOCR.index(createGenerator(docColl_tok, context=False))

#### Statistics about the indexed documents

In [None]:
# Open the context index and report its collection-level statistics.
indexCont = pt.IndexFactory.of(indexrefCont)
stats = indexCont.getCollectionStatistics()

print('Index folder:', contextIndex_path)
for label, value in (
    ('Number of documents:', stats.getNumberOfDocuments()),
    ('Number of postings:', stats.getNumberOfPostings()),
    ('Number of tokens:', stats.getNumberOfTokens()),
    ('Number of unique terms:', stats.getNumberOfUniqueTerms()),
    ('Average document length:', stats.getAverageDocumentLength()),
):
    print(label, value)

In [None]:
# Open the raw-OCR index and report its collection-level statistics.
indexOCR = pt.IndexFactory.of(indexrefOCR)
stats = indexOCR.getCollectionStatistics()
# Report the OCR index folder (not the context one) next to the OCR statistics.
print('Index folder:', ocrIndex_path)
print('Number of documents:', stats.getNumberOfDocuments())
print('Number of postings:', stats.getNumberOfPostings())
print('Number of tokens:', stats.getNumberOfTokens())
print('Number of unique terms:', stats.getNumberOfUniqueTerms())
print('Average document length:', stats.getAverageDocumentLength())

#### Query analysis

In [None]:
# Preview the first 10 rows of the queries frame.
display(queries.head(10))

--> TODO: write a comment on the query analysis

#### Qrels analysis

In [None]:
# Show up to 10 random qrels rows. The fixed seed keeps the preview identical
# across "Restart & Run All", and the size cap avoids an error on small frames.
display(qrels.sample(n=min(10, len(qrels)), random_state=42))

In [None]:
# Summary statistics for the relevance judgments (qrels).
import matplotlib.pyplot as plt  # plotting library

# Count how many relevance assessments each query has.
counts = qrels.groupby("query_id")["para_id"].count()
print('Overall Statistics')
print(counts.describe())  # summary of the count distribution

# Histogram: how many relevance assessments each query received.
fig, ax = plt.subplots()
counts.plot(kind='hist', ax=ax)
ax.set_xlabel('Number of relevance assessments per query')
ax.set_ylabel('Number of queries')
ax.set_title('Relevance assessment distribution')
plt.show()

# Jupyter only echoes the last expression of a cell, so intermediate results
# have to be displayed explicitly or they are silently discarded.

# Queries with the highest number of relevance assessments.
display(counts.sort_values(ascending=False).head())

# How many times each relevance label occurs overall.
display(qrels['relevance'].value_counts())

# Histogram of the relevance labels.
fig, ax = plt.subplots()
qrels['relevance'].plot(kind='hist', ax=ax)
ax.set_xlabel('Relevance score')
ax.set_ylabel('Frequency')
ax.set_title('Relevance score distribution')
plt.show()

--> TODO: write a comment on the qrels analysis

## Phase I - Topical relevance-based retrieval

### **BM25 Retrieval from raw OCR (baseline 1)**

In [None]:
# BM25 retriever over the raw-OCR index (baseline 1).
# Open question: should this be a per-field model such as BM25F, so that the
# 'text' and 'entities' fields can be weighted separately? (Probably yes.)
bm25ocr = pt.terrier.Retriever(indexrefOCR, wmodel='BM25')

# transform() needs the topics to retrieve for; calling it with no argument
# raises a TypeError.
# NOTE(review): presumably `queries` must expose the `qid` and `query` columns
# PyTerrier expects; confirm or rename the columns before running.
res_bm25ocr = bm25ocr.transform(queries)

### **BM25 Retrieval from corrected OCR (baseline 2)**

### **BM25 Retrieval from both raw and corrected OCR using RRF formula (baseline 3)**