# Notebook Purpose

This notebook is what I used to create the dataset:
`dleemiller/irish_penny_journal`

[huggingface link](https://huggingface.co/datasets/dleemiller/irish_penny_journal)

It chunks and cleans the source text and tries to remove artifacts. Then it uses a basic DSPy pipeline to process the text into cleaned and modernized text representations. These are saved progressively as JSON files, and then converted to Parquet once processing is complete.

This dataset is used to train a classifier. Because the original and modernized texts have the same general meaning, it provides a nice balanced dataset for the style and lexicon to be learned.

In [None]:
import os
from dotenv import load_dotenv
import dspy
import random

# Credentials: the key is read from the APIKEY environment variable after
# load_dotenv() runs, so an OpenRouter key can live in a local .env file.
# Change these two lines if you would rather supply the key directly.
load_dotenv()
api_key = os.environ["APIKEY"]

# Model rotation: the pool of generator models.
# Add, remove or swap entries as you like; other sources (ollama, or whatever
# else litellm supports) can be used as well.
# Disabled: BASE_MODEL1 = "openrouter/google/gemini-2.5-flash-preview"
BASE_MODEL2 = "openrouter/google/gemini-2.0-flash-001"
BASE_MODEL3 = "openrouter/meta-llama/llama-4-maverick"
BASE_MODEL4 = "openrouter/qwen/qwen3-235b-a22b"
BASE_MODEL5 = "openrouter/meta-llama/llama-3.3-70b-instruct"

BASE_MODELS = [
    BASE_MODEL2,
    BASE_MODEL3,
    BASE_MODEL4,
    BASE_MODEL5,
]

# One LM client per model name, all created with the same settings.
# lm_list[k] corresponds to BASE_MODELS[k].
lm_list = [
    dspy.LM(
        model_name,
        api_key=api_key,
        temperature=0.8,
        cache=False,
        max_tokens=16000,
    )
    for model_name in BASE_MODELS
]

# Default LM for dspy; index 2 is BASE_MODEL4 in the list above.
dspy.configure(lm=lm_list[2])

# DSPy pipeline

Clean and modernize the text, given input from the Irish Penny Journal (IPJ).

In [None]:
# Signature for the clean-and-modernize task: one input field (the raw journal
# passage) and two output fields (the cleaned original and its modern rewrite).
class IrishPennyTranslator(dspy.Signature):
    # Raw passage as extracted from the source text.
    input_text: str = dspy.InputField(description="Text from Irish Penny Journal")
    # Same passage with whitespace/OCR/formatting artifacts removed.
    cleaned_input_text: str = dspy.OutputField(description="Cleaned original text")
    # Modern US English rendering of the passage.
    modernized_text: str = dspy.OutputField(description="Translation to modern (US) English")

# The task instructions are attached by assigning the class docstring after the
# class is created.
# NOTE(review): presumably dspy uses the signature's __doc__ as the prompt
# instructions, and the trailing "/no_think" looks like a model-specific switch
# to disable thinking output — confirm both against the dspy / model docs.
IrishPennyTranslator.__doc__ = """
You are given a passage of text from the Irish Penny Journal (ca 1840)

Clean the original text (fix whitespace or artifacts from OCR) or formatting issues.
Remove any references to images, pages from the paper. Only retain the prose content.

Then your task is to read the passage, and convert it to modern US English.

In order to do this:
- attempt to retain the original meaning
- rephrase in modern English
- restructure sentences to modern grammar
- apply or translate to modern concepts (eg telegraph -> iphone)

/no_think
"""

Use a `dspy.Refine` pipeline to retry when the cleaned text is not sufficiently similar to the original text.

In [None]:
import textdistance

def reward_fn(text, pred):
    """Score a prediction by how similar its cleaned text is to the input.

    Args:
        text: Mapping holding the original passage under the "input_text" key.
        pred: Prediction object exposing a ``cleaned_input_text`` attribute.

    Returns:
        The value of ``textdistance.jaccard.similarity`` for the two strings.
    """
    original_passage = text["input_text"]
    cleaned_passage = pred.cleaned_input_text
    return textdistance.jaccard.similarity(original_passage, cleaned_passage)

# Chain-of-thought module built on the translation signature.
penny = dspy.ChainOfThought(IrishPennyTranslator)

# Refine wrapper around `penny`: configured with N=3, scored by reward_fn,
# with a similarity threshold of 0.9.
best_of_3 = dspy.Refine(
    module=penny,
    N=3,
    reward_fn=reward_fn,
    threshold=0.9,
)

## Data Prep

A few generated scripts for processing the data into chunks.

In [None]:
from utils.clean import extract_penny_journal_segments, analyze_segments, print_sample_segments, save_segments_to_file

In [None]:
import glob

# Read every source text file.
# glob returns paths in arbitrary filesystem order, so the results are sorted
# to keep the segment order (and therefore segments[0]) reproducible.
texts = []
for filename in sorted(glob.glob("./original_texts/*.txt")):
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the source files are UTF-8 — confirm.
    with open(filename, "r", encoding="utf-8") as fh:
        texts.append(fh.read())

print("🔍 Processing Irish Penny Journal with strict quality filters...")

# Extract high-quality segments (50-150 words each) from every file and
# collect them into one flat list.
segments = []
for text in texts:
    segments.extend(extract_penny_journal_segments(text, min_words=50, max_words=150))

print(f"✅ Extracted {len(segments)} high-quality prose segments")

# Compute summary statistics for the extracted segments.
stats = analyze_segments(segments)

print("\n📊 Quality Analysis:")
print(f"   Total segments: {stats['total_segments']}")
print(f"   Total words: {stats['total_words']:,}")
print(f"   Average words per segment: {stats['avg_words_per_segment']:.1f}")
print(f"   Word count range: {stats['min_words']} - {stats['max_words']}")

# Segment counts per word-count bucket.
dist = stats['word_count_distribution']
print("\n📈 Word Count Distribution:")
print(f"   Under 50 words: {dist['under_50']} segments")
print(f"   50-100 words: {dist['50_to_100']} segments")
print(f"   100-150 words: {dist['100_to_150']} segments")
print(f"   Over 150 words: {dist['over_150']} segments")

# Quality verdict based on how many segments fall below the 50-word minimum.
too_short = dist['under_50']
if too_short == 0:
    verdict = "   ✅ Perfect: No segments under minimum threshold"
elif too_short < len(segments) * 0.05:
    verdict = "   ✅ Excellent: Less than 5% under threshold"
else:
    verdict = f"   ⚠️  Warning: {too_short} segments under 50 words"
print(verdict)

# Print a few example segments for a quick visual check.
print_sample_segments(segments)

# Persist the segments to disk.
save_segments_to_file(segments)
print("\n💾 Segments saved to 'penny_journal_segments.txt'")

# Dataset size summary. Despite the "projected_" names, these are the counts
# measured from the segments extracted above; nothing is extrapolated.
print("\n🔮 Measurement for 52 issues:")
projected_segments = len(segments)
projected_words = stats['total_words']

print(f"   Segments: {projected_segments:,}")
print(f"   Words: {projected_words:,}")

# Grade the dataset size against the 2500 / 1500 segment cut-offs.
if projected_segments >= 2500:
    size_verdict = "   ✅ Excellent dataset size for style classification!"
elif projected_segments >= 1500:
    size_verdict = "   ✅ Good dataset size for style classification"
else:
    size_verdict = "   ⚠️  Consider adding other penny journals for larger dataset"
print(size_verdict)

print("\n🎯 This dataset should work excellently for training your penny journal style classifier!")

## Test

Check the pipeline works as expected

In [None]:
# Smoke test: run the pipeline on the first extracted segment and display
# the resulting prediction as the cell output.
first_segment_text = segments[0]["text"]
best_of_3(input_text=first_segment_text)

## Dataset

Generate the dataset as individual json files. Hash the text and use it as a filename.

In [None]:
import hashlib

# Plain segment strings, in extraction order.
texts = [segment["text"] for segment in segments]

# One identifier per segment: the hex MD5 digest of the encoded text.
# ids[k] belongs to texts[k].
ids = []
for segment_text in texts:
    ids.append(hashlib.md5(segment_text.encode()).hexdigest())

In [None]:
import os
import json
import hashlib


# Root directory for generated artifacts. No earlier cell defines DATA_DIR, so
# on a fresh kernel (Restart & Run All) this cell used to fail with NameError.
# NOTE(review): "./data" is a placeholder default — point it wherever the
# output should live.
DATA_DIR = "./data"

# Directory that receives one JSON file per processed segment.
JSON_DIR = f"{DATA_DIR}/json/"

# Create the JSON directory itself (parents included). Creating only DATA_DIR
# left JSON_DIR missing, so the first write into it would fail.
os.makedirs(JSON_DIR, exist_ok=True)
def hash_filename(text):
    """
    Build the JSON output path for a piece of text.

    The file name is the hex MD5 digest of the encoded text followed by
    ".json"; the file lives under JSON_DIR.
    """
    digest = hashlib.md5(text.encode()).hexdigest()
    basename = f"{digest}.json"
    return os.path.join(JSON_DIR, basename)

def save_json(idx, model, text, pred):
    """Write one generated record to the JSON file named after the text's hash.

    Args:
        idx: Identifier stored under the "id" key.
        model: Name of the generator model, stored under "generator_model".
        text: Original passage; also determines the output path via
            hash_filename().
        pred: Prediction object exposing ``cleaned_input_text``,
            ``modernized_text`` and ``reasoning`` attributes.
    """
    record = dict(
        text=text,
        cleaned_text=pred.cleaned_input_text,
        modernized_text=pred.modernized_text,
        thinking=pred.reasoning,
        generator_model=model,
        id=idx,
        source="Irish Penny Journal",
    )
    # Similarity between the raw passage and its cleaned version, stored with
    # the record.
    record["jaccard_similarity"] = textdistance.jaccard.similarity(
        text, pred.cleaned_input_text
    )

    out_path = hash_filename(text)
    with open(out_path, "w") as fh:
        json.dump(record, fh)

## Data Generation Loop

This step takes a while. This could probably be parallelized with something like `dspy.Parallel`... Or you can just let it go overnight :)

In [None]:
# Generate one JSON record per segment. A segment whose output file already
# exists is skipped, so an interrupted run can simply be restarted.
for segment_id, text in zip(ids, texts):
    out_path = hash_filename(text)
    if os.path.exists(out_path):
        print(f"Skipping... {out_path}")
        continue

    # Pick one of the configured models at random for this segment.
    model_idx = random.choice(range(len(BASE_MODELS)))
    model_name = BASE_MODELS[model_idx]
    print(f"Setting {model_name}")
    best_of_3.set_lm(lm_list[model_idx])

    pred = best_of_3(input_text=text)
    print(pred)
    save_json(segment_id, model_name, text, pred)

# Create parquet

Load the json and convert it to a parquet file

In [None]:
import glob
import pandas as pd
import json

# Load every per-segment JSON record.
# The records are written under JSON_DIR (DATA_DIR/json/), so globbing
# DATA_DIR itself matches none of them.
records = []
for file in glob.glob(os.path.join(JSON_DIR, "*.json")):
    with open(file, "r", encoding="utf-8") as fh:
        records.append(json.load(fh))

# Fail early with a clear message rather than a KeyError from drop() below.
assert records, f"No JSON records found in {JSON_DIR}"

df = pd.DataFrame(records)
# Shuffle the rows and discard the model's reasoning trace.
df = df.sample(frac=1).reset_index(drop=True).drop(columns=["thinking"])
# Remove the provider prefix from the model name. str.lstrip() strips a *set of
# characters*, not a prefix: "openrouter/openai/gpt-4o" would become
# "ai/gpt-4o". removeprefix() drops only the exact leading string.
df["generator_model"] = df["generator_model"].str.removeprefix("openrouter/")
df.head()

In [None]:
# Columns written to the parquet file, in their final order.
export_columns = [
    "text",
    "cleaned_text",
    "modernized_text",
    "generator_model",
    "jaccard_similarity",
    "id",
    "source",
]
df[export_columns].to_parquet("irish-penny.parquet", index=False)