In [29]:
# 1. Setup
import pandas as pd
import json
import random
from pathlib import Path

# Paths
BASE_DIR = Path().resolve().parent.parent  # normalization_service
RAW_DIR = BASE_DIR / "raw_data"
PROC_DIR = BASE_DIR / "finetuning" / "data" / "processed"
# PROC_DIR.mkdir(parents=True, exist_ok=True)

alt_titles_path = RAW_DIR / "Alternate_Titles.csv"
occupations_path = RAW_DIR / "All_Occupations.csv"
eval_path = BASE_DIR / "eval" / "eval_dataset.json"

# Output files
train_output = PROC_DIR / "train.jsonl"
test_output = PROC_DIR / "test.jsonl"

In [30]:
# 2. Load Raw Data
# Alternate Titles
alt_df = pd.read_csv(alt_titles_path, sep="\t", dtype=str)
alt_df = alt_df.rename(
    columns={"O*NET-SOC Code": "code", "Alternate Title": "alt_title"}
)

# Occupations
occ_df = pd.read_csv(occupations_path, dtype=str)
occ_df = occ_df.rename(columns={"Code": "code", "Occupation": "canonical_title"})

print(f"Alternate titles: {len(alt_df)} rows")
print(f"Occupations: {len(occ_df)} rows")

Alternate titles: 60511 rows
Occupations: 1016 rows


In [31]:
merged = alt_df.merge(occ_df[["code", "canonical_title"]], on="code", how="inner")
print(f"Before dedup: {len(merged)} rows")

deduped = (
    merged.groupby("alt_title", group_keys=False)
    .apply(lambda x: x.sample(1, random_state=42))
    .reset_index(drop=True)
)

print(f"After dedup: {len(deduped)} rows (unique alt titles)")
deduped.sample(100)

Before dedup: 50519 rows
After dedup: 40030 rows (unique alt titles)


  .apply(lambda x: x.sample(1, random_state=42))


Unnamed: 0,code,alt_title,Short Title,Source(s),canonical_title
14145,19-1013.00,Field Research Assistant,,10,Soil and Plant Scientists
15531,45-2041.00,Fruit Checker,,0406,"Graders and Sorters, Agricultural Products"
3670,51-9111.00,Blister Pack Operator,,0406,Packaging and Filling Machine Operators and Te...
20788,51-9199.00,Leather Stretcher,,0406,"Production Workers, All Other"
1888,25-1121.00,Associate Music Professor,,02,"Art, Drama, and Music Teachers, Postsecondary"
...,...,...,...,...,...
22386,17-1021.00,Mapping Manager,,02,Cartographers and Photogrammetrists
39247,39-9031.00,Wellness Coach,,09,Exercise Trainers and Group Fitness Instructors
15690,27-1022.00,Fur Remodeler,,0406,Fashion Designers
28095,49-2022.00,Premises Technician,,10,Telecommunications Equipment Installers and Re...


In [32]:
# 4. Load Eval Data and Filter
with open(eval_path, "r") as f:
    eval_data = json.load(f)

eval_titles = set()
for item in eval_data:
    eval_titles.add(item["input_title"].lower())

# Filter out pairs where alt_title or canonical_title appears in eval
filtered = deduped[~deduped["alt_title"].str.lower().isin(eval_titles)]

print(
    f"Filtered rows: {len(filtered)} (removed {len(deduped) - len(filtered)} due to eval overlap)"
)

deduped[deduped["alt_title"].str.lower().isin(eval_titles)]

Filtered rows: 39915 (removed 115 due to eval overlap)


Unnamed: 0,code,alt_title,Short Title,Source(s),canonical_title
82,21-1012.00,Academic Advisor,,0209,"Educational, Guidance, and Career Counselors a..."
131,13-2031.00,Accountant,,02,Budget Analysts
259,41-9012.00,Actor,,02,Models
319,25-1123.00,Adjunct Professor,,02,"English Language and Literature Teachers, Post..."
2006,23-1011.00,Attorney,,020406,Lawyers
...,...,...,...,...,...
39142,13-1121.00,Wedding Planner,,04,"Meeting, Convention, and Event Planners"
39190,49-9012.00,Welder,,02,"Control and Valve Installers and Repairers, Ex..."
39397,19-1023.00,Wildlife Biologist,,0204,Zoologists and Wildlife Biologists
39972,39-9031.00,Yoga Instructor,,020409,Exercise Trainers and Group Fitness Instructors


In [33]:
# 5. Build Positive Pairs
positive_pairs = [
    {"texts": [row.alt_title.strip(), row.canonical_title.strip()], "label": 1.0}
    for row in filtered.itertuples()
]
print(f"Positive pairs: {len(positive_pairs)}")

Positive pairs: 39915


In [34]:
# 6. Build Negative Pairs
canonical_titles = list(filtered["canonical_title"].unique())
negative_pairs = []

for row in filtered.itertuples():
    alt = row.alt_title.strip()
    # Pick a random canonical title that is NOT the correct one
    neg = random.choice(canonical_titles)
    while neg == row.canonical_title:
        neg = random.choice(canonical_titles)
    negative_pairs.append({"texts": [alt, neg], "label": 0.0})

print(f"Negative pairs: {len(negative_pairs)}")

Negative pairs: 39915


In [36]:
# 7. Combine, Shuffle, Downsample, Split, and Save
dataset = positive_pairs + negative_pairs
random.shuffle(dataset)

# Optional: Downsample to speed up fine-tuning
SAMPLE_SIZE = 10000  # adjust (e.g., 5000 for even faster)
if len(dataset) > SAMPLE_SIZE:
    dataset = random.sample(dataset, SAMPLE_SIZE)

print(f"Final dataset size after downsampling: {len(dataset)} examples")

# Train/test split
split_idx = int(0.9 * len(dataset))
train_data = dataset[:split_idx]
test_data = dataset[split_idx:]


# Save JSONL
def save_jsonl(path, data):
    with open(path, "w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")


save_jsonl(train_output, train_data)
save_jsonl(test_output, test_data)

print(f"Saved {len(train_data)} train examples to {train_output}")
print(f"Saved {len(test_data)} test examples to {test_output}")

Final dataset size after downsampling: 10000 examples
Saved 9000 train examples to /Users/devinhelgeson/code/normalization_service/finetuning/data/processed/train.jsonl
Saved 1000 test examples to /Users/devinhelgeson/code/normalization_service/finetuning/data/processed/test.jsonl
