In [None]:
import json
import os
from pathlib import Path

import torch
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

# Paths
# BASE_DIR is derived from the kernel's current working directory (two levels
# up), so the notebook has to be started from the folder it lives in.
BASE_DIR = Path().resolve().parent.parent  # normalization_service
PROC_DIR = BASE_DIR / "finetuning" / "data" / "processed"
train_path = PROC_DIR / "train.jsonl"
test_path = PROC_DIR / "test.jsonl"
MODEL_SAVE_PATH = BASE_DIR / "finetuning" / "models" / "job_title_finetuned"
MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Reproducibility: the training DataLoader shuffles, so seed torch's global RNG
# up front to make reruns of the notebook comparable.
SEED = 42
torch.manual_seed(SEED)

# The tokenizers backend prints a fork warning during training unless this
# variable is set explicitly; setdefault keeps a value exported by the user.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Device (force CPU since Mac Intel)
device = torch.device("cpu")

print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [None]:
def load_dataset(path):
    """Read a JSON Lines file of labelled text pairs into ``InputExample`` objects.

    Every non-blank line must be a JSON object with a ``"texts"`` entry (passed
    through unchanged as ``InputExample.texts``) and a ``"label"`` entry that
    ``float()`` can convert.

    Args:
        path: Location of the ``.jsonl`` file (string or path-like).

    Returns:
        A list of ``InputExample`` objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"texts"`` or ``"label"``.
    """
    examples = []
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            item = json.loads(line)
            examples.append(
                InputExample(texts=item["texts"], label=float(item["label"]))
            )
    return examples


# Read both splits with the same loader (train first, then test).
train_examples, test_examples = (
    load_dataset(split_path) for split_path in (train_path, test_path)
)

print(f"Train examples: {len(train_examples)}")
print(f"Test examples: {len(test_examples)}")

# Show a sample: the first five training pairs with their labels
for sample in train_examples[:5]:
    print(sample.texts, sample.label)

Train examples: 9000
Test examples: 1000
['Postal Mail Carrier', 'Postal Service Mail Carriers'] 1.0
['Knit Goods Mender', 'Coroners'] 0.0
['Appliance Assembler', 'Plasterers and Stucco Masons'] 0.0
['Leather Craftsman', 'Shoe and Leather Workers and Repairers'] 1.0
['Turnstile Collector', 'Civil Engineers'] 0.0


In [None]:
# Base checkpoint that the fine-tuning below starts from.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(
    model_name,
    device=device,
)
print(f"Loaded model: {model_name}")

Loaded model: sentence-transformers/all-MiniLM-L6-v2


In [None]:
# Training hyper-parameters
BATCH_SIZE = 8
EPOCHS = 2

# Objective: cosine-similarity loss bound to the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model)

# Shuffled mini-batches over the training pairs.
train_dataloader = DataLoader(
    train_examples,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Evaluator on test set
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_examples,
    name="job-title-eval",
)

print(f"Dataloader batches: {len(train_dataloader)}")

Dataloader batches: 1125


In [None]:
# Warm up over the first 10% of all training steps (batches per epoch x epochs).
total_train_steps = len(train_dataloader) * EPOCHS
WARMUP_STEPS = int(total_train_steps * 0.1)

print("Starting fine-tuning...")
# NOTE(review): the evaluator passed here was built from the test split, so any
# checkpoint selection it drives has seen test data; consider a separate dev split.
fit_options = dict(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    output_path=str(MODEL_SAVE_PATH),
    evaluation_steps=1000,  # run the evaluator every 1000 steps
    use_amp=False,  # No mixed precision on CPU
)
model.fit(**fit_options)

print(f"Model saved at: {MODEL_SAVE_PATH}")

Starting fine-tuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                                                             

Step,Training Loss,Validation Loss,Job-title-eval Pearson Cosine,Job-title-eval Spearman Cosine
1000,0.105,No log,0.816626,0.795058
1125,0.105,No log,0.810985,0.791026
2000,0.081,No log,0.821454,0.798121
2250,0.081,No log,0.821765,0.798163


Model saved at: /Users/devinhelgeson/code/normalization_service/finetuning/models/job_title_finetuned


In [None]:
# Quick Test on Sample Pairs
# Load the model back from MODEL_SAVE_PATH (instead of reusing `model`) so the
# check exercises what was actually written to disk.
fine_tuned_model = SentenceTransformer(str(MODEL_SAVE_PATH), device=device)

# Example: compute similarity
query = "java programmer"
candidate = "computer programmers"
emb1 = fine_tuned_model.encode(query)
emb2 = fine_tuned_model.encode(candidate)

# Cosine similarity: dot product divided by the product of the vector norms.
# `dot` and `norm` come from the notebook's top import cell.
similarity = dot(emb1, emb2) / (norm(emb1) * norm(emb2))
print(f"Similarity between '{query}' and '{candidate}': {similarity:.4f}")

Similarity between 'java programmer' and 'computer programmers': 0.8983
