In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import math
import random
import os
from datetime import datetime
from datasets import Dataset

Adding Configurations for Retiever(distilroberta) model

In [None]:
# Configuration

# Configuration
MODEL_NAME = 'all-distilroberta-v1'
TRIPLET_FILE_PATH = 'optimized_triplets_ma.csv'
OUTPUT_MODEL_PATH = f'output/finetuned-{MODEL_NAME.replace("/", "-")}-{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'

# Training Hyperparameters
NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 16
TRIPLET_MARGIN = 5
LEARNING_RATE = 2e-5
WARMUP_STEPS_RATIO = 0.1
USE_AMP = True

# Evaluation Configuration
VALIDATION_SET_FRACTION = 0.1
EVALUATION_STEPS = 100

In [None]:
# Load and Prepare Data

df = pd.read_csv(TRIPLET_FILE_PATH)
df.dropna(subset=['anchor', 'positive', 'negative'], inplace=True)

# Creating InputExamples
train_samples = []
for index, row in df.iterrows():
    anchor = str(row['anchor'])
    positive = str(row['positive'])
    negative = str(row['negative'])
    train_samples.append(InputExample(texts=[anchor, positive, negative]))

In [None]:
# Split Data (Train/Validation)

random.shuffle(train_samples)

# Calculate split index
num_samples = len(train_samples)
num_validation = math.ceil(num_samples * VALIDATION_SET_FRACTION)
num_train = num_samples - num_validation

train_dataset = train_samples[:num_train]
validation_dataset = train_samples[num_train:]


print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(validation_dataset)}")

Training samples: 14281
Validation samples: 1587


In [None]:
# Create DataLoader

# Creating training DataLoader with batch size
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE
)

#Load Pre-trained Model

model = SentenceTransformer(MODEL_NAME)

#Define Loss Function
# TripletLoss takes anchor, positive, negative samples
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.COSINE,
    triplet_margin=TRIPLET_MARGIN
)
print(f"Using TripletLoss with margin {TRIPLET_MARGIN} and COSINE distance.")

# Setting up evaluator

if validation_dataset:
    validation_evaluator = evaluation.TripletEvaluator.from_input_examples(
        validation_dataset,
        name=f'{MODEL_NAME.replace("/", "-")}-val'
    )
    print("Validation evaluator created.")
else:
    validation_evaluator = None
    print("No validation data, skipping evaluator setup.")

Using TripletLoss with margin 5 and COSINE distance.
Validation evaluator created.


In [None]:
# Fine-tune the Model

# Calculate total training steps and warmup steps
total_steps = math.ceil(len(train_dataloader) * NUM_EPOCHS)
warmup_steps = math.ceil(total_steps * WARMUP_STEPS_RATIO)
print(f"\nTotal training steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")

print(f"\nStarting fine-tuning for {NUM_EPOCHS} epochs...")

os.environ["WANDB_DISABLED"] = "true"
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    evaluator=validation_evaluator,
    evaluation_steps=EVALUATION_STEPS if validation_evaluator else 0,
    output_path=OUTPUT_MODEL_PATH,
    save_best_model=True if validation_evaluator else False,
    optimizer_params={'lr': LEARNING_RATE},
    use_amp=USE_AMP
)

print(f"\n--- Training Complete ---")
print(f"Fine-tuned model saved to: {OUTPUT_MODEL_PATH}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Total training steps: 2679
Warmup steps: 268

Starting fine-tuning for 3 epochs...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,All-distilroberta-v1-val Cosine Accuracy
100,No log,No log,0.856963
200,No log,No log,0.855072
300,No log,No log,0.84436
400,No log,No log,0.84373
500,4.329500,No log,0.860113
600,4.329500,No log,0.856963
700,4.329500,No log,0.855703
800,4.329500,No log,0.855703
893,4.329500,No log,0.856963
900,4.329500,No log,0.856963



--- Training Complete ---
Fine-tuned model saved to: output/finetuned-all-distilroberta-v1-2025-04-23_03-43-57


In [31]:
#Testing the fine-tuned model
from sentence_transformers.util import cos_sim


model_fine_tuned = SentenceTransformer(OUTPUT_MODEL_PATH)
address1 = "346, Franklin Street, Unit d, Worcester, Worcester County, Massachusetts, 01604"
address2 = "9, Frank Street, Worcester, Worcester County, Massachusetts, 01604"
address3 = "6, Evergreen Drive, Unit 603, Middleborough, Plymouth County, Massachusetts, 02346"
embeddings = model_fine_tuned.encode([address1, address2, address3])


similarity_1_2 = cos_sim(embeddings[0], embeddings[1])
similarity_1_3 = cos_sim(embeddings[0], embeddings[2])
print(f"Similarity between '{address1}'\n and '{address2}': {similarity_1_2.item():.4f}\n")
print(f"Similarity between '{address1}'\n and '{address3}': {similarity_1_3.item():.4f}")

Similarity between '346, Franklin Street, Unit d, Worcester, Worcester County, Massachusetts, 01604'
 and '9, Frank Street, Worcester, Worcester County, Massachusetts, 01604': 0.9999

Similarity between '346, Franklin Street, Unit d, Worcester, Worcester County, Massachusetts, 01604'
 and '6, Evergreen Drive, Unit 603, Middleborough, Plymouth County, Massachusetts, 02346': -0.9983
