In [4]:
import pandas as pd
import math
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, models
from sentence_transformers.util import cos_sim
import random
import os
import re

########################################
# Config you can tweak
########################################
# --- model / paths ---
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # base checkpoint to fine-tune
TRAIN_CSV = "training-data.csv"  # labeled pairs: sentence1, sentence2, similarity
OUTPUT_DIR = "finetuned-company-matcher"  # passed to model.fit as output_path

# --- optimisation ---
EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.1  # fraction (not percent) of total steps used for warmup

# --- data split / reproducibility ---
VAL_SPLIT = 0.1  # fraction (not percent) of rows held out for eval
SEED = 42



def normalize_company_name(x: str) -> str:
    """Canonicalise a company name before it is embedded or compared.

    Processing order:
      1. Missing values (``None``, ``NaN``, ``pd.NA``) become ``""``.
      2. The value is stringified and accent-folded: it is decomposed with
         Unicode NFKD and the combining marks are removed, so ``"Société"``
         becomes ``"Societe"`` instead of losing its accented letters.
      3. The text is lower-cased.
      4. Every character that is not ``a-z``, ``0-9`` or whitespace is
         replaced by a single space (so ``"AT&T"`` becomes ``"at t"``).
      5. Runs of whitespace are collapsed and the ends are trimmed.

    Pure-ASCII input produces exactly the same output as it did before the
    accent-folding step was added.

    Args:
        x: Raw company name. Non-string scalars are passed through ``str()``.

    Returns:
        The normalised name; ``""`` for missing input.
    """
    # Local import keeps the function usable when copied on its own.
    import unicodedata

    if pd.isna(x):
        return ""

    # Fold accents first: without this the ASCII-only filter below would
    # delete letters such as "é" and split the word around them.
    decomposed = unicodedata.normalize("NFKD", str(x))
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    folded = folded.lower()
    # remove punctuation -> space
    folded = re.sub(r"[^a-z0-9\s]", " ", folded)
    # collapse repeated whitespace
    return re.sub(r"\s+", " ", folded).strip()

########################################
# 0. Reproducibility
########################################
# PyTorch's global generator.
torch.manual_seed(SEED)
# Python's stdlib generator.
random.seed(SEED)

########################################
# 1. Load data
########################################
df = pd.read_csv(TRAIN_CSV)

# basic safety checks: fail fast if a column read below is absent
required_cols = {"sentence1", "sentence2", "similarity"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Your CSV is missing columns: {missing}")

# normalize text the same way you'll normalize at inference time
df["s1_norm"] = df["sentence1"].apply(normalize_company_name)
df["s2_norm"] = df["sentence2"].apply(normalize_company_name)

# Labels must be real numbers: a blank or non-numeric similarity would become
# a NaN label and turn the training loss into NaN. A name that normalises to ""
# carries no signal either, so those rows are dropped as well.
df["similarity"] = pd.to_numeric(df["similarity"], errors="coerce")
usable = df["similarity"].notna() & (df["s1_norm"] != "") & (df["s2_norm"] != "")
n_dropped = int((~usable).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows with a missing/non-numeric similarity or an empty company name")
df = df[usable]

# shuffle (seeded, so the split below is reproducible)
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# train/val split
n_total = len(df)
if n_total < 2:
    raise ValueError(
        f"Need at least 2 usable rows to build a train/val split, found {n_total}"
    )
# At least one validation row, but never so many that the training set is empty.
n_val   = min(n_total - 1, max(1, int(n_total * VAL_SPLIT)))
df_val  = df.iloc[:n_val].reset_index(drop=True)
df_train= df.iloc[n_val:].reset_index(drop=True)

print(f"Total rows: {n_total} | Train: {len(df_train)} | Val: {len(df_val)}")

########################################
# 2. Build InputExamples
########################################
# One training example per row: the two normalised names plus a float label.
train_examples = [
    InputExample(texts=[name_a, name_b], label=float(score))
    for name_a, name_b, score in zip(
        df_train["s1_norm"], df_train["s2_norm"], df_train["similarity"]
    )
]

# The held-out rows are kept as three parallel lists for the evaluator.
val_sentences1, val_sentences2 = (
    df_val[col].tolist() for col in ("s1_norm", "s2_norm")
)
val_scores = df_val["similarity"].astype(float).tolist()

########################################
# 3. Load base model
########################################
# The checkpoint's default pooling is kept; the model is trained end-to-end
# with the cosine similarity loss created in section 4.
model = SentenceTransformer(model_name_or_path=MODEL_NAME)

########################################
# 4. Loss + Dataloader
########################################
# Batches are reshuffled on every pass over the training examples.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

train_loss = losses.CosineSimilarityLoss(model=model)

########################################
# 5. Evaluator for validation
########################################
# Embeds both sides of each held-out pair, takes their cosine similarity and
# correlates it with the labeled scores.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=val_sentences1,
    sentences2=val_sentences2,
    scores=val_scores,
    main_similarity=evaluation.SimilarityFunction.COSINE,
)

########################################
# 6. Training
########################################
# len(train_dataloader) is the number of batches in one epoch.
num_train_steps = math.ceil(len(train_dataloader) * EPOCHS)
warmup_steps    = math.ceil(num_train_steps * WARMUP_RATIO)

print(f"Training for {EPOCHS} epochs, ~{num_train_steps} steps, warmup {warmup_steps} steps")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    optimizer_params={"lr": LEARNING_RATE},
    warmup_steps=warmup_steps,
    output_path=OUTPUT_DIR,
    show_progress_bar=True,
    # Mixed precision is only requested when a CUDA device is present: the AMP
    # path is CUDA-based, so on a CPU-only machine it gives no speed-up.
    use_amp=torch.cuda.is_available()
)

print(f"Model saved to {OUTPUT_DIR}")


Total rows: 787 | Train: 709 | Val: 78




Training for 3 epochs, ~36 steps, warmup 4 steps


  scaler = torch.cuda.amp.GradScaler()


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12 [00:00<?, ?it/s]

Model saved to finetuned-company-matcher


In [3]:
# Show the frame bound to `df` as this cell's output.
# NOTE(review): this cell's execution count (3) is lower than the training
# cell's (4), and the captured output has no s1_norm/s2_norm columns, so it
# presumably reflects an earlier load. After Restart & Run All this would show
# the shuffled frame with the normalised columns. Confirm and re-run in order.
df

Unnamed: 0,sentence1,sentence2,similarity
0,Aspire Financial,PUTNAM Financial,0.059
1,Assetmark Financial Partners,Tradewinds Financial Partners,0.079
2,Bigelow Asset Management,TCI Asset Management,0.063
3,Pin Wealth Partners,Safeguard Wealth Partners,0.051
4,Qtrade Wealth Partners,Usmart Wealth Partners,0.073
...,...,...,...
782,University Financial Trust,University Asset Management Group,0.600
783,Safeguard Capital Management,Safeguard Advisory Group,0.600
784,Tavira Financial Trust,Tavira Advisory,0.600
785,Analog Financial Trust,Analog Investment Company,0.600
