In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.evaluation import TripletEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_json("chunked-docs/semantic_chunks_combined.jsonl", lines=True)
texts = df["chunk_text"].tolist()

print(f"Total texts loaded: {len(texts)}")

# Train-test split (90-10)
train_texts, val_texts = train_test_split(texts, test_size=0.1, random_state=42)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Total texts loaded: 67
Training samples: 60
Validation samples: 7


In [3]:
train_data = {
    "anchor": train_texts,
    "positive": train_texts,  # In production, use actual positive pairs
}
train_dataset = Dataset.from_dict(train_data)

# Create validation dataset
val_data = {
    "anchor": val_texts,
    "positive": val_texts,
}
val_dataset = Dataset.from_dict(val_data)

print(f"Train dataset: {train_dataset}")
print(f"Val dataset: {val_dataset}")


Train dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 60
})
Val dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 7
})


In [4]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
print(f"Model loaded: {model_name}")

Model loaded: sentence-transformers/all-MiniLM-L6-v2


In [5]:
loss = MultipleNegativesRankingLoss(model)

In [6]:
args = SentenceTransformerTrainingArguments(
    # Required: output directory
    output_dir="models/miniLM_finetuned",
    
    # Training parameters
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    
    # GPU optimization (adjust based on your GPU)
    fp16=True,  # Set to False if your GPU can't handle FP16
    bf16=False,  # Set to True if your GPU supports BF16
    
    # Use NO_DUPLICATES for losses with in-batch negatives
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    
    # Evaluation & saving
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # Keep only 2 best checkpoints
    
    # Logging
    logging_steps=10,
    logging_dir="./logs",
    run_name="miniLM-semantic-chunks",
)

In [13]:
if len(val_texts) >= 30:
    evaluator = TripletEvaluator(
        anchors=val_texts[:10],
        positives=val_texts[:10],
        negatives=val_texts[10:20],
        name="val-triplets",
    )
else:
    evaluator = None
    print("Not enough validation samples for evaluator (need >=30)")

Not enough validation samples for evaluator (need >=30)


In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset if evaluator else None,
    loss=loss,
    evaluator=evaluator,
)

ValueError: You have set `args.eval_strategy` to IntervalStrategy.STEPS, but you didn't provide an `eval_dataset` or an `evaluator`. Either provide an `eval_dataset` or an `evaluator` to `SentenceTransformerTrainer`, or set `args.eval_strategy='no'` to skip evaluation.

In [None]:
print("Starting training...")
trainer.train()
print("Training complete!")


In [None]:
model.save_pretrained("models/miniLM_finetuned/final")
print("Model saved to: models/miniLM_finetuned/final")