In [1]:
import psycopg as pg
from datasets import load_dataset, DatasetDict, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from peft import LoraConfig, TaskType

import random
import pandas as pd
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
with open("cleaned_base_data.json", "r") as f:
    data = json.loads(f.read())

negatives: pd.DataFrame = pd.read_csv("negatives.csv")["review_text"]

total = 0
for id, relevant_docs in data["relevant_docs"].items():
    total += len(relevant_docs)

triplets = []
with tqdm(total=total, desc="creating positive negative pairs") as pbar:
    for query_id, doc_ids in data["relevant_docs"].items():
        anchor = data["query"][query_id]
        for id in doc_ids:
            triplets.append(
                {
                    "anchor": anchor,
                    "positive": data["corpus"][id],
                    "negative": negatives.sample().values[0],
                }
            )
            pbar.update(1)

creating positive negative pairs: 100%|██████████| 5927/5927 [00:02<00:00, 2001.58it/s]


In [3]:
train_triplet, val_triplet = train_test_split(pd.DataFrame(triplets), test_size=0.2)
train_triplet, test_triplet = train_test_split(
    pd.DataFrame(train_triplet), test_size=0.2
)

train_triplet.to_json("triplet_data_train.json")
test_triplet.to_json("triplet_data_test.json")
val_triplet.to_json("triplet_data_val.json")

dataset: DatasetDict = {
    "train": Dataset.from_pandas(train_triplet, preserve_index=False),
    "test": Dataset.from_pandas(test_triplet, preserve_index=False),
    "validation": Dataset.from_pandas(val_triplet, preserve_index=False),
}

In [4]:
model_name = "intfloat/multilingual-e5-small"
model = SentenceTransformer(model_name)
"""
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.1,
)
model.add_adapter(peft_config)
"""

'\npeft_config = LoraConfig(\n    task_type=TaskType.FEATURE_EXTRACTION,\n    inference_mode=False,\n    r=8,\n    lora_alpha=32,\n    target_modules=["query", "key", "value", "dense"],\n    lora_dropout=0.1,\n)\nmodel.add_adapter(peft_config)\n'

In [5]:
trainable_params = 0
all_params = 0

print("Trainable LoRA parameters:")

for name, param in model.named_parameters():
    all_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
        # print(f"{name}: shape={param.shape}, params={param.numel()}")

print("-" * 50)
print(f"Total trainable parameters: {trainable_params:,}")
print(f"Total model parameters:    {all_params:,}")
print(f"Percentage trainable:     {100 * trainable_params / all_params:.2f}%")

Trainable LoRA parameters:
--------------------------------------------------
Total trainable parameters: 117,653,760
Total model parameters:    117,653,760
Percentage trainable:     100.00%


In [6]:
dev_evaluator = TripletEvaluator(
    anchors=dataset["test"]["anchor"],
    positives=dataset["test"]["positive"],
    negatives=dataset["test"]["negative"],
    name="dev_evaluator",
)

In [7]:
dev_evaluator(model)

{'dev_evaluator_cosine_accuracy': 0.9146469831466675}

In [8]:
def is_hit(query: str, expected_context: str, model: SentenceTransformer, k: int):
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context FROM embeddings ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = [row[0] for row in cur.fetchall()]

    cur.close()
    conn.close()

    return 1.0 if expected_context in results[:k] else 0.0

def embed_documents(speech, context, embedding):
    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO embeddings (speech, context, embedding) VALUES (%s, %s, %s)",
        (speech, context, str(embedding)),
    )
    cur.close()
    conn.close()

with tqdm(total = len(data["corpus"].keys()), desc="Saving embeddings") as pbar:
    for id, context in data["corpus"].items():
        speech_name = data["related_speech"][id]
        embedding = model.encode(context).tolist()
        embed_documents(speech_name, context, embedding)
        pbar.update(1)

In [9]:
recall_10 = (
    test_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 10), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_4 = (
    test_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 4), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric: ", recall_4)

Recall@10 Metric:  0.3793466807165437
Recall@4 Metric:  0.2665964172813488


In [10]:
loss = MultipleNegativesRankingLoss(model)

In [None]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/multilingual-e5-small-finetune-danish-subject",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=25,
    logging_first_step=True,
)

In [12]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 17.89 GB, other allocations: 18.06 GB, max allowed: 36.27 GB). Tried to allocate 366.27 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
recall_10 = (
    test_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 10), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_4 = (
    test_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 4), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric: ", recall_4)

In [None]:
recall_10 = (
    val_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 10), axis=1
    ).sum()
    / val_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_4 = (
    val_triplet.apply(
        lambda x: is_hit(x["anchor"], x["positive"], model, 4), axis=1
    ).sum()
    / val_triplet.shape[0]
)
print("Recall@4 Metric: ", recall_4)