In [1]:
import psycopg as pg
from datasets import load_dataset, DatasetDict, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from peft import LoraConfig, TaskType

import random
import pandas as pd
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
with open("cleaned_base_data.json", "r") as f:
    data = json.loads(f.read())

negatives: pd.DataFrame = pd.read_csv("negatives.csv")["review_text"]

total = 0
for id, relevant_docs in data["relevant_docs"].items():
    total += len(relevant_docs)

triplets = []
with tqdm(total=total, desc="creating positive negative pairs") as pbar:
    for query_id, doc_ids in data["relevant_docs"].items():
        anchor = data["query"][query_id]
        for id in doc_ids:
            triplets.append(
                {
                    "anchor": anchor,
                    "positive": data["corpus"][id],
                    "negative": negatives.sample().values[0],
                }
            )
            pbar.update(1)

creating positive negative pairs: 100%|██████████████████████| 5927/5927 [00:02<00:00, 2111.60it/s]


In [3]:
data["query_lk"] = {}
for key, value in data["query"].items():
    data["query_lk"][value] = key

In [4]:
train_triplet, val_triplet = train_test_split(pd.DataFrame(triplets), test_size=0.2)
train_triplet, test_triplet = train_test_split(
    pd.DataFrame(train_triplet), test_size=0.2
)

train_triplet.to_json("triplet_data_train.json")
test_triplet.to_json("triplet_data_test.json")
val_triplet.to_json("triplet_data_val.json")

dataset: DatasetDict = {
    "train": Dataset.from_pandas(train_triplet, preserve_index=False),
    "test": Dataset.from_pandas(test_triplet, preserve_index=False),
    "validation": Dataset.from_pandas(val_triplet, preserve_index=False),
}

In [5]:
model_name = "intfloat/multilingual-e5-small"
model = SentenceTransformer(model_name)

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.1,
)
model.add_adapter(peft_config)

In [6]:
trainable_params = 0
all_params = 0

print("Trainable LoRA parameters:")

for name, param in model.named_parameters():
    all_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
        # print(f"{name}: shape={param.shape}, params={param.numel()}")

print("-" * 50)
print(f"Total trainable parameters: {trainable_params:,}")
print(f"Total model parameters:    {all_params:,}")
print(f"Percentage trainable:     {100 * trainable_params / all_params:.2f}%")

Trainable LoRA parameters:
--------------------------------------------------
Total trainable parameters: 669,696
Total model parameters:    118,323,456
Percentage trainable:     0.57%


In [7]:
dev_evaluator = TripletEvaluator(
    anchors=dataset["test"]["anchor"],
    positives=dataset["test"]["positive"],
    negatives=dataset["test"]["negative"],
    name="dev_evaluator",
)

In [8]:
dev_evaluator(model)

{'dev_evaluator_cosine_accuracy': 0.9241306781768799}

In [13]:
def is_hit(query: str, model: SentenceTransformer, k: int, data: dict):
    query_id = data["query_lk"][query]
    expected_ids = data["relevant_docs"][query_id]
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context_id FROM embeddings ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = [row[0] for row in cur.fetchall()]

    cur.close()
    conn.close()

    return 1.0 if set(results) & set(expected_ids) else 0.0


def is_hit_base(query: str, model: SentenceTransformer, k: int, data: dict):
    query_id = data["query_lk"][query]
    expected_ids = data["relevant_docs"][query_id]
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context_id FROM embeddings_base ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = [row[0] for row in cur.fetchall()]

    cur.close()
    conn.close()

    return 1.0 if set(results) & set(expected_ids) else 0.0


def percentage_hits_base(query: str, model: SentenceTransformer, k: int, data: dict):
    query_id = data["query_lk"][query]
    expected_ids = data["relevant_docs"][query_id]
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context_id FROM embeddings_base ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = [row[0] for row in cur.fetchall()]

    cur.close()
    conn.close()

    min_res = min(len(expected_ids), k)

    return len(set(results) & set(expected_ids)) / min_res


def percentage_hits(query: str, model: SentenceTransformer, k: int, data: dict):
    query_id = data["query_lk"][query]
    expected_ids = data["relevant_docs"][query_id]
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context_id FROM embeddings ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = [row[0] for row in cur.fetchall()]

    cur.close()
    conn.close()

    min_res = min(len(expected_ids), k)

    return len(set(results) & set(expected_ids)) / min_res

In [10]:
def embed_documents(speech, context_id, context, embedding):
    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO embeddings (speech, context_id, context, embedding) VALUES (%s, %s, %s, %s)",
        (speech, context_id, context, str(embedding)),
    )
    cur.close()
    conn.close()


def embed_documents_base(speech, context_id, context, embedding):
    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO embeddings_base (speech, context_id, context, embedding) VALUES (%s, %s, %s, %s)",
        (speech, context_id, context, str(embedding)),
    )
    cur.close()
    conn.close()


with tqdm(total=len(data["corpus"].keys()), desc="Saving embeddings") as pbar:
    for id, context in data["corpus"].items():
        speech_name = data["related_speech"][id]
        embedding = model.encode(context).tolist()
        embed_documents_base(speech_name, id, context, embedding)
        pbar.update(1)

Saving embeddings: 100%|███████████████████████████████████████| 1173/1173 [02:11<00:00,  8.89it/s]


In [14]:
recall_10 = (
    test_triplet.apply(
        lambda x: is_hit_base(x["anchor"], model, 10, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_4 = (
    test_triplet.apply(lambda x: is_hit_base(x["anchor"], model, 4, data), axis=1).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric: ", recall_4)
recall_10 = (
    test_triplet.apply(
        lambda x: percentage_hits_base(x["anchor"], model, 10, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric %: ", recall_10)
recall_4 = (
    test_triplet.apply(
        lambda x: percentage_hits_base(x["anchor"], model, 4, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric %: ", recall_4)

Recall@10 Metric:  0.38883034773445735
Recall@4 Metric:  0.2792413066385669
Recall@10 Metric %:  0.3675125863482028
Recall@4 Metric %:  0.26536705303828595


In [15]:
loss = MultipleNegativesRankingLoss(model)

In [16]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/multilingual-e5-small-finetune-danish-subject",
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    bf16=True,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=25,
    logging_first_step=True,
)

In [17]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    loss=loss,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
100,2.1483,1.999138
200,1.4389,1.16166
300,1.0053,0.738626
400,0.8179,0.688838
500,0.761,0.65504
600,0.7138,0.646488
700,0.7426,0.631984
800,0.6927,0.625631
900,0.6812,0.614663
1000,0.6691,0.610578


TrainOutput(global_step=2370, training_loss=0.7815563976513182, metrics={'train_runtime': 5394.3691, 'train_samples_per_second': 3.515, 'train_steps_per_second': 0.439, 'total_flos': 0.0, 'train_loss': 0.7815563976513182, 'epoch': 5.0})

In [18]:
with tqdm(total=len(data["corpus"].keys()), desc="Saving embeddings") as pbar:
    for id, context in data["corpus"].items():
        speech_name = data["related_speech"][id]
        embedding = model.encode(context).tolist()
        embed_documents(speech_name, id, context, embedding)
        pbar.update(1)

Saving embeddings: 100%|███████████████████████████████████████| 1173/1173 [01:20<00:00, 14.55it/s]


In [19]:
recall_10 = (
    test_triplet.apply(lambda x: is_hit(x["anchor"], model, 10, data), axis=1).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_4 = (
    test_triplet.apply(lambda x: is_hit(x["anchor"], model, 4, data), axis=1).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric: ", recall_4)
recall_10 = (
    test_triplet.apply(
        lambda x: percentage_hits(x["anchor"], model, 10, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric %: ", recall_10)
recall_4 = (
    test_triplet.apply(
        lambda x: percentage_hits(x["anchor"], model, 4, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@4 Metric %: ", recall_4)

Recall@10 Metric:  0.5015806111696522
Recall@4 Metric:  0.3403582718651212
Recall@10 Metric %:  0.4764518206299028
Recall@4 Metric %:  0.3213031260976466


In [20]:
dev_evaluator(model)

{'dev_evaluator_cosine_accuracy': 0.9989462494850159}

In [21]:
recall_10 = (
    test_triplet.apply(lambda x: is_hit(x["anchor"], model, 20, data), axis=1).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric: ", recall_10)
recall_10 = (
    test_triplet.apply(
        lambda x: percentage_hits(x["anchor"], model, 20, data), axis=1
    ).sum()
    / test_triplet.shape[0]
)
print("Recall@10 Metric %: ", recall_10)

Recall@10 Metric:  0.6301369863013698
Recall@10 Metric %:  0.6093469316072057


In [24]:
test_triplet["has hit"] = test_triplet.apply(
    lambda x: is_hit(x["anchor"], model, 20, data), axis=1
)

In [25]:
test_triplet

Unnamed: 0,anchor,positive,negative,has hit
5597,Samarbejde mellem offentlig og privat sektor,Corona har stillet helt særlige krav til perso...,Hurtig fejlfri levering. Gode priser.,1.0
4791,Demokratisk samarbejde og beslutningstagning,Vi står sammen ved at holde afstand. Alligevel...,Snabb leverans!!,0.0
2978,Ende på corona-restricitioner,"Og blandt danskere over 65 år, der gælder det,...",Snabb leverans.,0.0
5212,Psykisk sundhedsydelser i Danmark,Sort blokpolitik vil sætte Danmarks klimaindsa...,Jeg søgte en IKEA shopping card i 2018 og fik....,0.0
3945,Regjeringsændring og Valgkamp,"Da vi var samlet her i Aalborg sidste år, der ...",Rigtige gode priser og mange forskellige brand...,0.0
...,...,...,...,...
1591,Pandemis indvirkning på samfundet,Hvad enten det sker ved skærmen derhjemme elle...,Hurtig levering.,0.0
1990,Grønland-Danmark relationer og historiske spør...,"Og der må jeg bare sige, jeg synes, at det på ...",Bestilte i februar lidt forskelligt fra shampo...,0.0
4585,Nødvendigheden af fællesskab og samtale over i...,Tiden under pandemien bliver sommetider sammen...,Opdateret.\n\nLad jer ikke snyde af den fine s...,0.0
3819,Arbejdsstyrke,"Vi kan ikke undvære hinanden. Vi har brug for,...","Jeg bestilte en Nintendo Wii, dog på mail, som...",0.0


In [26]:
def is_hit(query: str, model: SentenceTransformer, k: int, data: dict):
    query_id = data["query_lk"][query]
    expected_ids = data["relevant_docs"][query_id]
    embedded_query = model.encode(query).tolist()

    conn = pg.connect("dbname=vector_rag user=postgres password=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute(
        "SELECT context_id, context FROM embeddings ORDER BY embedding <=> %s::vector LIMIT %s;",
        (str(embedded_query), str(k)),
    )

    results = cur.fetchall()

    cur.close()
    conn.close()

    return results

In [35]:
is_hit("Demokratisk samarbejde og beslutningstagning", model, 20, data)

[('dec43128-fce4-4035-a3bb-da121466716a',
  'De næste tre dage handler ikke først og fremmest at få ret. Eller om at få det sidste ord. Men om, at vi alle tager del i en fælles samtale. Om de store spørgsmål, vi står over for. Og om hvilken vej vi skal bevæge os. Demokrati er meget mere end at afgive sin stemme. Det er at bruge den. Og lytte til andres. Debattere. Være uenige. Blive klogere. Kun sådan kan vi flytte os som mennesker og som samfund.'),
 ('fe2ec220-5c97-4790-812b-1c0005014246',
  'Når den udvælgelse, der ligger bag hvilke opslag, man ser, primært bekræfter de holdninger, man selv havde i forvejen. Og dermed skaber såkaldte ekkokamre. Hvor vi kun mødes med mennesker, der er enige med os selv. I stedet for at lade meningerne brydes. Så dyrkes forskellene. Dynamikken forsvinder til fordel for retorikken. Og de simple svar tillægges den største værdi. Det bidrager til polarisering i vores samfund. Og udfordrer vores demokrati. Hal Koch beskrev det så godt allerede i 1945. Det