# Simple Embedding Models with a Classification Head


In [None]:
# Both placeholders are initialised to None and are not read anywhere else in
# the visible part of this notebook.
# NOTE(review): these look like pipeline-injected parameters (Ploomber/papermill
# style) — confirm before renaming, merging, or removing them.
upstream = None
product = None

# Model identifiers; both hold the same string. `embedding_model` is re-assigned
# (to the same value) in the experiment cell and passed on as the embedding
# model name. `model_name` is not referenced in the visible cells — presumably
# kept for reporting; verify.
model_name = "mxbai-embed-large"
embedding_model = "mxbai-embed-large"

In [None]:
import os

import numpy as np
import ollama
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import shuffle

In [None]:
# Resolve the dataset root relative to the current working directory: use the
# local "./data" tree when it is present, otherwise fall back to the copy two
# directory levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [3]:
def generate_embeddings_with_labels(
    texts: list[str], labels: list[float], embedding_model: str = "mxbai-embed-large"
) -> pd.DataFrame:
    """Embed each text with an Ollama model and pair it with its label.

    One ``ollama.embed`` request is issued per text (each text is passed
    through ``str()`` first), and the first embedding of each response is kept.

    Args:
        texts (list[str]): Texts to generate embeddings for.
        labels (list[float]): Labels aligned position-by-position with ``texts``.
        embedding_model (str): Name of the embedding model to use. Defaults to
            "mxbai-embed-large".

    Returns:
        pd.DataFrame: One row per input text with the columns ``"embedding"``
        and ``"label"``. The columns are present even when the input is empty.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """
    # Fail loudly instead of letting zip() silently drop the unmatched tail,
    # which would hide a text/label misalignment.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    rows = [
        {
            "embedding": ollama.embed(model=embedding_model, input=str(text)).embeddings[0],
            "label": label,
        }
        for text, label in zip(texts, labels)
    ]
    # Explicit columns keep the schema stable for empty input, so callers can
    # always index df["embedding"] / df["label"].
    return pd.DataFrame(rows, columns=["embedding", "label"])

In [None]:
embedding_model = "mxbai-embed-large"
columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
# columns = ["scientific_reference", "scientific_entities"]

# Fixed seed for shuffling the training rows so repeated runs are reproducible.
SHUFFLE_SEED = 42

# Load the provided training and dev data (the dev split is used for evaluation):
subtask4a_train_df = pd.read_csv(
    os.path.join(ROOT_DIR, "ct_train_clean.tsv"),
    sep="\t",
)
subtask4a_test_df = pd.read_csv(
    os.path.join(ROOT_DIR, "ct_dev_clean.tsv"),
    sep="\t",
)

for cl in columns:
    # A label column is only evaluated when its oversampling file exists;
    # say so explicitly instead of skipping silently.
    oversampling_path = os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}.tsv")
    if not os.path.exists(oversampling_path):
        print(f"Skipping {cl}: oversampling file not found at {oversampling_path}")
        continue

    # The oversampling file has no header row: first column is the text,
    # second column is the label for `cl`.
    subtask4a_cat_claim_train_df = pd.read_csv(
        oversampling_path,
        sep="\t",
        header=None,
        names=["text", cl],
    )

    print(f"Evaluating {cl}...")
    # Generate embeddings for oversampling, training, and evaluation (in that order).
    # NOTE: the training/evaluation texts are the same for every `cl`; only the
    # labels differ, so these embeddings are recomputed on each iteration.
    oversampling_embeddings_df = generate_embeddings_with_labels(
        subtask4a_cat_claim_train_df["text"].tolist(),
        subtask4a_cat_claim_train_df[cl].tolist(),
        embedding_model=embedding_model,
    )
    standard_embeddings_df = generate_embeddings_with_labels(
        subtask4a_train_df["text"].tolist(),
        subtask4a_train_df[cl].tolist(),
        embedding_model=embedding_model,
    )
    eval_embeddings_df = generate_embeddings_with_labels(
        subtask4a_test_df["text"].tolist(),
        subtask4a_test_df[cl].tolist(),
        embedding_model=embedding_model,
    )

    # The evaluation set is shared by both experiments below.
    X_test = np.array(eval_embeddings_df["embedding"].tolist())
    y_test = eval_embeddings_df["label"].tolist()

    # Experiment 1: training data only. Experiment 2: training + oversampling.
    # The oversampled run comes last, so X_train / y_train / clf keep its
    # values after the loop for the cells that follow.
    combined_embeddings_df = pd.concat([standard_embeddings_df, oversampling_embeddings_df])
    for train_embeddings_df in (standard_embeddings_df, combined_embeddings_df):
        clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)

        X_train = np.array(train_embeddings_df["embedding"].tolist())
        y_train = train_embeddings_df["label"].tolist()
        X_train, y_train = shuffle(X_train, y_train, random_state=SHUFFLE_SEED)

        models, predictions = clf.fit(X_train, X_test, y_train, y_test)
        # The custom metric column is named after the function, i.e. "f1_score".
        display(models.sort_values("f1_score", ascending=False))

In [None]:
import numpy as np
from sklearn.svm import SVC

# Train a default-parameter SVC on whatever X_train / y_train currently hold.
# When the cells are run top to bottom, these are the arrays left by the last
# completed iteration of the per-column loop (training + oversampling rows).
clf = SVC()
clf.fit(X_train, y_train)

In [6]:
# Predict on the evaluation embeddings with the classifier fitted above.
preds = clf.predict(X_test)

# Score the predictions against y_test; keys are produced in the order
# f1, precision, recall, accuracy.
metrics = {
    metric_name: scorer(y_test, preds)
    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("accuracy", accuracy_score),
    )
}
metrics

{'f1': 0.8,
 'precision': 0.7777777777777778,
 'recall': 0.8235294117647058,
 'accuracy': 0.8978102189781022}