# Simple Embedding Models with a Classification Head


In [None]:
# Notebook-level parameters.
# NOTE(review): `upstream` and `product` look like placeholders meant to be injected by a
# pipeline runner (they are set to None and not referenced in the visible cells) — confirm.
upstream = None
product = None

# NOTE(review): `model_name` is not referenced in the visible cells — confirm it is still needed.
model_name = "mxbai-embed-large"
# Name of the embedding model handed to `generate_embeddings_with_labels` further down.
embedding_model = "mxbai-embed-large"

In [None]:
import os

import numpy as np
import ollama
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import shuffle

In [None]:
# Pick the data root relative to the current working directory: use the local
# "./data" tree when it exists, otherwise fall back to the tree two levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [3]:
def generate_embeddings_with_labels(
    texts: list[str], labels: list[float], embedding_model: str = "mxbai-embed-large"
) -> pd.DataFrame:
    """Embed every text with Ollama and pair each embedding with its label.

    Each text is converted with ``str()`` and sent to ``ollama.embed`` in its own
    request; the first embedding of each response is stored.

    Args:
        texts (list[str]): Texts to embed.
        labels (list[float]): Labels, one per text and in the same order.
        embedding_model (str): Name of the embedding model to use. Defaults to "mxbai-embed-large".

    Returns:
        pd.DataFrame: One row per text with the columns ``embedding`` and ``label``.
            The two columns are present even when the input is empty.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """
    # Materialise both inputs so their lengths can be compared before any request is made.
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        # zip() would silently drop the surplus items and hide a data problem.
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    records = [
        {
            "embedding": ollama.embed(model=embedding_model, input=str(text)).embeddings[0],
            "label": label,
        }
        for text, label in zip(texts, labels)
    ]
    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(records, columns=["embedding", "label"])

In [None]:
# `embedding_model` comes from the parameters cell at the top of the notebook and is
# deliberately not re-assigned here, so a value changed (or injected) there is honoured.
columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
SHUFFLE_SEED = 42  # fixed so the shuffled training order is identical on every run


def _embed_texts(texts: list[str]) -> list:
    """Return one embedding vector per entry of ``texts`` using ``embedding_model``."""
    # The embeddings do not depend on the labels, so same-length placeholders are passed.
    embedded = generate_embeddings_with_labels(
        texts, [None] * len(texts), embedding_model=embedding_model
    )
    return embedded["embedding"].tolist()


def _benchmark_classifiers(X_fit, y_fit, X_eval, y_eval):
    """Run LazyClassifier on one train/eval split and return its ``(models, predictions)`` pair."""
    lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
    return lazy_clf.fit(X_fit, X_eval, y_fit, y_eval)


# Load the provided training data and the dev split that is used for evaluation:
subtask4a_train_df = pd.read_csv(os.path.join(ROOT_DIR, "ct_train_clean.tsv"), sep="\t")
subtask4a_test_df = pd.read_csv(os.path.join(ROOT_DIR, "ct_dev_clean.tsv"), sep="\t")

# The texts are the same for every label column (only the labels change), so they
# are embedded once here instead of once per column inside the loop.
train_vectors = _embed_texts(subtask4a_train_df["text"].tolist())
eval_vectors = _embed_texts(subtask4a_test_df["text"].tolist())

for cl in columns:
    # A column is only evaluated when its oversampling file exists.
    oversamples_path = os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}.tsv")
    if not os.path.exists(oversamples_path):
        print(f"Skipping {cl}: no oversampling file at {oversamples_path}")
        continue

    # The oversampling file has no header row: first column is the text, second the label.
    subtask4a_cat_claim_train_df = pd.read_csv(
        oversamples_path,
        sep="\t",
        header=None,
        names=["text", cl],
    )

    print(f"Evaluating {cl}...")
    oversampling_embeddings_df = generate_embeddings_with_labels(
        subtask4a_cat_claim_train_df["text"].tolist(),
        subtask4a_cat_claim_train_df[cl].tolist(),
        embedding_model=embedding_model,
    )
    # Pair the shared text embeddings with this column's labels.
    standard_embeddings_df = pd.DataFrame(
        {"embedding": train_vectors, "label": subtask4a_train_df[cl].tolist()}
    )
    eval_embeddings_df = pd.DataFrame(
        {"embedding": eval_vectors, "label": subtask4a_test_df[cl].tolist()}
    )

    X_test = np.array(eval_embeddings_df["embedding"].tolist())
    y_test = eval_embeddings_df["label"].tolist()

    # Find the best model without using oversampling:
    X_train = np.array(standard_embeddings_df["embedding"].tolist())
    y_train = standard_embeddings_df["label"].tolist()
    X_train, y_train = shuffle(X_train, y_train, random_state=SHUFFLE_SEED)

    models, predictions = _benchmark_classifiers(X_train, y_train, X_test, y_test)
    display(models.sort_values("f1_score", ascending=False))

    # Do it again with oversampling + training:
    combined_embeddings_df = pd.concat([standard_embeddings_df, oversampling_embeddings_df])
    X_train = np.array(combined_embeddings_df["embedding"].tolist())
    y_train = combined_embeddings_df["label"].tolist()
    X_train, y_train = shuffle(X_train, y_train, random_state=SHUFFLE_SEED)

    # X_train / y_train / X_test / y_test intentionally stay in the notebook
    # namespace: the cells below fit and score a classifier on them.
    models, predictions = _benchmark_classifiers(X_train, y_train, X_test, y_test)
    display(models.sort_values("f1_score", ascending=False))

Evaluating scientific_claim...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 332, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.270358 -> initscore=-0.992805
[LightGBM] [Info] Start training from score -0.992805


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVC,0.91,0.79,0.79,0.9,0.71,0.33
SGDClassifier,0.89,0.81,0.81,0.89,0.71,0.13
BernoulliNB,0.85,0.88,0.88,0.86,0.7,0.04
LGBMClassifier,0.9,0.77,0.77,0.89,0.68,1.52
KNeighborsClassifier,0.87,0.82,0.82,0.87,0.68,0.09
GaussianNB,0.84,0.86,0.86,0.85,0.68,0.03
NearestCentroid,0.83,0.85,0.85,0.85,0.67,0.05
NuSVC,0.9,0.76,0.76,0.89,0.67,0.34
XGBClassifier,0.89,0.76,0.76,0.88,0.65,1.06
RandomForestClassifier,0.89,0.74,0.74,0.88,0.63,1.99


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 664, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1560, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.425641 -> initscore=-0.299658
[LightGBM] [Info] Start training from score -0.299658


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVC,0.9,0.86,0.86,0.9,0.75,0.54
NuSVC,0.9,0.86,0.86,0.9,0.75,0.62
XGBClassifier,0.89,0.83,0.83,0.89,0.72,1.28
RandomForestClassifier,0.88,0.82,0.82,0.88,0.7,2.32
LGBMClassifier,0.88,0.81,0.81,0.88,0.68,1.61
AdaBoostClassifier,0.86,0.83,0.83,0.87,0.68,6.8
BaggingClassifier,0.88,0.79,0.79,0.87,0.67,7.63
SGDClassifier,0.87,0.8,0.8,0.87,0.67,0.14
GaussianNB,0.82,0.86,0.86,0.83,0.66,0.04
ExtraTreesClassifier,0.87,0.79,0.79,0.87,0.65,0.48


Evaluating scientific_reference...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 224, number of negative: 1004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011824 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.182410 -> initscore=-1.500101
[LightGBM] [Info] Start training from score -1.500101


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogisticRegression,0.91,0.84,0.84,0.9,0.75,0.58
BernoulliNB,0.88,0.88,0.88,0.89,0.74,0.08
GaussianNB,0.88,0.88,0.88,0.89,0.74,0.14
KNeighborsClassifier,0.91,0.82,0.82,0.9,0.73,0.05
SGDClassifier,0.91,0.81,0.81,0.9,0.72,0.12
NearestCentroid,0.85,0.88,0.88,0.87,0.71,0.07
PassiveAggressiveClassifier,0.88,0.81,0.81,0.88,0.69,0.31
Perceptron,0.85,0.79,0.79,0.86,0.64,0.16
SVC,0.89,0.73,0.73,0.88,0.62,0.28
LGBMClassifier,0.89,0.73,0.73,0.88,0.62,2.35


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 896, number of negative: 1004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1900, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.471579 -> initscore=-0.113807
[LightGBM] [Info] Start training from score -0.113807


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NuSVC,0.91,0.87,0.87,0.91,0.76,0.96
RandomForestClassifier,0.91,0.85,0.85,0.91,0.75,3.09
GaussianNB,0.87,0.86,0.86,0.88,0.71,0.05
SVC,0.9,0.8,0.8,0.89,0.71,0.69
BernoulliNB,0.86,0.86,0.86,0.87,0.7,0.07
NearestCentroid,0.85,0.85,0.85,0.86,0.68,0.08
LGBMClassifier,0.86,0.78,0.78,0.86,0.64,1.79
SGDClassifier,0.86,0.78,0.78,0.86,0.64,0.16
ExtraTreesClassifier,0.87,0.77,0.77,0.87,0.64,0.58
KNeighborsClassifier,0.8,0.84,0.84,0.82,0.63,0.04


Evaluating scientific_entities...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 306, number of negative: 922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249186 -> initscore=-1.102960
[LightGBM] [Info] Start training from score -1.102960


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.92,0.92,0.92,0.92,0.85,0.04
BernoulliNB,0.89,0.9,0.9,0.89,0.81,0.05
NearestCentroid,0.88,0.9,0.9,0.89,0.8,0.07
XGBClassifier,0.91,0.83,0.83,0.9,0.78,1.01
SVC,0.91,0.83,0.83,0.9,0.78,0.31
SGDClassifier,0.9,0.82,0.82,0.89,0.77,0.11
LogisticRegression,0.88,0.83,0.83,0.88,0.76,0.1
Perceptron,0.87,0.85,0.85,0.87,0.76,0.06
KNeighborsClassifier,0.86,0.86,0.86,0.87,0.75,0.04
LinearSVC,0.87,0.83,0.83,0.87,0.74,5.55


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 918, number of negative: 922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 1840, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498913 -> initscore=-0.004348
[LightGBM] [Info] Start training from score -0.004348


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ExtraTreesClassifier,0.93,0.92,0.92,0.93,0.86,0.54
NuSVC,0.93,0.9,0.9,0.93,0.85,0.82
GaussianNB,0.9,0.89,0.89,0.9,0.81,0.05
RandomForestClassifier,0.9,0.88,0.88,0.9,0.81,2.95
XGBClassifier,0.89,0.88,0.88,0.89,0.79,1.27
LGBMClassifier,0.9,0.86,0.86,0.9,0.79,1.71
SVC,0.9,0.86,0.86,0.9,0.79,0.6
BernoulliNB,0.88,0.88,0.88,0.88,0.78,0.07
NearestCentroid,0.88,0.88,0.88,0.88,0.78,0.12
LogisticRegression,0.88,0.85,0.85,0.88,0.76,0.15


In [None]:
import numpy as np
from sklearn.svm import SVC

# Fit a default-parameter SVC on whatever X_train / y_train the benchmark loop
# above left in the namespace (i.e. the data of its last completed iteration).
svc_estimator = SVC()
clf = svc_estimator.fit(X_train, y_train)

In [6]:
# Score the fitted classifier on the evaluation split.
preds = clf.predict(X_test)

# Metric name -> scoring function; insertion order fixes the order of the report.
scorers = {
    "f1": f1_score,
    "precision": precision_score,
    "recall": recall_score,
    "accuracy": accuracy_score,
}
metrics = {metric_name: scorer(y_test, preds) for metric_name, scorer in scorers.items()}
metrics

{'f1': 0.8,
 'precision': 0.7777777777777778,
 'recall': 0.8235294117647058,
 'accuracy': 0.8978102189781022}