# Classifier using heuristics


In [None]:
# Pipeline placeholders, left as None in this notebook.
# NOTE(review): presumably overwritten by the notebook runner's parameter injection — confirm.
upstream = None
product = None

# Model names; `embedding_model` is the value handed to `generate_embeddings` further down.
# NOTE(review): `model_name` is not referenced in the visible part of this notebook — confirm it is still needed.
model_name = "mxbai-embed-large"
embedding_model = "mxbai-embed-large"

In [None]:
import os

import numpy as np
import ollama
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

In [None]:
# Pick the data root relative to the current working directory: use the local
# "./data" tree when it exists, otherwise fall back to the one two levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [None]:
# Replace True/False with 1/0:
def replace_true_false_with_0_1(df):
    """Return ``df`` with every ``True`` replaced by 1 and every ``False`` by 0.

    The replacement is done with ``DataFrame.replace``, which returns a new
    object, so the DataFrame passed in is not modified.
    """
    bool_to_int = {True: 1, False: 0}
    return df.replace(bool_to_int)

In [None]:
# Target columns: the evaluation loop below runs once per entry.
labels_columns = [
    "scientific_claim",
    "scientific_reference",
    "scientific_entities",
]

# Heuristic flag columns that are appended to each text embedding when the
# feature matrices are built.
features = [
    "is_claim_with_sciterm",
    "is_claim",
    "contains_arg",
    "contains_scientific_term",
    "has_url",
    "has_sci_domain",
    "has_sci_subdomain",
    "has_sci_mag_domain",
    "has_sci_news_domain",
    "is_related_to_research",
    "mentions_science_research_in_general",
    "mentions_scientist",
    "mentions_publications",
    "mentions_research_method",
]

# Load the provided training and test data. Both files are read as
# tab-separated, even though they carry a ".csv" extension; the "dev" file is
# the one used as the test set.
subtask4a_train_df, subtask4a_test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, file_name), sep="\t")
    for file_name in ("ct_train_clean_heuristics.csv", "ct_dev_clean_heuristics.csv")
)

## With embeddings


In [None]:
def generate_embeddings(texts: list[str], embedding_model: str = "mxbai-embed-large") -> list[list[float]]:
    """Generate one embedding vector per input text using the specified model.

    Each text is embedded with its own ``ollama.embed`` call.

    Args:
        texts (list[str]): Texts to embed. Every item is passed through ``str()``
            before being sent, so non-string values are embedded as their
            string representation.
        embedding_model (str): Name of the embedding model to use. Defaults to "mxbai-embed-large".

    Returns:
        list[list[float]]: One embedding per input text, in the same order as
            ``texts``. An empty ``texts`` yields an empty list.
    """
    embeddings = []
    for text in texts:
        result = ollama.embed(model=embedding_model, input=str(text))
        # One input gives one embedding. Copy it into a plain list so callers
        # can concatenate it with other feature lists using `+`.
        embeddings.append(list(result.embeddings[0]))
    return embeddings

In [None]:
# Let's do it again with the embeddings:

# Fixed seed so the row shuffling (and therefore the reported scores) is
# reproducible between runs.
SHUFFLE_SEED = 42


def _feature_rows(df):
    """Return one flat feature list per row: the embedding followed by the 0/1 heuristic flags."""
    heuristic_rows = replace_true_false_with_0_1(df[features]).values.tolist()
    return [list(embedding) + flags for embedding, flags in zip(df["embeddings"].tolist(), heuristic_rows)]


# The training and evaluation texts are identical for every label column, so
# embed them once up front instead of once per label.
train_embeddings = generate_embeddings(subtask4a_train_df["text"].tolist(), embedding_model=embedding_model)
test_embeddings = generate_embeddings(subtask4a_test_df["text"].tolist(), embedding_model=embedding_model)

for cl in labels_columns:
    print(f"Evaluating {cl}...")
    selected_columns = ["text", cl] + features

    # The oversampled rows are label-specific, so they are loaded (and embedded)
    # inside the loop.
    subtask4a_cat_claim_train_df = pd.read_csv(
        os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}_heuristics.tsv"),
        sep="\t",
    )

    # Work on explicit copies: adding a column to a column-slice of another
    # DataFrame can trigger pandas' SettingWithCopyWarning.
    train_df = subtask4a_train_df[selected_columns].copy()
    train_df["embeddings"] = train_embeddings

    oversampling_df = subtask4a_cat_claim_train_df[selected_columns].copy()
    oversampling_df["embeddings"] = generate_embeddings(
        oversampling_df["text"].tolist(), embedding_model=embedding_model
    )

    test_df = subtask4a_test_df[selected_columns].copy()
    test_df["embeddings"] = test_embeddings

    # Unshuffled training rows/labels, reused by both evaluation runs below.
    train_rows = _feature_rows(train_df)
    train_labels = train_df[cl].tolist()

    X_test = np.array(_feature_rows(test_df))
    y_test = test_df[cl].tolist()

    # Find the best model without using oversampling:
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
    X_train, y_train = shuffle(np.array(train_rows), train_labels, random_state=SHUFFLE_SEED)

    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    display(models.sort_values("f1_score", ascending=False))

    # Do it again with oversampling + training. Rows and labels are
    # concatenated in the same order (training first, then oversampled) so
    # they stay aligned before shuffling.
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
    X_train, y_train = shuffle(
        np.array(train_rows + _feature_rows(oversampling_df)),
        train_labels + oversampling_df[cl].tolist(),
        random_state=SHUFFLE_SEED,
    )

    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    display(models.sort_values("f1_score", ascending=False))