# Classifier using heuristics


In [None]:
# Notebook parameters. `upstream` and `product` are placeholders set to None here;
# presumably a pipeline runner (e.g. ploomber/papermill) overrides them — TODO confirm.
upstream = None
product = None

# NOTE(review): `model_name` is not referenced by any cell visible in this notebook;
# only `embedding_model` is passed on to `generate_embeddings` — confirm it is needed.
model_name = "mxbai-embed-large"
# Name of the embedding model handed to `ollama.embed` via `generate_embeddings`.
embedding_model = "mxbai-embed-large"

In [None]:
import os

import numpy as np
import ollama
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

In [None]:
# Pick the data directory relative to the current working directory: use the
# local "./data" tree when it exists, otherwise the one two levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [None]:
# Replace True/False with 1/0:
def replace_true_false_with_0_1(df):
    """Return a new object in which ``True`` is mapped to 1 and ``False`` to 0.

    The input ``df`` is left untouched: ``replace`` is called without
    ``inplace=True``, so the substitution is applied to a returned copy.
    """
    bool_to_int = {True: 1, False: 0}
    return df.replace(bool_to_int)

In [None]:
# Target columns; the evaluation loop trains one set of classifiers per label.
labels_columns = [
    "scientific_claim",
    "scientific_reference",
    "scientific_entities",
]

# Heuristic feature columns appended to the text embeddings. The grouping below
# is by column name only and does not change the resulting list or its order.
features = (
    # claim / argument / term flags
    ["is_claim_with_sciterm", "is_claim", "contains_arg", "contains_scientific_term"]
    # URL and domain flags
    + ["has_url", "has_sci_domain", "has_sci_subdomain", "has_sci_mag_domain", "has_sci_news_domain"]
    # research-related mention flags
    + [
        "is_related_to_research",
        "mentions_science_research_in_general",
        "mentions_scientist",
        "mentions_publications",
        "mentions_research_method",
    ]
)

# Load the provided training and test data. Both files are read as
# tab-separated despite their ".csv" extension; the train file is read first.
subtask4a_train_df, subtask4a_test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, filename), sep="\t")
    for filename in ("ct_train_clean_heuristics.csv", "ct_dev_clean_heuristics.csv")
)

## With embeddings


In [None]:
def generate_embeddings(texts: list[str], embedding_model: str = "mxbai-embed-large") -> list[list[float]]:
    """Embed each text with an Ollama embedding model, one request per text.

    Args:
        texts (list[str]): Texts to embed. Every item is passed through ``str()``
            before being sent, so non-string values are embedded by their string form.
        embedding_model (str): Name of the embedding model to use. Defaults to "mxbai-embed-large".

    Returns:
        list[list[float]]: One embedding vector per input text, in input order.
            Each vector is a plain Python list, so callers can extend it with
            ``+``. An empty ``texts`` yields an empty list and no request is made.
    """
    return [
        # `embed` is called with a single input, so the response holds exactly
        # one vector; `list(...)` guarantees list semantics for the caller.
        list(ollama.embed(model=embedding_model, input=str(text)).embeddings[0])
        for text in texts
    ]

In [None]:
# Let's do it again with the embeddings: for every label, rank LazyClassifier
# models on embeddings + heuristic flags, first with the plain training set and
# then with training + oversampled rows.

# Fixed seed so `shuffle` produces the same row order on every run; without it
# the scores of order-sensitive models change between executions.
SHUFFLE_SEED = 42


def _build_feature_rows(df, feature_columns):
    """Return one row per record: the embedding vector followed by the 0/1 heuristic flags."""
    flag_rows = replace_true_false_with_0_1(df[feature_columns]).values.tolist()
    return [list(embedding) + flags for embedding, flags in zip(df["embeddings"].tolist(), flag_rows)]


# The training and evaluation texts are the same for every label, so embed them
# once here instead of once per loop iteration.
train_embeddings = generate_embeddings(subtask4a_train_df["text"].tolist(), embedding_model=embedding_model)
test_embeddings = generate_embeddings(subtask4a_test_df["text"].tolist(), embedding_model=embedding_model)

for cl in labels_columns:
    print(f"Evaluating {cl}...")
    subtask4a_cat_claim_train_df = pd.read_csv(
        os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}_heuristics.tsv"),
        sep="\t",
    )

    # `.copy()` gives each subset its own frame, so adding the "embeddings"
    # column neither touches the source frames nor triggers SettingWithCopyWarning.
    selected_columns = ["text", cl] + features

    train_df = subtask4a_train_df[selected_columns].copy()
    train_df["embeddings"] = train_embeddings

    # The oversampling file is label-specific, so its texts are embedded per label.
    oversampling_df = subtask4a_cat_claim_train_df[selected_columns].copy()
    oversampling_df["embeddings"] = generate_embeddings(
        oversampling_df["text"].tolist(), embedding_model=embedding_model
    )

    test_df = subtask4a_test_df[selected_columns].copy()
    test_df["embeddings"] = test_embeddings

    train_rows = _build_feature_rows(train_df, features)
    oversampling_rows = _build_feature_rows(oversampling_df, features)

    X_test = np.array(_build_feature_rows(test_df, features))
    y_test = test_df[cl].tolist()

    # Find the best model without using oversampling:
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
    X_train, y_train = shuffle(
        np.array(train_rows),
        train_df[cl].tolist(),
        random_state=SHUFFLE_SEED,
    )

    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    display(models.sort_values("f1_score", ascending=False))

    # Do it again with oversampling + training (training rows first, then the
    # oversampled rows; labels are concatenated in the same order):
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
    X_train, y_train = shuffle(
        np.array(train_rows + oversampling_rows),
        train_df[cl].tolist() + oversampling_df[cl].tolist(),
        random_state=SHUFFLE_SEED,
    )

    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    display(models.sort_values("f1_score", ascending=False))

Evaluating scientific_claim...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 332, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.270358 -> initscore=-0.992805
[LightGBM] [Info] Start training from score -0.992805


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVC,0.91,0.81,0.81,0.91,0.74,0.31
KNeighborsClassifier,0.88,0.85,0.85,0.89,0.72,0.03
LGBMClassifier,0.9,0.79,0.79,0.89,0.7,1.69
NearestCentroid,0.83,0.85,0.85,0.85,0.67,0.08
XGBClassifier,0.89,0.77,0.77,0.88,0.67,1.04
NuSVC,0.9,0.76,0.76,0.89,0.67,0.33
GaussianNB,0.84,0.84,0.84,0.85,0.67,0.04
BernoulliNB,0.82,0.85,0.85,0.84,0.66,0.05
SGDClassifier,0.88,0.78,0.78,0.87,0.65,0.09
CalibratedClassifierCV,0.88,0.77,0.77,0.88,0.65,1.62


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 664, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1560, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.425641 -> initscore=-0.299658
[LightGBM] [Info] Start training from score -0.299658


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RandomForestClassifier,0.89,0.81,0.81,0.89,0.71,2.33
SVC,0.88,0.82,0.82,0.88,0.7,0.51
NuSVC,0.88,0.82,0.82,0.88,0.7,0.63
XGBClassifier,0.88,0.81,0.81,0.88,0.69,1.26
Perceptron,0.85,0.81,0.81,0.86,0.66,0.15
SGDClassifier,0.86,0.8,0.8,0.86,0.65,0.13
LGBMClassifier,0.86,0.78,0.78,0.86,0.64,1.81
BernoulliNB,0.8,0.84,0.84,0.82,0.63,0.06
NearestCentroid,0.8,0.84,0.84,0.82,0.63,0.1
GaussianNB,0.8,0.83,0.83,0.82,0.63,0.05


Evaluating scientific_reference...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 224, number of negative: 1004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.182410 -> initscore=-1.500101
[LightGBM] [Info] Start training from score -1.500101


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.9,0.91,0.91,0.9,0.77,0.04
SGDClassifier,0.91,0.83,0.83,0.91,0.75,0.1
LogisticRegression,0.9,0.83,0.83,0.9,0.73,0.11
BernoulliNB,0.88,0.88,0.88,0.88,0.73,0.05
NearestCentroid,0.85,0.88,0.88,0.87,0.71,0.08
Perceptron,0.88,0.82,0.82,0.88,0.7,0.09
XGBClassifier,0.9,0.79,0.79,0.89,0.7,1.03
PassiveAggressiveClassifier,0.88,0.81,0.81,0.88,0.68,0.19
KNeighborsClassifier,0.88,0.78,0.78,0.88,0.67,0.03
AdaBoostClassifier,0.88,0.77,0.77,0.88,0.65,5.38


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 896, number of negative: 1004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1900, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.471579 -> initscore=-0.113807
[LightGBM] [Info] Start training from score -0.113807


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.9,0.89,0.89,0.9,0.77,0.05
NuSVC,0.91,0.87,0.87,0.91,0.76,0.87
SVC,0.91,0.84,0.84,0.9,0.75,0.7
RandomForestClassifier,0.88,0.82,0.82,0.88,0.7,3.08
BernoulliNB,0.86,0.86,0.86,0.87,0.7,0.07
NearestCentroid,0.85,0.85,0.85,0.86,0.68,0.08
ExtraTreesClassifier,0.88,0.79,0.79,0.87,0.67,0.56
LogisticRegression,0.87,0.79,0.79,0.87,0.65,0.12
CalibratedClassifierCV,0.88,0.78,0.78,0.87,0.65,6.68
XGBClassifier,0.86,0.77,0.77,0.86,0.63,1.32


Evaluating scientific_entities...


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 306, number of negative: 922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1228, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249186 -> initscore=-1.102960
[LightGBM] [Info] Start training from score -1.102960


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.92,0.93,0.93,0.92,0.85,0.04
SGDClassifier,0.91,0.87,0.87,0.91,0.82,0.05
XGBClassifier,0.91,0.85,0.85,0.91,0.81,0.89
LogisticRegression,0.91,0.87,0.87,0.9,0.81,0.09
BernoulliNB,0.88,0.89,0.89,0.89,0.79,0.06
NearestCentroid,0.88,0.9,0.9,0.88,0.79,0.07
AdaBoostClassifier,0.9,0.83,0.83,0.89,0.77,5.56
Perceptron,0.88,0.85,0.85,0.88,0.76,0.07
SVC,0.9,0.81,0.81,0.89,0.76,0.25
CalibratedClassifierCV,0.89,0.82,0.82,0.89,0.75,1.29


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 918, number of negative: 922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261159
[LightGBM] [Info] Number of data points in the train set: 1840, number of used features: 1037
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498913 -> initscore=-0.004348
[LightGBM] [Info] Start training from score -0.004348


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,f1_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NuSVC,0.93,0.91,0.91,0.93,0.86,0.8
ExtraTreesClassifier,0.93,0.9,0.9,0.93,0.85,0.54
GaussianNB,0.91,0.91,0.91,0.91,0.84,0.05
SVC,0.92,0.89,0.89,0.92,0.84,0.59
RandomForestClassifier,0.92,0.89,0.89,0.92,0.84,2.92
XGBClassifier,0.9,0.88,0.88,0.9,0.81,1.21
CalibratedClassifierCV,0.89,0.86,0.86,0.89,0.78,2.53
SGDClassifier,0.89,0.85,0.85,0.89,0.78,0.11
BernoulliNB,0.87,0.88,0.88,0.87,0.78,0.06
LogisticRegression,0.88,0.85,0.85,0.88,0.77,0.14
