# 6. Modelling & Experiments

In [None]:
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datasets import Dataset

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline
)

import torch
# Report the accelerator situation up front so a CPU-only kernel is noticed
# before any training is launched.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
# Only query the device name when a device actually exists.
print(f"GPU name: {torch.cuda.get_device_name(0)}" if has_cuda else "No GPU available.")


CUDA available: True
GPU count: 1
GPU name: NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
# Load the cleaned comments and keep only the two columns the classifier
# needs, renamed to the "text"/"label" names used by the tokenization and
# training code below.
df = pd.read_csv("../data/cleaned.csv")
df = df[["comment", "label_encoded"]].rename(columns={"comment": "text", "label_encoded": "label"})

# Stratified 80/20 split with a fixed seed so every experiment sees the same
# class proportions and the same held-out rows.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

# preserve_index=False: after the shuffled split the frames carry a
# non-default index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column in the datasets.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
class ExperimentRunner:
    """Fine-tune sequence-classification checkpoints and log each run to MLflow.

    One instance holds the train/test datasets; every call to
    ``train_and_evaluate`` fine-tunes one model with one hyperparameter
    combination and writes its checkpoints, classification report, confusion
    matrix and loss curve under ``../experiment_results/<run tag>``.

    NOTE(review): ``test_ds`` is used both as the per-epoch evaluation set
    (early stopping and best-checkpoint selection) and for the final report,
    so the reported test metrics are optimistically biased. Consider carving a
    separate validation split out of the training data.
    """

    def __init__(self, train_ds, test_ds, num_labels=6, experiment_name="YouTubeCommentClassifier", max_length=128):
        """
        Initialize Experiment Runner with datasets and MLflow setup.

        Args:
            train_ds: Dataset with "text" and "label" columns used for training.
            test_ds: Dataset with "text" and "label" columns used for evaluation.
            num_labels: Number of target classes for the classification head.
            experiment_name: MLflow experiment that the runs are grouped under.
            max_length: Token length every example is padded/truncated to.
        """
        self.train_ds = train_ds
        self.test_ds = test_ds
        self.num_labels = num_labels
        self.experiment_name = experiment_name
        self.max_length = max_length
        # Set per run in train_and_evaluate; read by _tokenize.
        self.tokenizer = None

        mlflow.set_tracking_uri("file:../mlruns")
        mlflow.set_experiment(experiment_name)


    def compute_metrics(self, pred):
        """Return ``{"accuracy": ...}`` for a Trainer prediction object.

        ``pred.predictions`` holds the per-class scores (argmax over the last
        axis gives the predicted class) and ``pred.label_ids`` the true labels.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc}


    def _tokenize(self, batch):
        """Tokenize a batch's "text" column, padded/truncated to ``self.max_length``."""
        return self.tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )


    @staticmethod
    def _run_tag(model_name, learning_rate, batch_size, num_epochs):
        """Build the filesystem-safe identifier shared by a run's directory and artifacts."""
        # Hub ids such as "org/model" contain a path separator.
        return f"{model_name.replace('/', '_')}_lr{learning_rate}_bs{batch_size}_ep{num_epochs}"


    @staticmethod
    def _split_loss_history(log_history):
        """Split a Trainer log history into (training loss, validation loss) frames.

        Each returned frame has an "epoch" column plus "loss" / "eval_loss".
        A frame is empty when that quantity was never logged, e.g. a run with
        fewer optimizer steps than ``logging_steps`` has no "loss" entries.
        """
        history = pd.DataFrame(log_history)

        def _curve(column):
            if "epoch" not in history.columns or column not in history.columns:
                return pd.DataFrame(columns=["epoch", column])
            return history.loc[history[column].notna(), ["epoch", column]]

        return _curve("loss"), _curve("eval_loss")


    @staticmethod
    def _tokenizer_kwarg():
        """Return the Trainer keyword that accepts the tokenizer in the installed version."""
        import inspect

        # Newer transformers releases deprecate ``tokenizer`` in favour of
        # ``processing_class``; older ones only know ``tokenizer``.
        params = inspect.signature(Trainer.__init__).parameters
        return "processing_class" if "processing_class" in params else "tokenizer"


    def train_and_evaluate(self, model_name, learning_rate=2e-5, batch_size=8, num_epochs=5, patience=2):
        """Train, evaluate, and log an experiment to MLflow.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            learning_rate: Optimizer learning rate.
            batch_size: Per-device batch size for both training and evaluation.
            num_epochs: Maximum number of training epochs.
            patience: Evaluations without accuracy improvement before early stopping.

        Side effects:
            Writes checkpoints and report/plot files under
            ``../experiment_results/<run tag>`` and logs parameters, metrics
            and those files to an MLflow run. Returns nothing.
        """
        # A run left open by an interrupted earlier call would otherwise
        # make start_run below fail.
        if mlflow.active_run():
            mlflow.end_run()

        print(f"\nRunning experiment: {model_name} | LR={learning_rate} | BS={batch_size}")

        tag = self._run_tag(model_name, learning_rate, batch_size, num_epochs)
        experiment_dir = f"../experiment_results/{tag}"
        os.makedirs(experiment_dir, exist_ok=True)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels)

        train_enc = self.train_ds.map(self._tokenize, batched=True)
        test_enc = self.test_ds.map(self._tokenize, batched=True)
        train_enc.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        test_enc.set_format("torch", columns=["input_ids", "attention_mask", "label"])

        training_args = TrainingArguments(
            output_dir=experiment_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            logging_dir="./logs",
            logging_steps=50,
        )

        with mlflow.start_run(run_name=f"{model_name}_lr{learning_rate}_bs{batch_size}"):
            mlflow.log_param("model_name", model_name)
            mlflow.log_param("learning_rate", learning_rate)
            mlflow.log_param("batch_size", batch_size)
            mlflow.log_param("epochs", num_epochs)
            mlflow.log_param("patience", patience)

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_enc,
                eval_dataset=test_enc,
                compute_metrics=self.compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
                **{self._tokenizer_kwarg(): self.tokenizer},
            )

            trainer.train()
            results = trainer.evaluate()

            for k, v in results.items():
                mlflow.log_metric(k, v)

            print("Generating classification report and confusion matrix ...")
            preds_output = trainer.predict(test_enc)
            y_true = preds_output.label_ids
            y_pred = preds_output.predictions.argmax(-1)

            # zero_division=0: a class the model never predicts has undefined
            # precision; report it as 0 instead of emitting a warning per metric.
            report = classification_report(y_true, y_pred, digits=3, zero_division=0)
            report_path = os.path.join(experiment_dir, f"classification_report_{tag}.txt")
            with open(report_path, "w") as f:
                f.write(report)
            mlflow.log_artifact(report_path)

            cm = confusion_matrix(y_true, y_pred)
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("True")
            ax.set_title(f"{model_name} Confusion Matrix")
            cm_path = os.path.join(experiment_dir, f"confusion_matrix_{tag}.png")
            fig.tight_layout()
            fig.savefig(cm_path)
            mlflow.log_artifact(cm_path)
            # Close explicitly: this method runs once per grid point and open
            # figures would otherwise accumulate in the kernel.
            plt.close(fig)

            train_loss, eval_loss = self._split_loss_history(trainer.state.log_history)

            if not train_loss.empty and not eval_loss.empty:
                fig, ax = plt.subplots(figsize=(8, 5))
                ax.plot(train_loss["epoch"], train_loss["loss"], marker="o", label="Training Loss")
                ax.plot(eval_loss["epoch"], eval_loss["eval_loss"], marker="s", label="Validation Loss")
                ax.legend()
                ax.set_title(f"{model_name} - LR={learning_rate}")
                ax.set_xlabel("Epoch")
                ax.set_ylabel("Loss")
                fig.tight_layout()
                plot_path = os.path.join(experiment_dir, f"loss_curve_{tag}.png")
                fig.savefig(plot_path)
                mlflow.log_artifact(plot_path)
                plt.close(fig)

        print(f"✅ Experiment complete. Results saved in: {experiment_dir}\n")

In [None]:
# Hyperparameter grid: every model is fine-tuned with every combination of
# learning rate, batch size and epoch budget.
models = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base"]
learning_rates = [2e-5, 3e-5]
batch_sizes = [8, 16]
epochs = [5]

runner = ExperimentRunner(train_ds, test_ds)

# Keys are the keyword names of train_and_evaluate. Insertion order matters:
# itertools.product varies the right-most axis fastest, so runs are grouped
# by model, then learning rate, then batch size.
search_space = {
    "model_name": models,
    "learning_rate": learning_rates,
    "batch_size": batch_sizes,
    "num_epochs": epochs,
}
for combo in itertools.product(*search_space.values()):
    runner.train_and_evaluate(**dict(zip(search_space, combo)))



Running experiment: distilbert-base-uncased | LR=2e-05 | BS=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 19826.52 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 18585.90 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0653,0.815665,0.760274
2,0.6404,0.5934,0.815068
3,0.3903,0.577799,0.821918
4,0.3119,0.56023,0.842466
5,0.2327,0.587653,0.84589


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/distilbert-base-uncased_lr2e-05_bs8_ep5


Running experiment: distilbert-base-uncased | LR=2e-05 | BS=16


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 15220.14 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 9110.79 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3136,0.946048,0.678082
2,0.9649,0.731737,0.797945
3,0.5768,0.671715,0.815068
4,0.5497,0.628248,0.821918
5,0.3908,0.621672,0.825342


Generating classification report and confusion matrix ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


‚úÖ Experiment complete. Results saved in: ../experiment_results/distilbert-base-uncased_lr2e-05_bs16_ep5


Running experiment: distilbert-base-uncased | LR=3e-05 | BS=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 10689.41 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 13274.84 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9175,0.721585,0.780822
2,0.5553,0.567743,0.815068
3,0.2883,0.652985,0.818493
4,0.2275,0.631905,0.84589
5,0.1337,0.674685,0.842466


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/distilbert-base-uncased_lr3e-05_bs8_ep5


Running experiment: distilbert-base-uncased | LR=3e-05 | BS=16


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 11910.90 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 8343.18 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2315,0.839995,0.746575
2,0.8695,0.640161,0.815068
3,0.4229,0.631667,0.808219
4,0.4038,0.606172,0.828767
5,0.2379,0.604094,0.832192


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/distilbert-base-uncased_lr3e-05_bs16_ep5


Running experiment: bert-base-uncased | LR=2e-05 | BS=8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 9042.14 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 8574.13 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0476,0.863121,0.691781
2,0.6458,0.651368,0.80137
3,0.3692,0.599525,0.825342
4,0.2835,0.583486,0.849315
5,0.1705,0.619596,0.84589


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/bert-base-uncased_lr2e-05_bs8_ep5


Running experiment: bert-base-uncased | LR=2e-05 | BS=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 9466.40 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 8554.24 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2427,0.989758,0.664384
2,1.0038,0.839443,0.702055
3,0.6289,0.761966,0.777397
4,0.5342,0.658326,0.808219
5,0.3513,0.630374,0.815068


Generating classification report and confusion matrix ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


‚úÖ Experiment complete. Results saved in: ../experiment_results/bert-base-uncased_lr2e-05_bs16_ep5


Running experiment: bert-base-uncased | LR=3e-05 | BS=8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 5972.56 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 3719.10 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9925,0.791349,0.760274
2,0.5858,0.536484,0.839041
3,0.305,0.611161,0.839041
4,0.2295,0.646406,0.839041


Generating classification report and confusion matrix ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


‚úÖ Experiment complete. Results saved in: ../experiment_results/bert-base-uncased_lr3e-05_bs8_ep5


Running experiment: bert-base-uncased | LR=3e-05 | BS=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 9913.54 examples/s] 
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 8090.69 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1751,0.902798,0.705479
2,0.8839,0.711396,0.780822
3,0.4946,0.621572,0.797945
4,0.4247,0.573847,0.828767
5,0.2382,0.564656,0.825342


Generating classification report and confusion matrix ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


‚úÖ Experiment complete. Results saved in: ../experiment_results/bert-base-uncased_lr3e-05_bs16_ep5


Running experiment: roberta-base | LR=2e-05 | BS=8


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 9767.02 examples/s] 
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 10403.81 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9938,0.63326,0.808219
2,0.5093,0.501934,0.828767
3,0.3287,0.645235,0.825342
4,0.2476,0.694889,0.825342


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/roberta-base_lr2e-05_bs8_ep5


Running experiment: roberta-base | LR=2e-05 | BS=16


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 11708.97 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 9363.93 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2544,0.932284,0.660959
2,0.9576,0.527952,0.835616
3,0.4463,0.492799,0.84589
4,0.3974,0.494586,0.835616
5,0.2591,0.481938,0.84589


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/roberta-base_lr2e-05_bs16_ep5


Running experiment: roberta-base | LR=3e-05 | BS=8


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 11514.61 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 10726.18 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8612,0.628952,0.791096
2,0.5308,0.516321,0.835616
3,0.3166,0.66044,0.84589
4,0.2023,0.713954,0.84589
5,0.1046,0.781784,0.835616


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/roberta-base_lr3e-05_bs8_ep5


Running experiment: roberta-base | LR=3e-05 | BS=16


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [00:00<00:00, 12402.47 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292/292 [00:00<00:00, 11017.49 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2282,0.887397,0.75
2,0.9195,0.499171,0.818493
3,0.3884,0.5308,0.835616
4,0.3189,0.513739,0.84589
5,0.1589,0.541618,0.84589


Generating classification report and confusion matrix ...
‚úÖ Experiment complete. Results saved in: ../experiment_results/roberta-base_lr3e-05_bs16_ep5



In [None]:
# Checkpoint chosen for deployment.
# NOTE(review): with 1165 training rows and batch size 8, checkpoint-730
# appears to be the last step of the 5-epoch run, while the log above shows
# epoch 4 with the higher validation accuracy for this configuration.
# TODO confirm this is the intended checkpoint.
model_path = "../experiment_results/bert-base-uncased_lr2e-05_bs8_ep5/checkpoint-730"

output_dir = "../final_model"

# Fail fast with a clear message: given a path that does not exist locally,
# from_pretrained no longer sees a directory and the resulting error tends to
# be about an invalid model identifier rather than a missing folder.
if not os.path.isdir(model_path):
    raise FileNotFoundError(f"Checkpoint directory not found: {model_path}")

os.makedirs(output_dir, exist_ok=True)

print(f"Loading model from: {model_path}")
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Portable export in the library's own on-disk format; this can be reloaded
# with from_pretrained(output_dir) regardless of how the pickle below fares
# across library versions.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Pickle bundle kept for consumers that expect final_model.pkl.
# Unpickling executes arbitrary code, so only load this file from a trusted
# location.
pickle_path = os.path.join(output_dir, "final_model.pkl")
with open(pickle_path, "wb") as f:
    pickle.dump({"model": model, "tokenizer": tokenizer}, f)

print(f"Model and tokenizer successfully saved → {pickle_path}")


üîç Loading model from: ../experiment_results/bert-base-uncased_lr2e-05_bs8_ep5/checkpoint-730
‚úÖ Model and tokenizer successfully saved ‚Üí ../experiment_results/final_model\final_model.pkl
