In [10]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
import random
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.nn import CrossEntropyLoss
import nltk

In [9]:
df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]
df["label"] = df["label"].str.lower()

number_of_labels = df["label"].value_counts()
number_of_labels

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]), 
    y=df["label"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Check class weights
print("Class Weights:", class_weights)


# First, split into training (80%) and testing (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Then, split the training set into training (80% of 80% = 64%) and validation (20% of 80% = 16%)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Ensure labels are of type integer
train_dataset = train_dataset.map(lambda e: {'label': int(e['label'])})
val_dataset = val_dataset.map(lambda e: {'label': int(e['label'])})
test_dataset = test_dataset.map(lambda e: {'label': int(e['label'])})

print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}, Test samples: {len(test_dataset)}")


model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

class DistilBERTWithWeightedLoss(nn.Module):
    def __init__(self, model_name, num_labels, class_weights, dropout=0.4):
        super().__init__()
        
        config = DistilBertConfig.from_pretrained(
            model_name, 
            num_labels=num_labels, 
            hidden_dropout_prob=dropout,
            attention_probs_dropout_prob=dropout
        )

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

    
device = torch.device("mps" if torch.has_mps else "cpu")
print(f"Using device: {device}")

# Initialize model
num_labels = len(label_encoder.classes_)
model = DistilBERTWithWeightedLoss(model_name, num_labels, class_weights.to(device), dropout=0.4)
model.to(device)



# Initialize model
num_labels = len(label_encoder.classes_)  # Count unique labels
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBERTWithWeightedLoss(model_name, num_labels, class_weights, dropout=0.4)

model.to(device)  # Move model to GPU if available

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower loss is better
    learning_rate= 1e-5,  # Lower learning rate to improve stability
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Reduce epochs to prevent overfitting
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_steps=1000,  # Log less frequently to reduce clutter
    report_to="none",  # Prevents reporting to WandB, TensorBoard, etc.
    fp16=False,  # Disable mixed precision to avoid MPS issue
    warmup_ratio=0.1,  # Warm-up for the first 10% of training
    lr_scheduler_type="linear",
    gradient_accumulation_steps=2,  # Helps stabilize training
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=1
    )
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
config.save_pretrained("./final_model")

Class Weights: tensor([4.6842, 0.1776, 2.5429, 2.5429, 0.4564, 4.9444, 2.8710, 2.6970, 1.4590,
        0.6312, 5.9333, 0.7008, 1.6182, 5.2353, 1.7115])


  if _pandas_api.is_sparse(col):


Map:   0%|          | 0/854 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Training samples: 854, Validation samples: 214, Test samples: 267


Map:   0%|          | 0/854 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Using device: mps


  device = torch.device("mps" if torch.has_mps else "cpu")
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.683,2.689718,0.258427,0.661099,0.258427,0.269544
2,2.6488,2.617299,0.483146,0.591568,0.483146,0.416768
3,2.5232,2.461237,0.501873,0.675846,0.501873,0.473595
4,2.3071,2.319612,0.561798,0.629688,0.561798,0.522
5,2.1704,2.211452,0.565543,0.637473,0.565543,0.533553
6,2.032,2.117745,0.558052,0.644434,0.558052,0.547906
7,1.8997,2.056645,0.565543,0.634916,0.565543,0.559691
8,1.787,1.991419,0.580524,0.648735,0.580524,0.573389
9,1.693,1.944241,0.576779,0.654627,0.576779,0.577911
10,1.5754,1.897902,0.58427,0.62523,0.58427,0.586716
