## Imports and Definitions ##

In [6]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import StratifiedKFold
from transformers import set_seed, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, AutoConfig, Trainer, TextClassificationPipeline, pipeline, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import json
import os
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

def compute_metrics_two(y_true, y_pred):
    """Score hard label predictions against the ground truth.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are macro-averaged.
    """
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}

    # The remaining scorers share one call signature, so drive them from a table.
    macro_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(y_true, y_pred, average='macro')

    return metrics

class CustomLabelEncoder:
    """Label encoder that assigns new codes to labels unseen during ``fit``.

    Wraps ``sklearn.preprocessing.LabelEncoder``. Whereas the wrapped encoder
    rejects labels it was not fitted on, ``transform`` here first appends any
    unseen labels to the known classes and then encodes.

    NOTE(review): appended classes are not kept sorted. Whether the wrapped
    encoder tolerates an unsorted ``classes_`` array may depend on the label
    dtype and the scikit-learn version -- confirm before using this with
    non-string labels.
    """

    def __init__(self):
        self.encoder = LabelEncoder()
        # Mirror of ``self.encoder.classes_``; ``None`` until ``fit`` is called.
        self.classes_ = None

    def fit(self, y):
        """Fit the wrapped encoder on ``y`` and return ``self``."""
        self.encoder.fit(y)
        self.classes_ = self.encoder.classes_
        return self

    def transform(self, y):
        """Encode ``y``, registering previously unseen labels first.

        Unseen labels are appended in order of first appearance in ``y``.
        Iterating a ``set`` here instead would make the codes given to unseen
        string labels depend on hash randomisation, i.e. vary between
        interpreter sessions.

        Args:
            y: Iterable of hashable labels.

        Returns:
            The integer codes produced by the wrapped encoder.
        """
        known = set(self.encoder.classes_)
        # dict.fromkeys de-duplicates while preserving first-appearance order.
        unseen = [label for label in dict.fromkeys(y) if label not in known]
        if unseen:
            # Append one label at a time so each label keeps its own type
            # (building a single array from mixed labels could coerce them).
            for label in unseen:
                self.classes_ = np.append(self.classes_, label)
            self.encoder.classes_ = self.classes_
        return self.encoder.transform(y)

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoding."""
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """Map integer codes back to labels using the wrapped encoder."""
        return self.encoder.inverse_transform(y)

def number_classes(data):
    """Count the distinct non-null labels across every split in ``data``.

    Args:
        data: Mapping from split name to something ``pd.DataFrame`` can be
            built from; each resulting frame must have a ``'labels'`` column.

    Returns:
        Number of distinct ``'labels'`` values over all splits, ignoring
        rows whose label is missing.
    """
    labels_per_split = (
        pd.DataFrame(data[split_name]).dropna(subset=['labels'])['labels'].unique()
        for split_name in data
    )
    # Union of the per-split label arrays; an empty mapping yields an empty set.
    return len(set().union(*labels_per_split))

def compute_metrics(eval_preds):
    """Turn a ``(logits, labels)`` pair into classification metrics.

    Passed to the ``Trainer`` as its ``compute_metrics`` callback.

    Args:
        eval_preds: Two-item sequence ``(logits, labels)``; the predicted
            class of each row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'`` (the last three macro-averaged).
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)
    # Same four scores, in the same order, as the label-based helper computes.
    return compute_metrics_two(labels, predicted_ids)

def tokenize_function(batch):
    """Tokenize ``batch['text']`` with the module-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.

    Args:
        batch: Mapping with a ``'text'`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    return tokenizer(batch['text'], truncation=True)


def classify_texts_in_batches(texts, batch_size=64):
    """Run the module-level ``classifier`` over ``texts`` in slices.

    Prints a progress line after every slice.

    Args:
        texts: Sliceable sequence of input texts.
        batch_size: Number of texts handed to ``classifier`` per call.

    Returns:
        List of ``(label, score)`` tuples, one per input text, in input order.
    """
    n = len(texts)
    labelled = []
    for start_idx in range(0, n, batch_size):
        end_idx = min(start_idx + batch_size, n)
        outputs = classifier(texts[start_idx:end_idx], truncation=True)
        # Each output is expected to expose 'label' and 'score' entries.
        labelled += [(output['label'], output['score']) for output in outputs]
        print(f"Processed {end_idx}/{n} texts")
    return labelled
    
# Pretrained checkpoint name passed to the tokenizer, config and model loaders.
checkpoint = 'bert-base-multilingual-uncased'
# CSV column used as the classification target; also part of the results file name.
classifier_type = 'product'
# CSV file read with pandas to build the dataset.
dataset_path = '/scratch/data_processed.csv'
# Seed shared by the fold split, test/validation split, resampling, set_seed and XGBoost.
seed = 42
# Passed to TrainingArguments as num_train_epochs.
epochs = 5
# When True, every class in each fold's training split is resampled to exactly 100 rows.
resample_to_100 = False

## Load the Dataset and Convert it to a HuggingFace Dataset ##

In [7]:
def _resample_class_to_size(class_data, n_samples, random_state):
    """Return ``class_data`` resampled to exactly ``n_samples`` rows.

    Classes with too few rows are oversampled with replacement, classes with
    too many are downsampled without replacement, and classes that already
    have ``n_samples`` rows are returned untouched.
    """
    n_rows = class_data.shape[0]
    if n_rows == n_samples:
        return class_data
    return resample(
        class_data,
        replace=n_rows < n_samples,
        n_samples=n_samples,
        random_state=random_state,
    )


# Stratified K-Fold Cross-Validation: number of folds.
k = 5

df = pd.read_csv(dataset_path)
df = df.dropna(subset=[classifier_type])
df = df.rename(columns={classifier_type: 'labels'})

# Drop classes that are too rare to be split. Every held-out fold is halved
# into test and validation with stratification, which needs at least 2 rows
# per class in each held-out fold, i.e. at least 2 * k rows per class overall.
# A lower threshold lets classes through that make train_test_split raise.
min_samples_per_class = 2 * k
label_counts = df['labels'].value_counts()
classes_to_keep = label_counts[label_counts >= min_samples_per_class].index.tolist()
# .copy() so the column assignment below writes to an independent frame.
df = df[df['labels'].isin(classes_to_keep)].copy()

# Convert textual labels to unique integers (ids follow first appearance).
unique_labels = df['labels'].unique()
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
df['labels'] = df['labels'].map(label_to_id)

skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

# Store each fold's dataset
dataset_folds = []

# Split the dataset into K folds
for train_index, test_val_index in skf.split(df, df['labels']):
    train_df = df.iloc[train_index]
    test_val_df = df.iloc[test_val_index]

    # Halve the held-out part into test and validation, keeping class proportions.
    test_df, validation_df = train_test_split(
        test_val_df,
        test_size=0.5,
        stratify=test_val_df['labels'],
        random_state=seed,
    )

    # Convert to HuggingFace Dataset
    data = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'test': Dataset.from_pandas(test_df),
        'validation': Dataset.from_pandas(validation_df),
    })

    if resample_to_100:
        df_train = data['train'].to_pandas()
        y = df_train['labels']

        # Target number of training rows per class
        desired_samples = 100
        # Classes are visited in order of first appearance so the concatenated
        # frame (and therefore the seeded shuffle below) is reproducible.
        resampled_data = [
            _resample_class_to_size(df_train[df_train['labels'] == class_label], desired_samples, seed)
            for class_label in y.unique()
        ]

        df_resampled = pd.concat(resampled_data)
        df_resampled = df_resampled.sample(frac=1, random_state=seed).reset_index(drop=True)
        data_resampled = Dataset.from_pandas(df_resampled)
        data['train'] = data_resampled

    dataset_folds.append(data)

## Train Model ##

In [None]:
# Per fold: fine-tune the transformer classifier, evaluate it on the fold's
# test split, then train an XGBoost classifier on the 'category' and
# 'subcategory' columns plus the fine-tuned model's predicted label and score.
# Per-fold results and the cross-fold averages are written to a text file.
set_seed(seed)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Save results for each fold
results_bert = []
results_xgboost = []

with open(f'results_ensemble_{classifier_type}.txt', 'w') as result_file:
    # Loop over each fold in the dataset_folds list
    for fold_number, data in enumerate(dataset_folds):
        
        # Fine-tune BERT
        # NOTE(review): padding=True pads inside each map batch, while a
        # DataCollatorWithPadding is also handed to the Trainer below -- the
        # padding here looks redundant; confirm before removing.
        tokenized_datasets = data.map(lambda examples: tokenizer(examples['text'], truncation=True, padding=True), batched=True)

        # num_labels is the number of distinct labels across this fold's splits.
        config = AutoConfig.from_pretrained(checkpoint, label2id=label_to_id, id2label=id_to_label, num_labels=number_classes(data))
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

        # Evaluate, log and save once per epoch; at the end reload the
        # checkpoint with the lowest eval_loss.
        training_args = TrainingArguments(
            output_dir=f'trainer-output-fold-{fold_number}',
            num_train_epochs=epochs,
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            load_best_model_at_end=True,
            save_strategy='epoch',
            metric_for_best_model='eval_loss',
            greater_is_better=False
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['validation'],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()
        results = trainer.evaluate(tokenized_datasets['test']) # Evaluate the model on the current fold's test dataset
        results_bert.append(results)
        result_file.write(f"Fold {fold_number} Test Results BERT: {results}\n")
        
        # Wrap the fine-tuned model in a pipeline; classify_texts_in_batches
        # reads this module-level `classifier`. Uses device 0 when CUDA is
        # available, otherwise -1.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        classifier = pipeline(task='sentiment-analysis', model=model, tokenizer=tokenizer, device=0 if device == 'cuda' else -1)
        
        # Train Dataset: Classify texts and get predictions and scores
        # NOTE(review): these are predictions on the rows the model was just
        # fine-tuned on, so the 'prediction'/'score' features may look more
        # reliable here than on the test split -- consider whether held-out
        # predictions would be more appropriate for the second-stage model.
        results = classify_texts_in_batches(data['train']['text'])
        predictions, scores = zip(*results)

        train_df = data['train'].to_pandas()
        train_df['prediction'] = predictions
        train_df['score'] = scores
        
        # Test Dataset: Classify texts and get predictions and scores
        results = classify_texts_in_batches(data['test']['text'])
        predictions, scores = zip(*results)

        test_df = data['test'].to_pandas()
        test_df['prediction'] = predictions
        test_df['score'] = scores
        
        # Second-stage features: two dataset columns plus the first-stage output.
        X_train = train_df[['category', 'subcategory', 'prediction', 'score']].copy()
        y_train = train_df['labels'].copy()

        X_test = test_df[['category', 'subcategory', 'prediction', 'score']].copy()
        y_test = test_df['labels'].copy()

        # One encoder per non-numeric feature, fitted on train only; values
        # first seen in the test split are added by CustomLabelEncoder.transform.
        category_encoder = CustomLabelEncoder()
        subcategory_encoder = CustomLabelEncoder()
        prediction_encoder = CustomLabelEncoder()

        X_train['category'] = category_encoder.fit_transform(X_train['category'])
        X_test['category'] = category_encoder.transform(X_test['category'])

        X_train['subcategory'] = subcategory_encoder.fit_transform(X_train['subcategory'])
        X_test['subcategory'] = subcategory_encoder.transform(X_test['subcategory'])

        X_train['prediction'] = prediction_encoder.fit_transform(X_train['prediction'])
        X_test['prediction'] = prediction_encoder.transform(X_test['prediction'])

        # Train XGBoost
        # NOTE: this rebinds `model`, which until here referred to the fine-tuned transformer.
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=seed)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)
        results = compute_metrics_two(y_test, predictions)
        results_xgboost.append(results)
        result_file.write(f"Fold {fold_number} Test Results XGBoost: {results}\n")

    # Calculate the average metric over all folds
    # (keys are taken from the first fold's result dict; output goes to the results file)
    average_metrics = {key: np.mean([result[key] for result in results_bert]) for key in results_bert[0]}
    print("Average metrics across all folds BERT:", average_metrics, file=result_file)
    average_metrics = {key: np.mean([result[key] for result in results_xgboost]) for key in results_xgboost[0]}
    print("Average metrics across all folds XGBoost:", average_metrics, file=result_file)