## Imports and Definitions ##

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pickle

class CustomLabelEncoder:
    """Label encoder that accepts labels not seen during ``fit``.

    Wraps a ``sklearn.preprocessing.LabelEncoder``. When ``transform`` meets
    labels that are missing from the fitted classes, it appends them to
    ``classes_`` (after the existing entries, so previously assigned codes
    keep their positions) and then delegates to the wrapped encoder.
    """

    def __init__(self):
        self.encoder = LabelEncoder()
        # Mirror of ``self.encoder.classes_``; stays ``None`` until ``fit``.
        self.classes_ = None

    def fit(self, y):
        """Fit the wrapped encoder on ``y`` and return ``self``."""
        self.encoder.fit(y)
        self.classes_ = self.encoder.classes_
        return self

    def transform(self, y):
        """Encode ``y`` as integer codes, registering unseen labels first.

        Args:
            y: Iterable of labels to encode.

        Returns:
            The result of the wrapped encoder's ``transform`` on ``y``.

        Note:
            This mutates the encoder: unseen labels are permanently added to
            ``classes_``.
        """
        new_labels = set(y) - set(self.encoder.classes_)
        if new_labels:
            # Set iteration order for strings follows hash order, which is
            # randomised per interpreter run. Sorting makes the codes given to
            # unseen labels reproducible. The key avoids TypeError when the
            # labels mix types (e.g. strings and NaN).
            ordered_labels = sorted(
                new_labels,
                key=lambda label: (type(label).__name__, str(label)),
            )
            for label in ordered_labels:
                self.classes_ = np.append(self.classes_, label)
            # NOTE(review): the appended classes are not kept in sorted order;
            # confirm the wrapped encoder does not rely on sorted ``classes_``
            # for the label dtype in use (e.g. purely numeric labels).
            self.encoder.classes_ = self.classes_
        return self.encoder.transform(y)

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoded form."""
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """Map integer codes in ``y`` back to labels via the wrapped encoder."""
        return self.encoder.inverse_transform(y)

# Seed passed to the data splits and the classifier.
seed = 3

# Name of the target column in the dataset; also part of the dataset file name.
classifier_type = 'product'

# Note: Use a dataset with results from BERT (prediction and score)
dataset_path = f'/scratch/data_processed_{classifier_type}_with_predictions.csv'

## Load the Dataset, Encode Variables and Split the Data ##

In [None]:
df = pd.read_csv(dataset_path)
# Drop rows without a target value, then expose the target as 'labels'.
df = df.dropna(subset=[classifier_type])
df = df.rename(columns={classifier_type: 'labels'})

# Filter out classes with counts less than 3
# NOTE(review): the second stratified split below runs on only 20% of the
# data; a class with 3 rows may leave a single row there, which
# train_test_split(stratify=...) is expected to reject. Confirm the threshold
# is high enough for the dataset in use.
label_counts = df['labels'].value_counts()
classes_to_keep = label_counts[label_counts > 2].index.tolist()
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered view (avoids chained-assignment warnings).
df = df[df['labels'].isin(classes_to_keep)].copy()

# Convert textual labels to unique integers
# Ids follow order of first appearance in the file, so the mapping depends on
# row order of the dataset.
unique_labels = df['labels'].unique()
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
df['labels'] = df['labels'].map(label_to_id)

# 80% train, then the remaining 20% is halved into test and validation.
train_df, remaining_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=seed)
test_df, validation_df = train_test_split(remaining_df, test_size=0.5, stratify=remaining_df['labels'], random_state=seed)

feature_columns = ['category', 'subcategory', 'prediction', 'score']

X_train = train_df[feature_columns].copy()
y_train = train_df['labels'].copy()

X_test = test_df[feature_columns].copy()
y_test = test_df['labels'].copy()

category_encoder = CustomLabelEncoder()
subcategory_encoder = CustomLabelEncoder()
prediction_encoder = CustomLabelEncoder()

# Fit label encoders on training data only and transform both training and test data
# Plain column assignment is used instead of `.loc[:, col] = ...`: on pandas
# 2.x the `.loc` form writes into the existing column in place, which keeps
# dtype `object` for a text column even after integer codes are stored, and
# XGBoost does not accept object-typed columns.
encoders_by_column = {
    'category': category_encoder,
    'subcategory': subcategory_encoder,
    'prediction': prediction_encoder,
}
for column, encoder in encoders_by_column.items():
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

## Train the XGBoost Classifier ##

In [None]:
# Constructor arguments for the classifier, collected in one place.
xgb_params = {
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
    'random_state': seed,
}

# Build the classifier and fit it on the encoded training features.
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

## Load Classifier ##

In [None]:
# Rebinds `model` to a classifier restored from disk.
# NOTE: unpickling can execute arbitrary code; only load files from a
# trusted source.
with open('/scratch/xgboost.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## Create Confusion Matrix ##

In [None]:
predictions = model.predict(X_test)

class_labels = unique_labels
# Label ids are 0..n-1 in the order of `unique_labels`. Passing them
# explicitly keeps the matrix at n x n and aligned with the tick labels even
# when a class is absent from both `y_test` and `predictions`.
# NOTE(review): predictions outside this id range (e.g. from a model trained
# on a different label set) would not be counted — confirm the loaded model
# uses the same label mapping.
class_ids = np.arange(len(class_labels))

cm = confusion_matrix(y_test, predictions, labels=class_ids)

# Row-normalise (share of each true class per predicted class). Rows without
# any true samples stay at 0 instead of producing NaN from a 0/0 division.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

fig, ax = plt.subplots(figsize=(10, 10))
cax = ax.matshow(cm_normalized, cmap=plt.cm.Blues)

cbar = fig.colorbar(cax, ax=ax, shrink=0.8)

ax.set_xticks(class_ids)
ax.set_yticks(class_ids)
ax.set_xticklabels(class_labels, rotation=90)
ax.set_yticklabels(class_labels)

ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')

# matshow puts the x tick labels on top by default; move them to the bottom.
ax.xaxis.set_tick_params(labeltop=False, labelbottom=True)
ax.tick_params(top=False, right=False)

# Annotate every cell with the normalised value and the raw count; switch to
# white text on dark cells so the annotation stays readable.
for (i, j), val in np.ndenumerate(cm):
    text_color = 'white' if cm_normalized[i, j] > 0.65 else 'black'
    ax.text(j, i, f'{cm_normalized[i, j]:.2f}\n{val}', ha='center', va='center', color=text_color)

plt.show()

In [None]:
# The triple-quoted string below is never assigned, so the code inside it is
# not executed. It holds two alternative hard-coded `unique_labels` lists.
# NOTE(review): presumably kept for labelling plots when `unique_labels`
# cannot be derived from the dataset — confirm, or remove if obsolete.
'''unique_labels = ['Sonstiges', 'SAP Business Warehouse', 'GBI',
       'IDES', 'Entwicklungssystem', 'HANA', 'ERPsim', 'TS410',
       'Business by Design', 'UCC Portal', 'Lumira', 'SAP4School',
       'GBS', 'Celonis']

unique_labels = ['SAP Business Warehouse', 'Global Bike',
       'Entwicklungssystem', 'HANA', 'ERPsim', 'TS410',
       'Business by Design', 'UCC Portal', 'SAP4School']'''

## Print Classification Report ##

In [None]:
# Element-wise lookup from integer class id back to the textual label.
decode_labels = np.vectorize(id_to_label.get)

y_test_original = decode_labels(y_test)
predictions_original = decode_labels(predictions)

# Per-class precision / recall / F1, reported with the original label names.
report = classification_report(y_test_original, predictions_original)
print(report)