## Imports and Definitions ##

# In [181]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pickle
from lime import lime_tabular
import os

class CustomLabelEncoder:
    """Label encoder that tolerates labels that were not seen during ``fit``.

    Thin wrapper around ``sklearn.preprocessing.LabelEncoder``. When
    ``transform`` receives labels that are not among the known classes, they
    are appended to the known classes (and therefore get the next free
    integer codes) instead of raising an error.

    NOTE(review): appended labels are not kept in sorted order, whereas a
    plain ``LabelEncoder`` keeps ``classes_`` sorted. This appears to be fine
    for string labels; verify before relying on it for numeric labels.
    """

    def __init__(self):
        # Wrapped scikit-learn encoder that performs the actual encoding.
        self.encoder = LabelEncoder()
        # Known classes (position == integer code); None until ``fit`` is called.
        self.classes_ = None

    def fit(self, y):
        """Learn the classes present in ``y`` and return ``self``."""
        self.encoder.fit(y)
        self.classes_ = self.encoder.classes_
        return self

    def transform(self, y):
        """Encode ``y`` as integer codes, registering unseen labels on the fly.

        Unseen labels are appended in order of first appearance in ``y`` so
        that the codes they receive are reproducible from run to run.
        Iterating a ``set`` of strings (as done previously) yields an order
        that depends on string hashing and can differ between interpreter
        sessions.

        Must be called after ``fit``; the wrapped encoder has no ``classes_``
        before that.
        """
        known_labels = set(self.encoder.classes_)
        # dict.fromkeys de-duplicates while preserving first-appearance order.
        new_labels = [
            label for label in dict.fromkeys(y) if label not in known_labels
        ]
        if new_labels:
            for label in new_labels:
                self.classes_ = np.append(self.classes_, label)
            # Keep the wrapped encoder in sync so it can encode the new labels.
            self.encoder.classes_ = self.classes_
        return self.encoder.transform(y)

    def fit_transform(self, y):
        """Fit on ``y`` and return its integer encoding."""
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """Map integer codes back to the original labels."""
        return self.encoder.inverse_transform(y)

# --- Run configuration -------------------------------------------------------
classifier_type = 'product'  # name of the target column in the dataset
seed = 3  # random_state used for the train/test/validation splits
dataset_path = f'/scratch/data_processed_{classifier_type}_with_predictions.csv'

# Confusion pair to explain: instances predicted as `p_label` whose true
# label is `t_label`.
p_label = 'Sonstiges' # Predicted
t_label = 'Celonis' # True

# Load the previously trained model.
# NOTE: unpickling executes code stored in the file; only load trusted files.
with open('/scratch/xgboost.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## Load the Dataset, Encode Variables and Split the Data ##

# In [182]:
# Read the dataset, drop rows without a target value, and expose the target
# column under the generic name 'labels'.
df = (
    pd.read_csv(dataset_path)
    .dropna(subset=[classifier_type])
    .rename(columns={classifier_type: 'labels'})
)

# Keep only classes that occur at least 3 times
label_counts = df['labels'].value_counts()
classes_to_keep = label_counts[label_counts > 2].index.tolist()
df = df[df['labels'].isin(classes_to_keep)]

# Give every textual label an integer id, numbered in order of first appearance
unique_labels = df['labels'].unique()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = dict(enumerate(unique_labels))
df['labels'] = df['labels'].map(label_to_id)

# 80% train; the remaining 20% is halved into test and validation.
# Both splits are stratified by label and use the same seed.
train_df, remaining_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['labels'],
    random_state=seed,
)
test_df, validation_df = train_test_split(
    remaining_df,
    test_size=0.5,
    stratify=remaining_df['labels'],
    random_state=seed,
)

# Columns fed to the model, in the order the model receives them.
feature_columns = ['category', 'subcategory', 'prediction', 'score']

X_train = train_df[feature_columns].copy()
y_train = train_df['labels'].copy()

X_test = test_df[feature_columns].copy()
y_test = test_df['labels'].copy()

category_encoder = CustomLabelEncoder()
subcategory_encoder = CustomLabelEncoder()
prediction_encoder = CustomLabelEncoder()

# Fit label encoders on training data only and transform both training and test data.
#
# The encoded values are assigned as whole columns rather than through
# `X.loc[:, col] = ...`: since pandas 2.0 a `.loc[:, col]` assignment is
# written into the existing column in place, so an object (string) column
# keeps its object dtype even though it now holds integers. Whole-column
# assignment replaces the column and yields an integer dtype, which is what
# the model and `X_train.values` need downstream.
X_train['category'] = category_encoder.fit_transform(X_train['category'])
X_test['category'] = category_encoder.transform(X_test['category'])

X_train['subcategory'] = subcategory_encoder.fit_transform(X_train['subcategory'])
X_test['subcategory'] = subcategory_encoder.transform(X_test['subcategory'])

X_train['prediction'] = prediction_encoder.fit_transform(X_train['prediction'])
X_test['prediction'] = prediction_encoder.transform(X_test['prediction'])

## Get Misclassified Instances ##

# In [183]:
# Collect the test instances that the model predicted as `p_label` although
# their true label is `t_label`. Each entry is (row index label, ticket id).
predictions = model.predict(X_test)

# Resolve both class ids once, before scanning the predictions. This removes
# the repeated dictionary lookups from the loop and raises a KeyError
# immediately if either label is not a known class, instead of only when a
# matching prediction happens to be encountered.
predicted_id = label_to_id[p_label]
true_id = label_to_id[t_label]

misclassified = [
    (row_index, df.loc[row_index, 'id'])
    for row_index, predicted, actual in zip(y_test.index, predictions, y_test)
    if predicted == predicted_id and actual == true_id
]

## Create LIME Explanations ##

# In [184]:
# Labels may contain '/', which would otherwise be read as a path separator.
p_label_sanitized = p_label.replace('/', '_')
t_label_sanitized = t_label.replace('/', '_')
folder_path = f'/scratch/LIME_XGBoost/P_{p_label_sanitized}_T_{t_label_sanitized}/' # Save explanations to path

# exist_ok=True creates the folder if needed without a separate existence
# check, which avoids failing if the folder appears between check and creation.
os.makedirs(folder_path, exist_ok=True)
    
# Features LIME should treat as categorical, given by column position.
categorical_feature_names = ['prediction', 'category', 'subcategory']
categorical_features_indices = list(
    map(X_train.columns.get_loc, categorical_feature_names)
)

# For every categorical column: integer code -> original label, taken from the
# encoder that produced the codes.
encoder_by_feature = {
    'prediction': prediction_encoder,
    'category': category_encoder,
    'subcategory': subcategory_encoder,
}
categorical_names = {
    X_train.columns.get_loc(feature): dict(enumerate(encoder.classes_))
    for feature, encoder in encoder_by_feature.items()
}

# Tabular LIME explainer built from the encoded training data.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    mode='classification',
    feature_names=X_train.columns.tolist(),
    class_names=unique_labels,
    categorical_features=categorical_features_indices,
    categorical_names=categorical_names,
)

# Explain every collected instance and write one HTML report per ticket.
for index, ticket_id in misclassified:
    data_row = X_test.loc[index].values
    instance = data_row.reshape(1, -1)  # single-row 2D input for the model

    # Explain the class the model actually predicts for this instance.
    predicted_class = model.predict(instance)[0]
    exp = explainer.explain_instance(
        data_row=data_row,
        predict_fn=model.predict_proba,
        num_features=10,
        labels=[predicted_class],
    )

    file_path = f'{folder_path}explanation_{ticket_id}.html'
    exp.save_to_file(file_path)