In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss, confusion_matrix, classification_report

# Train an XGBoost multiclass classifier on a pre-selected feature subset and
# report log loss, confusion matrix and per-class metrics on the test split.

# Load the list of selected feature names from the 'Feature' column.
top_400_features = pd.read_csv('../dataset_xgboost/train/top_400_features.csv')['Feature'].tolist()

# Load the train and test datasets
train_data = pd.read_csv('../dataset_xgboost/train/train_merged.csv')
test_data = pd.read_csv('../dataset_xgboost/test/test_merged.csv')

# Keep only 'ID', 'Class' and the selected features. Selecting by name raises
# a KeyError if either file is missing one of these columns.
selected_columns = ['ID', 'Class'] + top_400_features
train_data_filtered = train_data[selected_columns]
test_data_filtered = test_data[selected_columns]

# Separate features and target
X_train = train_data_filtered.drop(columns=['ID', 'Class'])
y_train = train_data_filtered['Class']

X_test = test_data_filtered.drop(columns=['ID', 'Class'])
y_test = test_data_filtered['Class']

# Re-map class labels to contiguous integers starting from 0. The mapping is
# derived from the training labels only.
unique_classes = sorted(y_train.unique())
class_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_classes)}
print(f"Class mapping: {class_mapping}")

# A test label that never occurs in training has no entry in the mapping;
# Series.map would silently turn it into NaN and the metrics below would fail
# with an unrelated-looking error. Fail early with a clear message instead.
unseen_classes = set(y_test.unique()) - set(class_mapping)
if unseen_classes:
    raise ValueError(
        f"Test set contains classes not present in the training set: {unseen_classes}"
    )

# Apply the class mapping to the target columns
y_train = y_train.map(class_mapping)
y_test = y_test.map(class_mapping)

# Explicit list of encoded labels (0 .. n_classes - 1). Passing it to every
# metric keeps them aligned with the columns of predict_proba even when the
# test split does not contain every class.
class_labels = list(range(len(unique_classes)))

# Train the XGBoost model
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# it is kept here so behaviour on older versions is unchanged.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

# Predict probabilities and classes for the test set
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

# Calculate the mlogloss. Without `labels`, log_loss infers the classes from
# y_test and raises if that count differs from the number of probability columns.
mlogloss = log_loss(y_test, y_pred_proba, labels=class_labels)
print(f"Multiclass Log Loss: {mlogloss}")

# Generate the confusion matrix with a fixed row/column order (one per class).
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Display a classification report for additional metrics. `labels` guarantees
# that `target_names` (the original class labels) line up one-to-one with the
# encoded classes.
class_report = classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[str(c) for c in unique_classes],
)
print("Classification Report:")
print(class_report)
