In [None]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import re

import warnings
warnings.filterwarnings("ignore")

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import IsolationForest

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e8/train.csv',
        '/kaggle/input/playground-series-s4e8/test.csv',
    )
)

In [None]:
# Remove the 'id' column from both frames so it is not passed along with the
# remaining columns.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

In [None]:
# Install AutoGluon
!pip install ray==2.10.0
!pip install autogluon.tabular
!pip install ipywidgets

In [None]:
# Install scikit-learn pinned to 1.2.2 through conda; `--yes` auto-confirms the
# prompt so the cell does not block waiting for input.
# NOTE(review): sklearn is already imported in the first cell, so presumably
# the kernel must be restarted for the pinned version to be the one in use --
# confirm.
!conda install scikit-learn==1.2.2 --yes

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
# from autogluon.features.generators import FillNaFeatureGenerator

In [None]:
# Name of the target column in df_train. Its values are handed to AutoGluon
# as-is; no manual numeric encoding is applied in this notebook.
label = 'class'

# Train model with AutoGluon: binary classification scored with the Matthews
# correlation coefficient ('mcc').
predictor = TabularPredictor(
    label=label,
    eval_metric='mcc',
    problem_type='binary'
).fit(
    df_train,
    presets='best_quality',
    time_limit=3600*10,  # training budget in seconds (10 hours)
    num_bag_folds=5,
    verbosity=3,
    excluded_model_types=['KNN'],
    num_cpus='auto',
    num_gpus='auto',
    num_stack_levels=1,  # enables the stacking of models
)

# Notes on num_stack_levels:
#   - Number of stacking levels in the stack ensemble.
#   - Increases training time by a factor of roughly num_stack_levels + 1.
#   - 0 disables stacking; 1-3 levels are recommended for better predictive
#     performance.
#   - num_bag_folds must be >= 2 (it guards against overfitting); otherwise a
#     ValueError is raised.

In [None]:
# Print fit summary.
# fit_summary() prints its own human-readable report; the returned dict is kept
# in `results` for programmatic access instead of being dumped a second time
# as raw text.
results = predictor.fit_summary()

In [None]:
# Show the predictor's model leaderboard; as the last expression in the cell,
# the returned object is rendered as the cell's output.
predictor.leaderboard()

In [None]:
# Feature importance
# The training frame is deliberately NOT passed as `data`: AutoGluon's
# documentation warns that importances computed on the training data are
# biased by overfitting. With no `data` argument the predictor falls back to
# the data it cached during fit().
importances = predictor.feature_importance()
print("Top 20 feature importances:")
print(importances.head(20))

In [None]:
# Bar chart of feature importances: one bar per feature (y-axis, taken from
# the index of `importances`), bar length given by the 'importance' column.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(
    x=importances['importance'],
    y=importances.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

In [None]:
# Predict on test data: run the fitted predictor on df_test and keep the
# result in y_pred for the submission cell.
y_pred = predictor.predict(df_test)

In [None]:
# Reload the test file to recover the 'id' column that was dropped earlier.
# The file is read from the same Kaggle input directory the data was
# originally loaded from; a bare relative 'test.csv' does not exist in the
# working directory and would raise FileNotFoundError.
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

# Guard against pairing ids with predictions made for a different set of rows.
assert len(df_test) == len(y_pred), "test ids and predictions differ in length"

# Create a submission DataFrame
submission = pd.DataFrame({
    'id': df_test['id'],
    'class': y_pred
})

# Save the predictions to a CSV file
submission.to_csv('submission.csv', index=False)

# Display the first few rows of the predictions
print(submission.head(10))