In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ==========================================================
# KAGGLE ML EXAM – MULTI-CLASS CLASSIFICATION TEMPLATE
# Random Forest (End-to-End)
# ==========================================================

# =====================
# 1. IMPORT LIBRARIES
# =====================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

import warnings
warnings.filterwarnings('ignore')

# =====================
# 2. LOAD DATA
# =====================
# Name of the label column in the training file.
TARGET_COL = 'target'   # CHANGE IF REQUIRED

# Read both CSV files (train first, then test) from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# =====================
# 3. EDA & VISUALIZATION (BEFORE CLEANING)
# =====================
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_df.info()
print(train_df.describe())

# Target distribution (multi-class)
plt.figure(figsize=(6,4))
sns.countplot(x=train_df[TARGET_COL])
plt.title('Target Distribution (Before Cleaning)')
plt.show()

# Numeric feature columns (target excluded). Captured here, before the
# categorical columns are encoded, and reused by the imputation, clipping and
# plotting steps further down.
num_cols = train_df.drop(TARGET_COL, axis=1).select_dtypes(include=np.number).columns

# One boxplot per numeric feature to inspect spread and outliers.
for col in num_cols:
    plt.figure(figsize=(4,2))
    sns.boxplot(x=train_df[col])
    plt.title(f'Boxplot of {col} (Before Cleaning)')
    plt.show()

# =====================
# 4. HANDLE CATEGORICAL FEATURES
# =====================
# Fitted encoder for every categorical feature, keyed by column name.
label_encoders = {}

# Code assigned to test categories that never occur in the training data.
UNSEEN_CATEGORY_CODE = -1

for col in train_df.select_dtypes(include='object').columns:
    if col == TARGET_COL:
        continue  # the target is encoded separately below

    le = LabelEncoder()
    # astype(str) stringifies missing values too, so they are encoded as an
    # ordinary category instead of breaking the encoder.
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # LabelEncoder.transform raises ValueError on labels it was not fitted on.
    # Map through the learned classes instead: known categories get exactly
    # the code transform() would give, unseen ones get the sentinel.
    code_by_class = {cls: code for code, cls in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col].astype(str)
        .map(code_by_class)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype(int)
    )
    label_encoders[col] = le

# Encode the target as integer class ids. target_encoder holds the mapping
# back to the original labels (inverse_transform).
target_encoder = LabelEncoder()
train_df[TARGET_COL] = target_encoder.fit_transform(train_df[TARGET_COL])

# =====================
# 5. HANDLE MISSING VALUES
# =====================
# Learn the per-column medians from the training data only, then fill the
# numeric columns of both frames with those same medians.
imputer = SimpleImputer(strategy='median').fit(train_df[num_cols])
for frame in (train_df, test_df):
    frame[num_cols] = imputer.transform(frame[num_cols])

# =====================
# 6. CLIP OUTLIERS (IQR METHOD) - values are capped at the fences, no rows are dropped
# =====================
def iqr_clip(df, columns):
    """Clip each listed column to the range [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Values outside that range are pulled to the nearest bound; no rows are
    dropped. Columns whose interquartile range is zero are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clip. It is not modified.
    columns : iterable of str
        Names of the numeric columns to clip.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns clipped.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    clipped = df.copy()
    for col in columns:
        q1, q3 = clipped[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        if iqr == 0:
            # Q1 == Q3 makes both bounds equal, which would overwrite the whole
            # column with a single value (e.g. a mostly-zero indicator feature).
            continue
        clipped[col] = clipped[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return clipped

# Clip the numeric training columns; the test frame is not clipped.
train_df = iqr_clip(train_df, num_cols)

# =====================
# 7. VISUALIZATION (AFTER CLEANING)
# =====================
# Class counts again, for comparison with the pre-cleaning plot.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train_df[TARGET_COL], ax=ax)
ax.set_title('Target Distribution (After Cleaning)')
plt.show()

# One boxplot per numeric feature, now showing the clipped values.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(4, 2))
    sns.boxplot(x=train_df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature} (After Cleaning)')
    plt.show()

# =====================
# 8. TRAIN–VALIDATION SPLIT
# =====================
# Separate features from the label. The prediction step does not feed the
# row-identifier column ('id') to the model, so it must not be a training
# feature either; otherwise the fitted model expects a column the test matrix
# will not contain.
non_feature_cols = [TARGET_COL]
if 'id' in train_df.columns:
    non_feature_cols.append('id')

X = train_df.drop(columns=non_feature_cols)
y = train_df[TARGET_COL]

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =====================
# 9. SCALING
# =====================
# Not required for Random Forest (tree-based model)

# =====================
# 10. HYPERPARAMETER TUNING (GRIDSEARCH)
# =====================
# Candidate settings: 2 x 3 x 2 = 12 combinations.
param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[None, 15, 25],
    min_samples_split=[2, 5],
)

# Base forest with a fixed seed; n_jobs=-1 requests all available cores.
rf_base = RandomForestClassifier(n_jobs=-1, random_state=42)

# Search over every combination, scored by accuracy with 3-fold CV.
grid = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the estimator the search selected as best and report its settings.
best_model = grid.best_estimator_
print('Best Parameters:', grid.best_params_)

# =====================
# 11. MODEL EVALUATION
# =====================
# Predict the hold-out split once and reuse the predictions for every metric.
y_val_pred = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
print('Validation Accuracy:', val_accuracy)
for metric_fn in (confusion_matrix, classification_report):
    print(metric_fn(y_val, y_val_pred))

# =====================
# 12. SAVE MODEL
# =====================
# Persist the tuned model to the current working directory.
joblib.dump(best_model, 'rf_multiclass_model.pkl')

# =====================
# 13. TEST PREDICTION & SUBMISSION
# =====================
# Handle ID column safely (if present)
# Keep the identifiers for the submission file; fall back to the row index
# when the test file has no 'id' column.
test_ids = test_df['id'] if 'id' in test_df.columns else test_df.index

# Build the test matrix from exactly the columns (and column order) the model
# was trained on. A separate variable leaves test_df untouched, so this cell
# can be re-run without losing the 'id' column.
X_test = test_df[X.columns]

# The model predicts encoded class ids; map them back to the original labels
# so the submission contains the same label values as the training file.
test_preds = target_encoder.inverse_transform(best_model.predict(X_test))

submission = pd.DataFrame({
    'id': test_ids,
    TARGET_COL: test_preds
})

submission.to_csv('submission_multiclass.csv', index=False)
print('submission_multiclass.csv created')

# ==========================================================
# END – MULTI-CLASSIFICATION (EXAM READY)
# ==========================================================