# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

## Import the Libraries

In [1]:
%pip install xgboost



In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [16]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
# Set up the random seed (seeds NumPy's global random number generator)
np.random.seed(42)

In [6]:
# Mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Load the competition files from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
samp_sub = pd.read_csv('/content/drive/MyDrive/SampleSubmission.csv')

# Stack train and test so feature engineering is applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# each half keeps its own labels, and duplicated index labels make any later
# label-aligned operation (assignment, join, .loc lookup) ambiguous.
data = pd.concat([train, test], sort=False, ignore_index=True)

## Enhanced Feature Engineering
# Item_Store_ID is split on '_' into an item part and a store part.
id_parts = data['Item_Store_ID'].str.split('_', expand=True)
data[['Item_Code', 'Store_Code']] = id_parts

store_code = data['Store_Code']
item_code = data['Item_Code']

# Store code: first alphabetic run -> cluster label, first digit run -> number.
data['Store_Cluster'] = store_code.str.extract(r'([A-Za-z]+)', expand=False)
data['Store_Number'] = store_code.str.extract(r'(\d+)', expand=False).astype(float)

# Item code: leading capitals, the digits that follow capitals, the capitals
# that follow digits, and the first digit run as a float.
# Missing matches become 0 for Item_Middle and the string 'None' for Item_Suffix.
data['Item_Prefix'] = item_code.str.extract(r'^([A-Z]+)', expand=False)
data['Item_Middle'] = (
    item_code.str.extract(r'[A-Z]+([0-9]+)', expand=False).fillna(0).astype(int)
)
data['Item_Suffix'] = item_code.str.extract(r'[0-9]+([A-Z]+)', expand=False).fillna('None')
data['Item_Numeric'] = item_code.str.extract(r'(\d+)', expand=False).astype(float)

# Column roles consumed by the preprocessing step.
categorical_cols = [
    'Item_Code', 'Store_Code', 'Store_Cluster', 'Item_Prefix', 'Item_Suffix',
    'Item_Sugar_Content', 'Item_Type', 'Store_Size', 'Store_Location_Type',
    'Store_Type',
]
numerical_cols = [
    'Item_Weight', 'Item_Visibility', 'Item_Price', 'Store_Start_Year',
    'Store_Number', 'Item_Middle', 'Item_Numeric',
]

# Cast the raw (non-engineered) categorical columns to pandas 'category' dtype.
raw_categoricals = (
    'Item_Sugar_Content', 'Item_Type', 'Store_Size',
    'Store_Location_Type', 'Store_Type',
)
for name in raw_categoricals:
    data[name] = data[name].astype('category')

# Target, and features = everything except the target and the composite ID.
y = data["Item_Store_Returns"]
X = data.drop(columns=["Item_Store_Returns", "Item_Store_ID"])

# Rows that carry a target are training rows; rows without one are the test set.
labelled = y.notnull()
X_test = X[~labelled].reset_index(drop=True)
X_train = X[labelled].reset_index(drop=True)
y_train = y[labelled].reset_index(drop=True)

# Hold out 20% of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns and pass the numeric columns through.
#
# sparse_threshold=0 forces a dense ndarray. With the default (0.3) a
#   high-cardinality one-hot encoding is returned as a SciPy sparse matrix,
#   which pd.DataFrame(..., columns=feature_names) below cannot consume.
# remainder='drop' keeps the output columns in exact agreement with
#   feature_names. With 'passthrough', any column not listed in numerical_cols
#   or categorical_cols would be appended unencoded and the DataFrame
#   construction would fail on a column-count mismatch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='drop',
    sparse_threshold=0,
)

# Fit the encoder on the training split only, then reuse it for the
# validation and test splits (unknown categories encode as all zeros).
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Output order is the numeric block followed by the one-hot block.
feature_names = numerical_cols + list(
    preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
)

# Convert back to DataFrames (optional, but helpful for inspection).
X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
X_val_processed = pd.DataFrame(X_val_processed, columns=feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)


# CatBoost is fed the same one-hot encoded matrix as the other models, so no
# categorical feature indices are passed to it.

# CatBoost regressor configuration
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 8,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': 100,
    'early_stopping_rounds': 50,
}
catboost = CatBoostRegressor(**catboost_params)

# XGBoost regressor configuration
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    'early_stopping_rounds': 50,
    'eval_metric': 'rmse',
}
xgb = XGBRegressor(**xgb_params)

# Random forest regressor configuration
rf_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**rf_params)

# Gradient boosting regressor configuration
gb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_samples_split': 5,
    'random_state': 42,
}
gb = GradientBoostingRegressor(**gb_params)

# Weighted-average ensemble of the four regressors. VotingRegressor averages
# the members' predictions using `weights`; it does not train a meta-learner.
#
# VotingRegressor.fit() refits clones of its estimators with only (X, y), so
# no eval_set ever reaches them. XGBoost refuses to fit when
# early_stopping_rounds is set but no validation set is supplied, so the
# ensemble gets its own copy of the XGBoost configuration with early stopping
# switched off (it trains the full n_estimators). The standalone `xgb` model
# is left untouched.
from sklearn.base import clone

xgb_for_ensemble = clone(xgb)
xgb_for_ensemble.set_params(early_stopping_rounds=None)

stacked_model = VotingRegressor(
    estimators=[
        ('catboost', catboost),
        ('xgb', xgb_for_ensemble),
        ('random_forest', rf),
        ('gradient_boost', gb),
    ],
    weights=[0.4, 0.3, 0.15, 0.15],
)

# Validation helper
def evaluate_model(model, X_val, y_val):
    """Print and return the RMSE of ``model`` on a validation set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: True target values for ``X_val``.

    Returns:
        The root of the mean squared error between ``y_val`` and the
        model's predictions.
    """
    mse = mean_squared_error(y_val, model.predict(X_val))
    rmse = np.sqrt(mse)
    print(f"RMSE: {rmse}")
    return rmse

# Fit each standalone model on the encoded training split. CatBoost and
# XGBoost also receive the validation split as an eval_set; the two
# scikit-learn models take no extra fit arguments.
training_plan = [
    ("Training CatBoost...", catboost,
     {'eval_set': (X_val_processed, y_val)}),
    ("\nTraining XGBoost...", xgb,
     {'eval_set': [(X_val_processed, y_val)], 'verbose': 100}),
    ("\nTraining Random Forest...", rf, {}),
    ("\nTraining Gradient Boosting...", gb, {}),
]
for banner, estimator, fit_kwargs in training_plan:
    print(banner)
    estimator.fit(X_train_processed, y_train, **fit_kwargs)

print("\nTraining Stacked Model...")

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Every ensemble member consumes the same one-hot encoded matrix, so the
# ColumnSelector transformer sketched in an earlier revision is not needed
# and the ensemble is fitted directly on the encoded training data.
stacked_model.fit(X_train_processed, y_train)

# Score every fitted model on the held-out validation split.
print("\nModel Performance Comparison:")

scoreboard = (
    ("CatBoost:", catboost),
    ("\nXGBoost:", xgb),
    ("\nRandom Forest:", rf),
    ("\nGradient Boosting:", gb),
    ("\nStacked Model:", stacked_model),
)
rmse_values = []
for heading, fitted in scoreboard:
    print(heading)
    rmse_values.append(evaluate_model(fitted, X_val_processed, y_val))

# Expose the individual scores under one name per model.
cb_rmse, xgb_rmse, rf_rmse, gb_rmse, stacked_rmse = rmse_values

# Predict the test set with CatBoost (best model in our case).
test_preds = catboost.predict(X_test_processed)

# Start from the sample submission layout and fill in the prediction column.
submission = samp_sub.assign(Item_Store_Returns=test_preds)

# Write the submission file without the index column.
output_path = 'final_submission.csv'
submission.to_csv(output_path, index=False)
print("\nFinal submission file saved as 'final_submission.csv'")

ValueError: y contains previously unseen labels: 'NCH30'