# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

##Import the Libraries

In [1]:
%pip install xgboost



In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [11]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
# Setup rendom seed
np.random.seed(42)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Load data
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
samp_sub = pd.read_csv('/content/drive/MyDrive/SampleSubmission.csv')

# Combine data for feature engineering
data = pd.concat([train, test], sort=False)

## Enhanced Feature Engineering
# Extract components from Item_Store_ID with more granularity
data[['Item_Code', 'Store_Code']] = data['Item_Store_ID'].str.split('_', expand=True)
data['Store_Cluster'] = data['Store_Code'].str.extract(r'([A-Za-z]+)')
data['Store_Number'] = data['Store_Code'].str.extract(r'(\d+)').astype(float)

# More sophisticated item features
data['Item_Prefix'] = data['Item_Code'].str.extract(r'^([A-Z]+)')
data['Item_Middle'] = data['Item_Code'].str.extract(r'[A-Z]+([0-9]+)')[0]
data['Item_Suffix'] = data['Item_Code'].str.extract(r'[0-9]+([A-Z]+)')
data['Item_Numeric'] = data['Item_Code'].str.extract(r'(\d+)').astype(float)

# Fill missing values for new features
data['Item_Middle'] = data['Item_Middle'].fillna(0).astype(int)
data['Item_Suffix'] = data['Item_Suffix'].fillna('None')

# Identify categorical and numerical features
categorical_cols = ['Item_Code', 'Store_Code', 'Store_Cluster', 'Item_Prefix', 'Item_Suffix']
numerical_cols = ['Store_Number', 'Item_Middle', 'Item_Numeric']

# Convert data types
for col in categorical_cols:
    data[col] = data[col].astype('category')
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

# Now separate features and target
X = data.drop(["Item_Store_Returns", "Item_Store_ID"], axis=1)
y = data["Item_Store_Returns"]

# Split back into train and test
X_train = X[~y.isnull()]
y_train = y[~y.isnull()]
X_test = X[y.isnull()]

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Get indices of categorical features for CatBoost
cat_features_indices = [X_train.columns.get_loc(col) for col in categorical_cols]

# For non-CatBoost models, we'll use label encoding instead of one-hot to avoid dimension explosion
from sklearn.preprocessing import LabelEncoder

X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()
X_test_encoded = X_test.copy()

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on train and transform all sets
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col].astype(str))
    X_val_encoded[col] = le.transform(X_val_encoded[col].astype(str))
    X_test_encoded[col] = le.transform(X_test_encoded[col].astype(str))

# Reinitialize CatBoost with proper categorical features
catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric='RMSE',
    random_seed=42,
    cat_features=cat_features_indices,
    verbose=100
)

# XGBoost Regressor
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
    eval_metric='rmse'
)

# Random Forest Regressor
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

# Gradient Boosting Regressor
gb = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# Stacked model using Voting Regressor with XGBoost
stacked_model = VotingRegressor(
    estimators=[
        ('catboost', catboost),
        ('xgb', xgb),
        ('random_forest', rf),
        ('gradient_boost', gb)
    ],
    weights=[0.4, 0.3, 0.15, 0.15]
)

# Fit models with proper data
print("Training CatBoost...")
catboost.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

print("\nTraining XGBoost...")
xgb.fit(X_train_encoded, y_train, eval_set=[(X_val_encoded, y_val)], verbose=100)

print("\nTraining Random Forest...")
rf.fit(X_train_encoded, y_train)

print("\nTraining Gradient Boosting...")
gb.fit(X_train_encoded, y_train)

print("\nTraining Stacked Model...")
# For stacked model, we need to use the encoded version for non-CatBoost models
# This requires a bit more work - we'll need to create separate pipelines

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

# Create individual model pipelines
catboost_pipe = Pipeline([
    ('selector', ColumnSelector(X_train.columns.tolist())),
    ('model', catboost)
])

xgb_pipe = Pipeline([
    ('encoder', LabelEncoderWrapper(categorical_cols)),  # Custom encoder
    ('model', xgb)
])

# Similarly for other models...

# Alternatively, we can train the stacked model using the encoded data
# and handle CatBoost separately in production

# For simplicity, let's proceed with the encoded data for stacking
# Note: This means CatBoost won't get its categorical features in the stack
# For a proper implementation, we'd need a more sophisticated approach

stacked_model.fit(X_train_encoded, y_train)

# Evaluation
print("\nModel Performance Comparison:")
print("CatBoost:")
cb_rmse = evaluate_model(catboost, X_val, y_val)

print("\nXGBoost:")
xgb_rmse = evaluate_model(xgb, X_val_encoded, y_val)

print("\nRandom Forest:")
rf_rmse = evaluate_model(rf, X_val_encoded, y_val)

print("\nGradient Boosting:")
gb_rmse = evaluate_model(gb, X_val_encoded, y_val)

print("\nStacked Model:")
stacked_rmse = evaluate_model(stacked_model, X_val_encoded, y_val)

# For final predictions, we need to use the appropriate data format for each model
# This would be more complex in a real deployment

# Generate predictions using CatBoost (best model in our case)
test_preds = catboost.predict(X_test)

# Prepare submission
submission = samp_sub.copy()
submission['Item_Store_Returns'] = test_preds

# Save submission
submission.to_csv('final_submission.csv', index=False)
print("\nFinal submission file saved as 'final_submission.csv'")

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.