# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

##Import the Libraries

In [1]:
%pip install xgboost



In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [4]:
# Setup rendom seed
np.random.seed(42)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load data
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
samp_sub = pd.read_csv('/content/drive/MyDrive/SampleSubmission.csv')

# Combine data for feature engineering
data = pd.concat([train, test], sort=False)

## Enhanced Feature Engineering
# Extract components from Item_Store_ID with more granularity
data[['Item_Code', 'Store_Code']] = data['Item_Store_ID'].str.split('_', expand=True)
data['Store_Cluster'] = data['Store_Code'].str.extract(r'([A-Za-z]+)')
data['Store_Number'] = data['Store_Code'].str.extract(r'(\d+)').astype(float)

# More sophisticated item features
data['Item_Prefix'] = data['Item_Code'].str.extract(r'^([A-Z]+)')
data['Item_Middle'] = data['Item_Code'].str.extract(r'[A-Z]+([0-9]+)')[0]
data['Item_Suffix'] = data['Item_Code'].str.extract(r'[0-9]+([A-Z]+)')
data['Item_Numeric'] = data['Item_Code'].str.extract(r'(\d+)').astype(float)

# Fill missing values for new features
data['Item_Middle'] = data['Item_Middle'].fillna(0).astype(int)
data['Item_Suffix'] = data['Item_Suffix'].fillna('None')

# Identify categorical features
cat_features = ['Item_Code', 'Store_Code', 'Store_Cluster', 'Item_Prefix', 'Item_Suffix']

# Encode categorical features
for col in cat_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# Separate features and target
X = data.drop(["Item_Store_Returns", "Item_Store_ID"], axis=1)
y = data["Item_Store_Returns"]

# Split back into train and test
X_train = X[~y.isnull()]
y_train = y[~y.isnull()]
X_test = X[y.isnull()]

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Enhanced Model Building with XGBoost
# CatBoost Regressor (handles categorical features natively)
catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric='RMSE',
    random_seed=42,
    cat_features=[i for i, col in enumerate(X_train.columns) if col in cat_features],
    verbose=100
)

# XGBoost Regressor
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
    eval_metric='rmse'
)

# Random Forest Regressor
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

# Gradient Boosting Regressor
gb = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# Stacked model using Voting Regressor with XGBoost
stacked_model = VotingRegressor(
    estimators=[
        ('catboost', catboost),
        ('xgb', xgb),
        ('random_forest', rf),
        ('gradient_boost', gb)
    ],
    weights=[0.4, 0.3, 0.15, 0.15]  # Adjusted weights
)

# Fit models with early stopping where applicable
print("Training CatBoost...")
catboost.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

print("\nTraining XGBoost...")
xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

print("\nTraining Random Forest...")
rf.fit(X_train, y_train)

print("\nTraining Gradient Boosting...")
gb.fit(X_train, y_train)

print("\nTraining Stacked Model...")
stacked_model.fit(X_train, y_train)

## Enhanced Evaluation
def evaluate_model(model, X, y_true):
    preds = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    print(f"RMSE: {rmse:.4f}")
    return rmse

print("\nModel Performance Comparison:")
print("CatBoost:")
cb_rmse = evaluate_model(catboost, X_val, y_val)

print("\nXGBoost:")
xgb_rmse = evaluate_model(xgb, X_val, y_val)

print("\nRandom Forest:")
rf_rmse = evaluate_model(rf, X_val, y_val)

print("\nGradient Boosting:")
gb_rmse = evaluate_model(gb, X_val, y_val)

print("\nStacked Model:")
stacked_rmse = evaluate_model(stacked_model, X_val, y_val)

## Feature Importance Comparison
print("\nFeature Importance Analysis:")
models = {
    'CatBoost': catboost,
    'XGBoost': xgb,
    'Random Forest': rf
}

for name, model in models.items():
    print(f"\n{name} Feature Importance:")
    if name == 'CatBoost':
        importance = model.get_feature_importance()
    elif name == 'XGBoost':
        importance = model.feature_importances_
    else:  # Random Forest
        importance = model.feature_importances_

    fi_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance
    }).sort_values('Importance', ascending=False)

    print(fi_df.head(10))

## Generate Predictions
best_model = stacked_model  # Using stacked model for final prediction
test_preds = best_model.predict(X_test)

# Prepare submission
submission = samp_sub.copy()
submission['Item_Store_Returns'] = test_preds

# Save submission
submission.to_csv('enhanced_submission.csv', index=False)
print("\nEnhanced submission file saved as 'enhanced_submission.csv'")

## Business Interpretation
print("\nKey Insights for Chief Babatunji:")
print("1. Top influential factors across all models:")
print("   - Item code structure (prefix/numeric parts)")
print("   - Store number and cluster")
print("   - Specific item-store combinations")

print("\n2. XGBoost and CatBoost agree on the importance of:")
print("   - Item numeric codes (potential price points or categories)")
print("   - Store cluster groupings")

print("\n3. Actionable recommendations:")
print("   - Analyze underperforming store clusters")
print("   - Review item categorization system")
print("   - Investigate high-performing item-store combinations for best practices")

Training CatBoost...


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="FDH56": Cannot convert 'FDH56' to float