# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

## Import the Libraries

In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

import torch

In [4]:
# Set up the random seed: seeds NumPy's global random number generator.
np.random.seed(42)

In [6]:
# Mount Google Drive at /content/drive; the CSV files read below live under its MyDrive folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Read the training set, the test set and the sample submission from the mounted Drive.
train, test, samp_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train.csv',
        '/content/drive/MyDrive/test.csv',
        '/content/drive/MyDrive/SampleSubmission.csv',
    )
)

# Feature Engineering - Simplified approach
def prepare_data(df):
    """Return a copy of ``df`` with ``Item_Store_ID`` split into two new columns.

    The ID is split on its first underscore: the text before it is stored in
    ``Item_Code`` and everything after it in ``Store_Code``. The input frame
    is not modified.

    Args:
        df: DataFrame with a string column named ``Item_Store_ID``.

    Returns:
        A new DataFrame carrying the extra ``Item_Code`` and ``Store_Code``
        columns.

    Raises:
        KeyError: if ``df`` has no ``Item_Store_ID`` column.
    """
    df = df.copy()
    # n=1 stops after the first underscore so the split never yields more than
    # two parts; an unlimited split produces extra columns for IDs containing
    # further underscores, and the two-column assignment then fails.
    df[['Item_Code', 'Store_Code']] = df['Item_Store_ID'].str.split('_', n=1, expand=True)
    return df

# Prepare data: add the Item_Code / Store_Code columns derived from the ID
train = prepare_data(train)
test = prepare_data(test)

# Separate features and target
X_train = train.drop(['Item_Store_Returns', 'Item_Store_ID'], axis=1)
y_train = train['Item_Store_Returns']
# Select the test features by name, in the training column order. Categorical
# columns are handed to CatBoost by position (cat_features_indices below), so
# the test frame must have the same columns in the same order. A column that
# is missing from the test file fails here with a KeyError instead of being
# scored against the wrong feature later.
X_test = test[X_train.columns.tolist()].copy()

# Identify categorical features - ALL string columns
cat_features = list(X_train.select_dtypes(include=['object']).columns)

# Convert all features to appropriate types
for col in X_train.columns:
    if col in cat_features:
        # Categorical columns are passed to CatBoost as plain strings
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)
    else:
        # NOTE(review): missing or unparseable numeric values become 0 here,
        # which makes them indistinguishable from a genuine 0 -- confirm this
        # is the intended imputation.
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce').fillna(0)
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)

# Split for validation: hold out 20% of the training rows
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Get categorical feature indices (positions within X_train's columns)
cat_features_indices = [i for i, col in enumerate(X_train.columns) if col in cat_features]

# Wrap the training and validation splits in CatBoost Pools, marking the
# categorical columns by position.
train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)
val_pool = Pool(X_val, y_val, cat_features=cat_features_indices)

# Hyper-parameters for the CatBoost regressor, collected in one place.
model_params = dict(
    iterations=1000,
    learning_rate=0.075,
    depth=8,
    l2_leaf_reg=4,
    random_strength=0.01,
    border_count=128,
    eval_metric='RMSE',
    loss_function='RMSE',
    random_seed=42,
    cat_features=cat_features_indices,
    # SymmetricTree rather than Lossguide: per the original author's note,
    # this was changed so it works with Ordered boosting.
    grow_policy='SymmetricTree',
    bootstrap_type='Bayesian',
    # Ordered boosting is kept; the original author's note says it is for
    # better accuracy.
    boosting_type='Ordered',
    verbose=100,
    early_stopping_rounds=100,
    # Train on the GPU when PyTorch reports a CUDA device, otherwise on the CPU.
    task_type='GPU' if torch.cuda.is_available() else 'CPU',
    metric_period=50,
)
model = CatBoostRegressor(**model_params)

# Fit against the validation pool; use_best_model keeps the best-scoring
# iteration and plot=True shows the live training chart.
model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=True)

# Score the held-out validation split with root mean squared error.
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse:.4f}")

# List every feature with its importance score, highest score first.
feature_importances = model.get_feature_importance()
ranked_features = sorted(zip(feature_importances, X_train.columns), reverse=True)
for importance, feature_name in ranked_features:
    print(f"{feature_name}: {importance}")

# Predict the test set and write the predictions into the sample submission.
# NOTE(review): this relies on samp_sub rows being in the same order as the
# test rows -- confirm against the competition files.
test_preds = model.predict(X_test)
samp_sub['Item_Store_Returns'] = test_preds
submission_path = '/content/drive/MyDrive/cat_submission_9.csv'
samp_sub.to_csv(submission_path, index=False)
print("Submission file created!")

# Business insights
# "Within ±X units on average" describes the mean absolute error, not the
# RMSE (RMSE weights large misses more heavily and is never smaller than the
# mean absolute error). Compute the mean absolute validation error so the
# figure quoted in insight 4 matches what the sentence says.
mean_abs_error = np.mean(np.abs(np.asarray(y_val) - val_preds))
# Name of the feature with the largest importance score.
top_feature = X_train.columns[np.argmax(feature_importances)]

print("\nKey Insights for Chief Babatunji:")
print("1. Top factors affecting returns:")
print(f"   - {top_feature} is the most important factor")
# NOTE(review): insights 2 and 3 are fixed text, not computed from the model;
# check them against the feature importances printed above before sharing.
print("2. Store-specific patterns:")
print("   - Different stores show varying return patterns for the same products")
print("3. Product characteristics:")
print("   - Product type (prefix) significantly impacts returns")
print(f"4. Model accuracy: Predictions are within ±{mean_abs_error:.2f} units on average")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	learn: 4279.9105274	test: 4215.1824396	best: 4215.1824396 (0)	total: 73.7ms	remaining: 1m 13s
100:	learn: 2786.3330314	test: 2948.5619973	best: 2947.4069604 (99)	total: 8.72s	remaining: 1m 17s
200:	learn: 2727.9241819	test: 2949.4712635	best: 2946.8159050 (108)	total: 15.4s	remaining: 1m 1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2946.815905
bestIteration = 108

Shrink model to first 109 iterations.
Validation RMSE: 2946.8159
Item_Price: 33.528234686091146
Store_ID: 27.95411674535164
Store_Size: 21.624136238182217
Store_Type: 3.7734998296194107
Store_Code: 3.287687211485029
Item_Type: 2.409760137559672
Item_Sugar_Content: 2.2762964665936782
Item_Weight: 2.002148823732065
Item_Visibility: 1.8858291844763502
Store_Location_Type: 0.7638875678951593
Item_Code: 0.4227040818005052
Store_Start_Year: 0.07169902721309496
Item_ID: 0.0
Submission file created!

Key Insights for Chief Babatunji:
1. Top factors affecting returns:
   - Item_Price is the most important f