# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

## Import the Libraries

In [1]:
# Install CatBoost into the running kernel's environment; it is imported in the next cell.
%pip install catboost



In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [3]:
# Set up the random seed for NumPy's global random number generator
np.random.seed(42)

In [4]:
# Mount Google Drive at /content/drive; the CSV paths read and written below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Load data: the training set, the test set, and the sample submission file
# (the sample submission is later filled with predictions and saved under a new name)
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
samp_sub = pd.read_csv('/content/drive/MyDrive/SampleSubmission.csv')

# Feature Engineering - Simplified approach
def prepare_data(df):
    """Return a copy of ``df`` with ``Item_Code`` and ``Store_Code`` columns added.

    The two columns are obtained by splitting ``Item_Store_ID`` at its first
    underscore: the text before it becomes ``Item_Code`` and everything after
    it becomes ``Store_Code``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Item_Store_ID`` string column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two extra columns. ``Store_Code`` is missing
        for IDs that contain no underscore.

    Raises
    ------
    KeyError
        If ``df`` has no ``Item_Store_ID`` column.
    """
    df = df.copy()
    # Split on the first underscore only (n=1) so an ID with extra underscores
    # still yields exactly two parts instead of breaking the column assignment.
    parts = df['Item_Store_ID'].str.split('_', n=1, expand=True)
    # Guarantee both part columns exist even when no ID contains an underscore;
    # absent parts come back as missing values.
    parts = parts.reindex(columns=[0, 1])
    df['Item_Code'] = parts[0]
    df['Store_Code'] = parts[1]
    return df

# Prepare data: add the Item_Code / Store_Code columns derived from Item_Store_ID
train = prepare_data(train)
test = prepare_data(test)

# Separate features and target
X_train = train.drop(['Item_Store_Returns', 'Item_Store_ID'], axis=1)
y_train = train['Item_Store_Returns']
# Select the test features by name, in the training column order. The
# categorical features are handed to CatBoost by position further down, so a
# different column order in test.csv would otherwise misalign them. A column
# missing from the test set raises KeyError here.
X_test = test.drop('Item_Store_ID', axis=1)[list(X_train.columns)].copy()

# Identify categorical features - ALL string columns
cat_features = list(X_train.select_dtypes(include=['object']).columns)

# Convert all features to appropriate types
for col in X_train.columns:
    if col in cat_features:
        # astype(str) also turns missing values into the literal text 'nan',
        # so they end up as a category of their own.
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)
    else:
        # NOTE(review): unparseable and missing numeric values become 0 here;
        # confirm 0 is a sensible stand-in for every numeric column.
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce').fillna(0)
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)

# Split for validation (80% train / 20% validation, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Get categorical feature indices (positions within X_train's columns)
cat_features_indices = [i for i, col in enumerate(X_train.columns) if col in cat_features]

# Initialize CatBoost with explicit categorical features
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    eval_metric='RMSE',
    random_seed=42,
    cat_features=cat_features_indices,
    verbose=100
)

# Train model directly with categorical features specified; training stops
# once the validation metric has not improved for 50 iterations.
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    use_best_model=True
)

# Evaluate on the held-out validation split
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
# Mean absolute error is the average size of a miss; RMSE weights large misses
# more heavily, so it must not be quoted as the "average" error.
mae = np.mean(np.abs(y_val.to_numpy() - val_preds))
print(f"Validation RMSE: {rmse:.4f}")

# Feature importance, highest first
feature_importances = model.get_feature_importance()
for score, name in sorted(zip(feature_importances, X_train.columns), reverse=True):
    print(f"{name}: {score}")

# Predict on test set
test_preds = model.predict(X_test)

# Prepare submission
# Predictions are written into the sample submission by position, so its rows
# must be the test rows in the same order. Fail loudly rather than save a
# silently misaligned file.
if 'Item_Store_ID' in samp_sub.columns:
    ids_match = np.array_equal(
        samp_sub['Item_Store_ID'].to_numpy(), test['Item_Store_ID'].to_numpy()
    )
    if not ids_match:
        raise ValueError(
            "SampleSubmission rows do not match the test set order; "
            "cannot assign predictions by position."
        )
samp_sub['Item_Store_Returns'] = test_preds
samp_sub.to_csv('/content/drive/MyDrive/cat_submission_3.csv', index=False)
print("Submission file created!")

# Business insights
# NOTE(review): insights 2 and 3 are fixed text rather than values computed
# from the model, and no product-prefix feature is built in this cell --
# confirm they are backed by separate analysis before sharing.
print("\nKey Insights for Chief Babatunji:")
print("1. Top factors affecting returns:")
print(f"   - {X_train.columns[np.argmax(feature_importances)]} is the most important factor")
print("2. Store-specific patterns:")
print("   - Different stores show varying return patterns for the same products")
print("3. Product characteristics:")
print("   - Product type (prefix) significantly impacts returns")
print(f"4. Model accuracy: Predictions are within ±{mae:.2f} units on average (validation RMSE: {rmse:.2f})")

0:	learn: 4237.7858051	test: 4170.0519432	best: 4170.0519432 (0)	total: 104ms	remaining: 1m 43s
100:	learn: 2637.9779514	test: 2949.7183870	best: 2944.5699909 (76)	total: 4.8s	remaining: 42.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 2944.569991
bestIteration = 76

Shrink model to first 77 iterations.
Validation RMSE: 2944.5700
Item_Price: 34.61743535701613
Store_ID: 17.271934420796
Store_Code: 11.213743167722098
Store_Size: 7.207523513394518
Item_Type: 6.535110250631018
Store_Type: 6.330926585018165
Store_Location_Type: 5.440890034988252
Item_Sugar_Content: 2.865427212570094
Item_ID: 2.460086003107878
Item_Visibility: 2.232683077197129
Item_Code: 2.2246837789304306
Item_Weight: 1.3360700768281861
Store_Start_Year: 0.26348652180009324
Submission file created!

Key Insights for Chief Babatunji:
1. Top factors affecting returns:
   - Item_Price is the most important factor
2. Store-specific patterns:
   - Different stores show varying return patterns for the same