# DSN Free AI Classes in Every City 2025
##  The Excellent Store Challenge
### Solution by: `IHEANYI, FAVOUR CHISOM`

## Import the Libraries

In [1]:
%pip install catboost



In [None]:
import numpy as np
import pandas as pd
import sklearn
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
# Record the installed scikit-learn version so the run can be reproduced later.
print(sklearn.__version__)

1.6.1


In [9]:
# Set up the random seed: fix NumPy's global RNG so NumPy-driven randomness
# repeats between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [10]:
# Mount Google Drive at /content/drive; the CSV files loaded further down are
# read from under this mount point.
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load data
# All three CSVs live in one folder; point DATA_DIR somewhere else to run the
# notebook outside this Drive layout.
DATA_DIR = '/content/drive/MyDrive'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
samp_sub = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv')

# Fail fast: predictions for `test` are later written row-by-row into
# `samp_sub`, so a row-count mismatch should surface before training starts.
if len(test) != len(samp_sub):
    raise ValueError(
        f"test has {len(test)} rows but SampleSubmission has {len(samp_sub)} rows"
    )

# Combine train and test for feature engineering (excluding target).
# ignore_index gives the stacked frame one unique RangeIndex instead of
# repeating the row labels of both files.
data = pd.concat(
    [train.drop(columns="Item_Store_Returns"), test],
    sort=False,
    ignore_index=True,
)

# Feature Engineering
# Split Item_Store_ID into its item part and store part. n=1 splits on the
# first underscore only, so the result can never exceed two columns.
data[['Item_Code', 'Store_Code']] = data['Item_Store_ID'].str.split('_', n=1, expand=True)

# The composite ID is now fully represented by Item_Code + Store_Code. Left in
# place it would be a raw string column, which CatBoost refuses unless it is
# declared categorical, so drop it from the feature frame.
data = data.drop(columns='Item_Store_ID')

# Extract store number from Store_Code.
# Raw strings keep `\d` a regex escape rather than an (invalid) Python string
# escape; expand=False returns a Series instead of a one-column DataFrame.
data['Store_Number'] = data['Store_Code'].str.extract(r'(\d+)', expand=False).astype(int)

# Extract item prefix (leading capital letters) and item number
data['Item_Prefix'] = data['Item_Code'].str.extract(r'([A-Z]+)', expand=False)
data['Item_Number'] = data['Item_Code'].str.extract(r'(\d+)', expand=False).astype(float)

# Label encoding for categorical features.
# Every non-numeric column is encoded and listed in cat_features, not just the
# two engineered ones: any string column left un-encoded and undeclared makes
# CatBoost's Pool constructor fail. Missing values get their own category.
cat_features = data.select_dtypes(exclude='number').columns.tolist()
for feature in cat_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature].fillna('missing').astype(str))

# Split back into train and test (train rows were stacked first)
train_features = data.iloc[:len(train)]
test_features = data.iloc[len(train):]

# Prepare target
y = train['Item_Store_Returns']

# Positions of the categorical columns inside the feature frame
cat_features_indices = list(map(train_features.columns.get_loc, cat_features))

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    y,
    random_state=42,
    test_size=0.2,
)

# Wrap each split in a CatBoost Pool that carries the categorical column positions
train_pool, val_pool = (
    Pool(features, target, cat_features=cat_features_indices)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

# Hyper-parameters for the CatBoost regressor, gathered in one place
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': 100,
}
model = CatBoostRegressor(**catboost_params)

# Train against the validation pool: stop once the eval metric has not improved
# for 50 rounds and keep the best iteration seen.
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, use_best_model=True)

# Score the held-out validation rows and report the root-mean-squared error
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse:.4f}")

# Feature importance: pair each score with its column name and list them from
# highest to lowest
feature_names = X_train.columns
feature_importances = model.get_feature_importance()
ranked_importances = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_importances:
    print(f"{name}: {score}")

# Predict returns for the test rows
test_preds = model.predict(test_features)

# Write the predictions into the sample-submission frame and save it as CSV
samp_sub = samp_sub.assign(Item_Store_Returns=test_preds)
samp_sub.to_csv('submission.csv', index=False)
print("Submission file created!")

# Explain to Chief Babatunji
# Name the top drivers from the fitted model's own importance scores instead of
# asserting them up front, so the summary cannot contradict the ranking printed
# above.
top_drivers = [
    driver
    for _, driver in sorted(zip(feature_importances, feature_names), reverse=True)[:3]
]

print("\nKey Insights for Chief Babatunji:")
print("1. According to the model, returns are driven most strongly by:")
for rank, driver in enumerate(top_drivers, start=1):
    print(f"   - #{rank}: {driver}")
# The reasons below are not measured by this notebook, so they are presented as
# hypotheses rather than findings.
print("2. Possible reasons the same product yields different returns (hypotheses, not tested here):")
print("   - Different stores have different customer demographics")
print("   - Store size and age likely affect product performance")
print("   - Regional preferences impact product popularity")
# RMSE squares the errors before averaging, so it is not the average absolute
# miss and not a bound on any single prediction.
print(
    f"3. Our model achieved a validation RMSE of {rmse:.2f}; RMSE penalises large misses "
    f"more than small ones, so treat it as a typical error size rather than a guaranteed "
    f"±{rmse:.2f} bound"
)