In [1]:
import os
import zipfile
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Feature Importance Analysis

After engineering features, we analyze their importance for predicting 1-day ahead demand.

## 1. Load Data

In [2]:
# Load engineered features
with zipfile.ZipFile('data/4_engineered_features.zip', 'r') as z:
    with z.open('engineered_features.csv') as f:
        df = pd.read_csv(f, low_memory=False)

df = df.sort_values('date').reset_index(drop=True)
print(f'Loaded: {len(df):,} records x {len(df.columns)} features')

Loaded: 577,404 records x 82 features


## 2. Prepare Features

In [3]:
# Prepare features and target
# Exclude columns that shouldn't be used as features
exclude_cols = ['date', 'location_id', 'product_id', 'target']

# Get all feature columns
features = [c for c in df.columns if c not in exclude_cols]

print(f'Total features available: {len(features)}')
print(f'Target: target (1-day ahead)')

# Get target
y = df['target']

print(f'\nTarget statistics:')
print(f'  target: mean={y.mean():.2f}, std={y.std():.2f}')

Total features available: 78
Target: target (1-day ahead)

Target statistics:
  target: mean=6.02, std=24.67


In [4]:
# Split data chronologically (70/15/15)
n = len(df)
train_size = int(0.7 * n)
val_size = int(0.15 * n)

X_train = df[features].iloc[:train_size].fillna(0)
y_train = y.iloc[:train_size]
X_test = df[features].iloc[train_size+val_size:].fillna(0)
y_test = y.iloc[train_size+val_size:]

print(f'Data split: {train_size:,} train / {val_size:,} val / {len(df)-train_size-val_size:,} test')
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

Data split: 404,182 train / 86,610 val / 86,612 test
Train: (404182, 78), Test: (86612, 78)


## 3. Train Random Forest

Train model to extract feature importance.

In [5]:
# Train Random Forest
print('TRAINING RANDOM FOREST FOR FEATURE IMPORTANCE')

rf = RandomForestRegressor(n_estimators=10, max_depth=5, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

importance_df = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
importance_df['cumulative'] = importance_df['importance'].cumsum()
importance_df['rank'] = range(1, len(importance_df) + 1)

print(f'\nModel Performance:')
print(f'  MAE={mae:.3f}, R²={r2:.4f}')
print(f'\nTop 10 features:')
print(importance_df.head(5)[['feature', 'importance']].to_string(index=False))

TRAINING RANDOM FOREST FOR FEATURE IMPORTANCE

Model Performance:
  MAE=2.261, R²=0.8335

Top 10 features:
           feature  importance
  units_sold_ema_7    0.943495
 units_sold_ema_14    0.020709
  units_sold_ema_3    0.006029
          epidemic    0.005322
category_Groceries    0.005033


## 4. Top Features
We select the top features for model training.

1. **Computational Efficiency**: Faster training
2. **Prevent Overfitting**: Reduce model complexity

In [6]:
# Show selected features
selected_features = importance_df.head(30)['feature'].tolist()
print('Top Selected Features:')
for i, feat in enumerate(selected_features, 1):
    imp = importance_df[importance_df['feature'] == feat]['importance'].values[0]
    print(f'{i:2d}. {feat:30s} (importance: {imp:.6f})')

Top Selected Features:
 1. units_sold_ema_7               (importance: 0.943495)
 2. units_sold_ema_14              (importance: 0.020709)
 3. units_sold_ema_3               (importance: 0.006029)
 4. epidemic                       (importance: 0.005322)
 5. category_Groceries             (importance: 0.005033)
 6. category_Home & Furniture      (importance: 0.004320)
 7. units_sold_ma_30               (importance: 0.004023)
 8. units_sold_ma_14               (importance: 0.002710)
 9. seasonality_Summer             (importance: 0.001703)
10. cpi_yoy_change                 (importance: 0.001568)
11. units_sold_ma_3                (importance: 0.001152)
12. category_Clothing              (importance: 0.001015)
13. units_sold_ema_30              (importance: 0.000658)
14. days_from_holiday              (importance: 0.000419)
15. day_of_month                   (importance: 0.000346)
16. category_Electronics           (importance: 0.000160)
17. sku_profit_pct                 (importance: 0

## 5. Save Data

In [7]:
# Load full engineered features
with zipfile.ZipFile('data/4_engineered_features.zip', 'r') as z:
    with z.open('engineered_features.csv') as f:
        df_full = pd.read_csv(f, low_memory=False)

print(f'Full dataset: {df_full.shape[0]:,} × {df_full.shape[1]} features')

# Keep essential columns + selected features + target
essential_cols = ['date', 'location_id', 'product_id', 'units_sold', 'revenue', 'profit']
target_col = ['target']

df_reduced = df_full[essential_cols + selected_features + target_col]

print(f'Original: {df_full.shape[0]:,} × {df_full.shape[1]} features')
print(f'Reduced:  {df_reduced.shape[0]:,} × {df_reduced.shape[1]} features')
print(f'  - Essential: {len(essential_cols)} columns')
print(f'  - Selected features: {len(selected_features)} columns')
print(f'  - Target: {len(target_col)} column')

# Verify target is present
assert 'target' in df_reduced.columns
print('\n[OK] Verification: Target present in reduced dataset')

# Save reduced dataset
df_reduced.to_csv('data/5_selected_features.csv', index=False)
with zipfile.ZipFile('data/5_selected_features.zip', 'w', zipfile.ZIP_DEFLATED) as z:
    z.write('data/5_selected_features.csv', 'selected_features.csv')

# Remove CSV to save space
os.remove('data/5_selected_features.csv')
print('[OK] Saved to data/5_selected_features.zip')

Full dataset: 577,404 × 82 features
Original: 577,404 × 82 features
Reduced:  577,404 × 37 features
  - Essential: 6 columns
  - Selected features: 30 columns
  - Target: 1 column

[OK] Verification: Target present in reduced dataset
[OK] Saved to data/5_selected_features.zip
