# 🎯 Generate Kaggle Submission File

This notebook generates predictions for the Kaggle competition using the trained XGBoost model and the prediction_mapping.csv file.

In [1]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("=" * 70)
print("🚀 GENERATING KAGGLE SUBMISSION")
print("=" * 70)

🚀 GENERATING KAGGLE SUBMISSION


## 1. Load Trained Model and Features

In [2]:
print("\n📦 Loading trained model and feature list...")

# Load the trained XGBoost model
with open('models/xgboost_baseline.pkl', 'rb') as f:
    model = pickle.load(f)

# Load feature columns
with open('models/feature_cols.pkl', 'rb') as f:
    feature_cols = pickle.load(f)

print(f"✅ Model loaded: {type(model).__name__}")
print(f"✅ Number of features: {len(feature_cols)}")
print(f"\n📋 Features: {feature_cols}")


📦 Loading trained model and feature list...
✅ Model loaded: XGBRegressor
✅ Number of features: 29

📋 Features: ['supplier_id', 'rm_id', 'product_id', 'year', 'month', 'day_of_week', 'quarter', 'is_weekend', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'days_since_start', 'supplier_mean_weight', 'supplier_median_weight', 'supplier_std_weight', 'supplier_cv', 'supplier_total_receivals', 'weight_lag_1', 'weight_lag_3_mean', 'weight_lag_7_mean', 'supplier_trend', 'rm_mean_weight', 'rm_std_weight', 'rm_count', 'product_mean_weight', 'product_std_weight', 'product_count', 'supplier_rm_frequency']


## 2. Load Historical Training Data

We need historical data to compute statistics and features for prediction.

In [3]:
print("\n" + "=" * 70)
print("📊 Loading historical data...")
print("=" * 70)

# Load historical receivals data
receivals = pd.read_csv('../data/kernel/receivals.csv')
print(f"✅ Receivals data: {receivals.shape}")

# Convert date to datetime
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], format='%Y-%m-%d %H:%M:%S %z', utc=True, errors='coerce')

# Filter only completed receivals (same as training)
receivals = receivals[receivals['receival_status'] == 'Completed'].copy()
print(f"✅ Completed receivals: {receivals.shape}")
print(f"📅 Historical range: {receivals['date_arrival'].min().date()} to {receivals['date_arrival'].max().date()}")


📊 Loading historical data...
✅ Receivals data: (122590, 10)
✅ Completed receivals: (122448, 10)
📅 Historical range: 2004-06-15 to 2024-12-19


## 3. Load Prediction Mapping

This file contains the combinations of rm_id and date ranges we need to predict.

In [4]:
print("\n" + "=" * 70)
print("🎯 Loading prediction mapping...")
print("=" * 70)

# Load prediction mapping
pred_mapping = pd.read_csv('../data/prediction_mapping.csv')
print(f"✅ Prediction mapping: {pred_mapping.shape}")
print(f"\n📄 First few rows:")
print(pred_mapping.head(10))

# Convert dates
pred_mapping['forecast_start_date'] = pd.to_datetime(pred_mapping['forecast_start_date'])
pred_mapping['forecast_end_date'] = pd.to_datetime(pred_mapping['forecast_end_date'])

print(f"\n📅 Forecast range: {pred_mapping['forecast_start_date'].min().date()} to {pred_mapping['forecast_end_date'].max().date()}")
print(f"🔢 Unique rm_ids: {pred_mapping['rm_id'].nunique()}")
print(f"✅ Total predictions needed: {len(pred_mapping)}")


🎯 Loading prediction mapping...
✅ Prediction mapping: (30450, 4)

📄 First few rows:
   ID  rm_id forecast_start_date forecast_end_date
0   1    365          2025-01-01        2025-01-02
1   2    365          2025-01-01        2025-01-03
2   3    365          2025-01-01        2025-01-04
3   4    365          2025-01-01        2025-01-05
4   5    365          2025-01-01        2025-01-06
5   6    365          2025-01-01        2025-01-07
6   7    365          2025-01-01        2025-01-08
7   8    365          2025-01-01        2025-01-09
8   9    365          2025-01-01        2025-01-10
9  10    365          2025-01-01        2025-01-11

📅 Forecast range: 2025-01-01 to 2025-05-31
🔢 Unique rm_ids: 203
✅ Total predictions needed: 30450


## 4. Compute Historical Statistics

Calculate statistics from historical data that will be used as features.

In [5]:
print("\n" + "=" * 70)
print("📈 Computing historical statistics...")
print("=" * 70)

# Supplier statistics
supplier_stats = receivals.groupby('supplier_id')['net_weight'].agg([
    ('mean', 'mean'),
    ('median', 'median'),
    ('std', 'std'),
    ('count', 'count')
]).reset_index()
supplier_stats.columns = ['supplier_id', 'supplier_mean_weight', 'supplier_median_weight', 'supplier_std_weight', 'supplier_total_receivals']
supplier_stats['supplier_cv'] = supplier_stats['supplier_std_weight'] / supplier_stats['supplier_mean_weight']
supplier_stats = supplier_stats.fillna(0)

print(f"✅ Supplier statistics computed: {len(supplier_stats)} suppliers")

# RM statistics
rm_stats = receivals.groupby('rm_id')['net_weight'].agg(['mean', 'std', 'count']).reset_index()
rm_stats.columns = ['rm_id', 'rm_mean_weight', 'rm_std_weight', 'rm_count']
rm_stats = rm_stats.fillna(0)

print(f"✅ RM statistics computed: {len(rm_stats)} materials")

# Product statistics
product_stats = receivals.groupby('product_id')['net_weight'].agg(['mean', 'std', 'count']).reset_index()
product_stats.columns = ['product_id', 'product_mean_weight', 'product_std_weight', 'product_count']
product_stats = product_stats.fillna(0)

print(f"✅ Product statistics computed: {len(product_stats)} products")

# Supplier-RM frequency
supplier_rm_freq = receivals.groupby(['supplier_id', 'rm_id']).size().reset_index(name='supplier_rm_frequency')

print(f"✅ Supplier-RM frequencies computed")


📈 Computing historical statistics...
✅ Supplier statistics computed: 255 suppliers
✅ RM statistics computed: 203 materials
✅ Product statistics computed: 54 products
✅ Supplier-RM frequencies computed


## 5. Get Most Recent Supplier for Each RM

For prediction, we need to associate each rm_id with a supplier. We'll use the most recent supplier for each RM.

In [6]:
print("\n" + "=" * 70)
print("🔗 Mapping RM to most recent suppliers...")
print("=" * 70)

# Sort by date and get most recent supplier for each rm_id
receivals_sorted = receivals.sort_values(['rm_id', 'date_arrival'], ascending=[True, False])
rm_to_supplier = receivals_sorted.groupby('rm_id').first()[['supplier_id', 'product_id']].reset_index()

print(f"✅ Mapped {len(rm_to_supplier)} materials to suppliers")
print(f"\n📄 Sample mappings:")
print(rm_to_supplier.head(10))


🔗 Mapping RM to most recent suppliers...
✅ Mapped 203 materials to suppliers

📄 Sample mappings:
   rm_id  supplier_id  product_id
0  342.0        52433  91900170.0
1  343.0        54748  91900143.0
2  345.0        50387  91900143.0
3  346.0        20023  91900146.0
4  347.0        52064  91900143.0
5  348.0        50387  91900143.0
6  353.0        50390  91900143.0
7  354.0        54764  91900182.0
8  355.0        10001  91900152.0
9  357.0        10001  91900152.0


## 6. Prepare Prediction Dataset

Create features for each prediction row using the same logic as training.

In [7]:
print("\n" + "=" * 70)
print("🔧 Preparing prediction dataset...")
print("=" * 70)

# Merge prediction mapping with supplier info
pred_data = pred_mapping.merge(rm_to_supplier, on='rm_id', how='left')

# For missing suppliers, use the most common supplier for that RM or a default
missing_supplier_mask = pred_data['supplier_id'].isna()
if missing_supplier_mask.sum() > 0:
    print(f"⚠️  {missing_supplier_mask.sum()} rows without supplier mapping")
    # Use mode of supplier_id or default to first supplier
    default_supplier = receivals['supplier_id'].mode()[0] if len(receivals) > 0 else 1
    pred_data.loc[missing_supplier_mask, 'supplier_id'] = default_supplier
    default_product = receivals['product_id'].mode()[0] if len(receivals) > 0 else 1
    pred_data.loc[missing_supplier_mask, 'product_id'] = default_product
    print(f"   → Filled with default supplier: {default_supplier}")

# Use forecast_end_date as the target date for prediction
pred_data['date_arrival'] = pred_data['forecast_end_date']

print(f"✅ Base prediction data: {pred_data.shape}")
print(f"\n📄 Sample:")
print(pred_data.head())


🔧 Preparing prediction dataset...
✅ Base prediction data: (30450, 7)

📄 Sample:
   ID  rm_id forecast_start_date forecast_end_date  supplier_id  product_id  \
0   1    365          2025-01-01        2025-01-02        50387  91900143.0   
1   2    365          2025-01-01        2025-01-03        50387  91900143.0   
2   3    365          2025-01-01        2025-01-04        50387  91900143.0   
3   4    365          2025-01-01        2025-01-05        50387  91900143.0   
4   5    365          2025-01-01        2025-01-06        50387  91900143.0   

  date_arrival  
0   2025-01-02  
1   2025-01-03  
2   2025-01-04  
3   2025-01-05  
4   2025-01-06  


## 7. Engineer Features for Prediction

Create all the same features used during training.

In [11]:
print("\n" + "=" * 70)
print("⚙️  Engineering features...")
print("=" * 70)

# Temporal features
pred_data['year'] = pred_data['date_arrival'].dt.year
pred_data['month'] = pred_data['date_arrival'].dt.month
pred_data['day_of_week'] = pred_data['date_arrival'].dt.dayofweek
pred_data['quarter'] = pred_data['date_arrival'].dt.quarter
pred_data['is_weekend'] = pred_data['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding
pred_data['month_sin'] = np.sin(2 * np.pi * pred_data['month'] / 12)
pred_data['month_cos'] = np.cos(2 * np.pi * pred_data['month'] / 12)
pred_data['day_sin'] = np.sin(2 * np.pi * pred_data['day_of_week'] / 7)
pred_data['day_cos'] = np.cos(2 * np.pi * pred_data['day_of_week'] / 7)

# Days since start (use historical start as reference)
historical_start = receivals['date_arrival'].min()
# Localize prediction dates to UTC to match historical data timezone
pred_data_arrival_utc = pd.to_datetime(pred_data['date_arrival']).dt.tz_localize('UTC')
pred_data['days_since_start'] = (pred_data_arrival_utc - historical_start).dt.days

print(f"✅ Temporal features created")

# Merge with historical statistics
pred_data = pred_data.merge(supplier_stats, on='supplier_id', how='left')
pred_data = pred_data.merge(rm_stats, on='rm_id', how='left')
pred_data = pred_data.merge(product_stats, on='product_id', how='left')
pred_data = pred_data.merge(supplier_rm_freq, on=['supplier_id', 'rm_id'], how='left')

print(f"✅ Historical statistics merged")

# Lag features - use historical averages as proxies since we don't have real lag data for future
# For each supplier, get recent average weights
recent_window = 30  # last 30 days
cutoff_date = receivals['date_arrival'].max() - pd.Timedelta(days=recent_window)
recent_data = receivals[receivals['date_arrival'] > cutoff_date]

# Recent averages by supplier
recent_supplier_avg = recent_data.groupby('supplier_id')['net_weight'].mean().reset_index()
recent_supplier_avg.columns = ['supplier_id', 'recent_avg_weight']
pred_data = pred_data.merge(recent_supplier_avg, on='supplier_id', how='left')

# Use recent average as proxy for lag features
pred_data['weight_lag_1'] = pred_data['recent_avg_weight_x']
pred_data['weight_lag_3_mean'] = pred_data['recent_avg_weight_x']
pred_data['weight_lag_7_mean'] = pred_data['recent_avg_weight_x']

# Supplier trend - assume stable (1.0) for future predictions
pred_data['supplier_trend'] = 1.0

print(f"✅ Lag and trend features estimated")

# Fill any remaining NaN with 0
pred_data = pred_data.fillna(0)

print(f"✅ All features ready: {pred_data.shape}")
print(f"\n📊 Feature summary:")
print(pred_data[feature_cols].describe())


⚙️  Engineering features...
✅ Temporal features created
✅ Historical statistics merged
✅ Lag and trend features estimated
✅ All features ready: (30450, 60)

📊 Feature summary:
        supplier_id         rm_id    product_id     year         month  \
count  30450.000000  30450.000000  3.045000e+04  30450.0  30450.000000   
mean   56414.142857   2345.359606  8.148843e+07   2025.0      3.026667   
std    18093.144406   1124.596786  2.912886e+07      0.0      1.418692   
min    10001.000000    342.000000  1.002000e+03   2025.0      1.000000   
25%    50420.000000   1875.000000  9.190015e+07   2025.0      2.000000   
50%    55251.000000   2159.000000  9.190030e+07   2025.0      3.000000   
75%    69879.000000   3142.000000  9.190120e+07   2025.0      4.000000   
max    89313.000000   4501.000000  9.190209e+07   2025.0      5.000000   

        day_of_week       quarter    is_weekend     month_sin     month_cos  \
count  30450.000000  30450.000000  30450.000000  30450.000000  3.045000e+04  

## 8. Make Predictions

In [12]:
print("\n" + "=" * 70)
print("🔮 Making predictions...")
print("=" * 70)

# Ensure all required features are present
missing_features = [f for f in feature_cols if f not in pred_data.columns]
if missing_features:
    print(f"❌ Missing features: {missing_features}")
    raise ValueError(f"Cannot proceed: missing features {missing_features}")

# Prepare feature matrix
X_pred = pred_data[feature_cols]

print(f"✅ Feature matrix prepared: {X_pred.shape}")
print(f"📊 Feature matrix info:")
print(f"   - Total features: {X_pred.shape[1]}")
print(f"   - Total samples: {X_pred.shape[0]}")
print(f"   - Missing values: {X_pred.isna().sum().sum()}")

# Make predictions
predictions = model.predict(X_pred)

# Clip negative predictions to 0
num_negative = (predictions < 0).sum()
predictions = np.clip(predictions, 0, None)

print(f"\n✅ Predictions generated: {len(predictions)} values")

if num_negative > 0:
    print(f"⚠️  Clipped {num_negative} negative predictions to 0 ({num_negative/len(predictions)*100:.2f}%)")

print(f"\n📊 Prediction statistics:")
print(f"   Mean:     {predictions.mean():.2f} kg")
print(f"   Median:   {np.median(predictions):.2f} kg")
print(f"   Std Dev:  {predictions.std():.2f} kg")
print(f"   Min:      {predictions.min():.2f} kg")
print(f"   Max:      {predictions.max():.2f} kg")
print(f"   Q1 (25%): {np.percentile(predictions, 25):.2f} kg")
print(f"   Q3 (75%): {np.percentile(predictions, 75):.2f} kg")


🔮 Making predictions...
✅ Feature matrix prepared: (30450, 29)
📊 Feature matrix info:
   - Total features: 29
   - Total samples: 30450
   - Missing values: 0

✅ Predictions generated: 30450 values
⚠️  Clipped 188 negative predictions to 0 (0.62%)

📊 Prediction statistics:
   Mean:     12482.85 kg
   Median:   13961.54 kg
   Std Dev:  6208.74 kg
   Min:      0.00 kg
   Max:      24913.39 kg
   Q1 (25%): 6893.93 kg
   Q3 (75%): 17869.76 kg


## 9. Create Submission File

In [13]:
print("\n" + "=" * 70)
print("📝 Creating submission file...")
print("=" * 70)

# Create submission dataframe
submission = pd.DataFrame({
    'ID': pred_mapping['ID'],
    'predicted_weight': predictions
})

# Round to 2 decimal places
submission['predicted_weight'] = submission['predicted_weight'].round(2)

# Verify format
print(f"\n✅ Submission dataframe created: {submission.shape}")
print(f"\n📄 First 20 rows:")
print(submission.head(20))

# Validate
assert list(submission.columns) == ['ID', 'predicted_weight'], "❌ Column names incorrect"
assert len(submission) == 30450, f"❌ Wrong number of rows: {len(submission)} (expected 30450)"
assert submission['ID'].iloc[0] == 1, "❌ ID should start from 1"
assert submission['ID'].iloc[-1] == 30450, f"❌ Last ID should be 30450, got {submission['ID'].iloc[-1]}"
assert submission['ID'].is_monotonic_increasing, "❌ IDs should be sequential"
assert not submission['predicted_weight'].isna().any(), "❌ Predictions contain NaN"

print(f"\n✅ All validations passed!")


📝 Creating submission file...

✅ Submission dataframe created: (30450, 2)

📄 First 20 rows:
    ID  predicted_weight
0    1      16930.419922
1    2      17749.080078
2    3      17360.759766
3    4      14337.480469
4    5      17600.640625
5    6      17329.759766
6    7      17329.759766
7    8      16930.419922
8    9      17749.080078
9   10      17360.759766
10  11      14337.480469
11  12      17600.640625
12  13      17329.759766
13  14      17329.759766
14  15      16930.419922
15  16      17749.080078
16  17      17360.759766
17  18      14337.480469
18  19      17600.640625
19  20      17329.759766

✅ All validations passed!


## 10. Save Submission File

In [14]:
print("\n" + "=" * 70)
print("💾 Saving submission file...")
print("=" * 70)

# Save to CSV
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)

print(f"✅ Submission saved to: {output_path}")

# Verify file
import os
file_size = os.path.getsize(output_path)
print(f"✅ File size: {file_size:,} bytes ({file_size/1024:.2f} KB)")

# Count lines in file
with open(output_path, 'r') as f:
    num_lines = sum(1 for _ in f)

print(f"✅ Number of lines in file: {num_lines:,} (including header)")

if num_lines == 30451:  # 30450 predictions + 1 header
    print(f"✅ PERFECT! File has exactly 30451 lines (30450 predictions + header)")
else:
    print(f"⚠️  WARNING: Expected 30451 lines, got {num_lines}")

# Load sample submission and compare format
print(f"\n📋 Comparing with sample_submission.csv...")
sample = pd.read_csv('../data/sample_submission.csv')
print(f"   Sample shape: {sample.shape}")
print(f"   Our shape:    {submission.shape}")

if sample.shape == submission.shape:
    print(f"✅ Shapes match perfectly!")
else:
    print(f"⚠️  Shape mismatch!")

print("\n" + "=" * 70)
print("🎉 SUBMISSION FILE READY FOR KAGGLE!")
print("=" * 70)
print(f"\n📁 File: {output_path}")
print(f"📊 Rows: {len(submission):,} predictions")
print(f"✅ Format: Validated")
print(f"🚀 Status: Ready to upload!")


💾 Saving submission file...
✅ Submission saved to: submission.csv
✅ File size: 430,144 bytes (420.06 KB)
✅ Number of lines in file: 30,451 (including header)
✅ PERFECT! File has exactly 30451 lines (30450 predictions + header)

📋 Comparing with sample_submission.csv...
   Sample shape: (30450, 2)
   Our shape:    (30450, 2)
✅ Shapes match perfectly!

🎉 SUBMISSION FILE READY FOR KAGGLE!

📁 File: submission.csv
📊 Rows: 30,450 predictions
✅ Format: Validated
🚀 Status: Ready to upload!


## 11. Summary Statistics

In [None]:
print("\n" + "=" * 70)
print("📊 FINAL SUMMARY")
print("=" * 70)

print(f"\n🎯 Prediction Distribution by RM:")
rm_pred_summary = pred_data[['rm_id']].copy()
rm_pred_summary['predicted_weight'] = predictions
rm_summary = rm_pred_summary.groupby('rm_id')['predicted_weight'].agg(['count', 'mean', 'std', 'min', 'max'])
rm_summary = rm_summary.sort_values('count', ascending=False)
print(rm_summary.head(10))

print(f"\n📅 Prediction Distribution by Month:")
pred_data['pred_month'] = pred_data['date_arrival'].dt.to_period('M')
monthly_summary = pred_data.groupby('pred_month').size()
print(monthly_summary)

print(f"\n✅ Submission generation complete!")