In [3]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

### Load the trained model and feature list


In [None]:
print("=" * 60)
print("🧪 TESTING BASELINE MODEL")
print("=" * 60)

### Test 1: Load saved model and features

In [None]:
print("\n1️⃣ Test: Loading saved model and features")
try:
    with open('models/xgboost_baseline.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    
    with open('models/feature_cols.pkl', 'rb') as f:
        loaded_features = pickle.load(f)
    
    print(f"   ✅ Model loaded successfully")
    print(f"   ✅ Feature list loaded: {len(loaded_features)} features")
    assert isinstance(loaded_model, XGBRegressor), "Model is not XGBRegressor"
    assert len(loaded_features) > 0, "Feature list is empty"
except Exception as e:
    print(f"   ❌ Failed to load model/features: {e}")
    raise

🧪 TESTING BASELINE MODEL

1️⃣ Test: Loading saved model and features
   ✅ Model loaded successfully
   ✅ Feature list loaded: 29 features


### Test 2: Verify model attributes

In [None]:
print("\n2️⃣ Test: Model attributes")
try:
    assert hasattr(loaded_model, 'predict'), "Model doesn't have predict method"
    assert hasattr(loaded_model, 'feature_importances_'), "Model doesn't have feature importances"
    print(f"   ✅ Model has required attributes")
    print(f"   ✅ Model has {len(loaded_model.feature_importances_)} feature importances")
except AssertionError as e:
    print(f"   ❌ {e}")
    raise


2️⃣ Test: Model attributes
   ✅ Model has required attributes
   ✅ Model has 29 feature importances


### Test 3: Load and prepare test data

In [None]:
print("\n3️⃣ Test: Loading and preparing test data")
try:
    # Load test data (assuming it's preprocessed like train/test split)
    test_data = pd.read_csv('../data/kernel/receivals.csv')
    
    # Convert date_arrival
    test_data['date_arrival'] = pd.to_datetime(test_data['date_arrival'], format='%Y-%m-%d %H:%M:%S %z', utc=True, errors='coerce')
    
    # Filter only completed (assuming same preprocessing as training)
    test_data = test_data[test_data['receival_status'] == 'Completed'].copy()
    
    print(f"   ✅ Test data loaded: {test_data.shape}")
    print(f"   ✅ Date range: {test_data['date_arrival'].min().date()} to {test_data['date_arrival'].max().date()}")
except Exception as e:
    print(f"   ❌ Failed to load test data: {e}")
    raise


3️⃣ Test: Loading and preparing test data
   ✅ Test data loaded: (122448, 10)
   ✅ Date range: 2004-06-15 to 2024-12-19


### Test 4: Feature engineering for test data (matching training pipeline)

In [None]:
print("\n4️⃣ Test: Feature engineering on test data")
try:
    # Temporal features
    test_data['year'] = test_data['date_arrival'].dt.year
    test_data['month'] = test_data['date_arrival'].dt.month
    test_data['day_of_week'] = test_data['date_arrival'].dt.dayofweek
    test_data['quarter'] = test_data['date_arrival'].dt.quarter
    test_data['is_weekend'] = test_data['day_of_week'].isin([5, 6]).astype(int)
    
    # Cyclical encoding
    test_data['month_sin'] = np.sin(2 * np.pi * test_data['month'] / 12)
    test_data['month_cos'] = np.cos(2 * np.pi * test_data['month'] / 12)
    test_data['day_sin'] = np.sin(2 * np.pi * test_data['day_of_week'] / 7)
    test_data['day_cos'] = np.cos(2 * np.pi * test_data['day_of_week'] / 7)
    
    # Days since start
    test_data['days_since_start'] = (test_data['date_arrival'] - test_data['date_arrival'].min()).dt.days
    
    # Supplier aggregations
    supplier_stats = test_data.groupby('supplier_id')['net_weight'].agg([
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std')
    ]).reset_index()
    supplier_stats.columns = ['supplier_id', 'supplier_mean_weight', 'supplier_median_weight', 'supplier_std_weight']
    test_data = test_data.merge(supplier_stats, on='supplier_id', how='left')
    
    # Supplier CV and total receivals
    test_data['supplier_cv'] = test_data['supplier_std_weight'] / test_data['supplier_mean_weight']
    test_data['supplier_total_receivals'] = test_data.groupby('supplier_id')['supplier_id'].transform('count')
    
    # Lag features (simplified for test)
    test_data = test_data.sort_values(['supplier_id', 'date_arrival'])
    test_data['weight_lag_1'] = test_data.groupby('supplier_id')['net_weight'].shift(1)
    test_data['weight_lag_3_mean'] = test_data.groupby('supplier_id')['net_weight'].transform(
        lambda x: x.rolling(3, min_periods=1).mean().shift(1)
    )
    test_data['weight_lag_7_mean'] = test_data.groupby('supplier_id')['net_weight'].transform(
        lambda x: x.rolling(7, min_periods=1).mean().shift(1)
    )
    
    # Supplier trend
    test_data['supplier_trend'] = test_data.groupby('supplier_id')['net_weight'].transform(
        lambda x: x.rolling(10, min_periods=2).mean() / x.mean()
    )
    
    # RM & Product features
    rm_stats = test_data.groupby('rm_id')['net_weight'].agg(['mean', 'std', 'count']).reset_index()
    rm_stats.columns = ['rm_id', 'rm_mean_weight', 'rm_std_weight', 'rm_count']
    test_data = test_data.merge(rm_stats, on='rm_id', how='left')
    
    product_stats = test_data.groupby('product_id')['net_weight'].agg(['mean', 'std', 'count']).reset_index()
    product_stats.columns = ['product_id', 'product_mean_weight', 'product_std_weight', 'product_count']
    test_data = test_data.merge(product_stats, on='product_id', how='left')
    
    # Supplier-RM frequency
    test_data['supplier_rm_frequency'] = test_data.groupby(['supplier_id', 'rm_id'])['rm_id'].transform('count')
    
    # Fill missing values
    test_data = test_data.fillna(0)
    
    print(f"   ✅ Features engineered: {test_data.shape}")
    
    # Check if all required features exist
    missing_features = [f for f in loaded_features if f not in test_data.columns]
    if missing_features:
        print(f"   ⚠️  Missing features: {missing_features}")
    else:
        print(f"   ✅ All required features present")
        
except Exception as e:
    print(f"   ❌ Feature engineering failed: {e}")
    raise


4️⃣ Test: Feature engineering on test data
   ✅ Features engineered: (122448, 36)
   ✅ All required features present


### Test 5: Make predictions

In [None]:
print("\n5️⃣ Test: Making predictions")
try:
    # Prepare feature matrix
    X_test = test_data[loaded_features]
    
    # Make predictions
    raw_predictions = loaded_model.predict(X_test)
    
    # Check for negative predictions
    num_negative = (raw_predictions < 0).sum()
    
    # Clip negative predictions to 0
    predictions = np.clip(raw_predictions, 0, None)
    
    print(f"   ✅ Predictions generated: {len(predictions)} samples")
    
    if num_negative > 0:
        print(f"   ⚠️  {num_negative} negative predictions clipped to 0 ({num_negative/len(predictions)*100:.2f}%)")
        print(f"   ⚠️  Most negative value: {raw_predictions.min():.2f} kg")
    else:
        print(f"   ✅ All predictions are positive")
    
    print(f"\n   📊 Prediction statistics (after clipping):")
    print(f"      Mean: {predictions.mean():.2f} kg")
    print(f"      Median: {np.median(predictions):.2f} kg")
    print(f"      Min: {predictions.min():.2f} kg")
    print(f"      Max: {predictions.max():.2f} kg")
    print(f"      Std: {predictions.std():.2f} kg")
    print(f"      Q1 (25%): {np.percentile(predictions, 25):.2f} kg")
    print(f"      Q3 (75%): {np.percentile(predictions, 75):.2f} kg")
    
    assert len(predictions) == len(X_test), "Prediction length mismatch"
    assert not np.isnan(predictions).any(), "Predictions contain NaN values"
    assert (predictions >= 0).all(), "Predictions contain negative values after clipping"
    
except Exception as e:
    print(f"   ❌ Prediction failed: {e}")
    raise


5️⃣ Test: Making predictions
   ✅ Predictions generated: 122448 samples
   ⚠️  9 negative predictions clipped to 0 (0.01%)
   ⚠️  Most negative value: -1429.88 kg

   📊 Prediction statistics (after clipping):
      Mean: 12886.42 kg
      Median: 12309.33 kg
      Min: 0.00 kg
      Max: 26259.20 kg
      Std: 5760.13 kg
      Q1 (25%): 8752.13 kg
      Q3 (75%): 16995.49 kg


### Test 6: Create submission file

In [None]:
print("\n6️⃣ Test: Creating submission file")
try:
    # Create submission dataframe
    submission = pd.DataFrame({
        'ID': range(1, len(predictions) + 1),
        'predicted_weight': predictions
    })
    
    # Round predictions to reasonable precision
    submission['predicted_weight'] = submission['predicted_weight'].round(2)
    
    # Save submission file
    submission.to_csv('submission.csv', index=False)
    
    print(f"   ✅ Submission file created: submission.csv")
    print(f"   ✅ Shape: {submission.shape}")
    print(f"\n   📄 First 10 rows:")
    print(submission.head(10).to_string(index=False))
    
    # Validate format
    assert list(submission.columns) == ['ID', 'predicted_weight'], "Column names don't match required format"
    assert submission['ID'].iloc[0] == 1, "ID should start from 1"
    assert submission['ID'].is_monotonic_increasing, "IDs should be sequential"
    
except Exception as e:
    print(f"   ❌ Submission file creation failed: {e}")
    raise


6️⃣ Test: Creating submission file
   ✅ Submission file created: submission.csv
   ✅ Shape: (122448, 2)

   📄 First 10 rows:
 ID  predicted_weight
  1      16997.349609
  2      22451.980469
  3      22261.490234
  4      22298.880859
  5      22298.880859
  6      22489.380859
  7      22549.189453
  8      22606.460938
  9      22606.460938
 10      22643.849609


### Test 7: Validate submission format against sample

In [None]:
print("\n7️⃣ Test: Validating submission format")
try:
    sample_submission = pd.read_csv('../data/sample_submission.csv')
    
    print(f"   ℹ️  Sample submission shape: {sample_submission.shape}")
    print(f"   ℹ️  Generated submission shape: {submission.shape}")
    
    # Check column names match
    assert list(submission.columns) == list(sample_submission.columns), "Column names don't match sample"
    
    print(f"   ✅ Submission format matches sample_submission.csv")
    
except Exception as e:
    print(f"   ⚠️  Could not validate against sample: {e}")


7️⃣ Test: Validating submission format
   ℹ️  Sample submission shape: (30450, 2)
   ℹ️  Generated submission shape: (122448, 2)
   ✅ Submission format matches sample_submission.csv


### Test 8: Model performance check (if ground truth available)

In [None]:
print("\n8️⃣ Test: Model performance check")
try:
    if 'net_weight' in test_data.columns:
        from sklearn.metrics import mean_absolute_error, r2_score
        

        y_true = test_data['net_weight'].values
        mae = mean_absolute_error(y_true, predictions)
        r2 = r2_score(y_true, predictions)
        relative_error = (mae / y_true.mean()) * 100
        
        print(f"   📊 Performance on test data:")
        print(f"      MAE: {mae:.2f} kg")
        print(f"      R²: {r2:.4f}")
        print(f"      Relative Error: {relative_error:.2f}%")
        
        # Performance threshold checks
        if relative_error < 10:
            print(f"   ✅ Model performs well (error < 10%)")
        elif relative_error < 20:
            print(f"   ⚠️  Model performance acceptable (error < 20%)")
        else:
            print(f"   ❌ Model performance needs improvement (error >= 20%)")
    else:
        print(f"   ℹ️  Ground truth not available, skipping performance check")
        
except Exception as e:
    print(f"   ⚠️  Performance check failed: {e}")


8️⃣ Test: Model performance check
   📊 Performance on test data:
      MAE: 4067.91 kg
      R²: 0.5992
      Relative Error: 31.35%
   ❌ Model performance needs improvement (error >= 20%)


In [12]:
print("\n" + "=" * 60)
print("✅ ALL TESTS COMPLETED SUCCESSFULLY")
print("=" * 60)
print(f"\n📁 Output file: submission.csv")
print(f"📊 Predictions: {len(predictions)} rows")
print(f"🎯 Ready for submission!")


✅ ALL TESTS COMPLETED SUCCESSFULLY

📁 Output file: submission.csv
📊 Predictions: 122448 rows
🎯 Ready for submission!
