In [151]:
try:
    # Standard libraries
    import os
    import sys
    import json
    import pickle
    import warnings
    from pathlib import Path
    from datetime import datetime, timedelta

    # Data manipulation
    import numpy as np
    import pandas as pd

    # Machine learning
    from xgboost import XGBRegressor
    from sklearn.model_selection import TimeSeriesSplit
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.isotonic import IsotonicRegression

    # Suppress warnings
    warnings.filterwarnings('ignore')
    pd.set_option('display.max_columns', None)

    print("✅ All required libraries imported successfully!")
except ImportError as e:
    print(f"❌ Error importing libraries: {str(e)}")
    print("\nPlease install missing packages using:")
    print("pip install pandas numpy scikit-learn xgboost")

✅ All required libraries imported successfully!


# Bid Recommendation System

This notebook implements a machine learning model for bid recommendations with the following features:

1. Data Loading & Validation
   - Automatic CSV file detection
   - Data structure validation
   - Date parsing and sorting
   - Basic statistics calculation

2. Feature Engineering
   - Categorical feature encoding
   - Time-based features
   - Rolling statistics

3. Model Training
   - XGBoost Regressor
   - Time-series cross-validation
   - Feature importance analysis

4. Model Evaluation
   - RMSE, MAE, R² metrics
   - Error distribution analysis
   - Prediction calibration

5. Deployment Preparation
   - Model artifact saving
   - Encoder persistence
   - Metadata tracking

## Requirements

The dataset should be a CSV file with the following columns:
- `BidDate`: Date of the bid
- `BidAmount`: Numerical bid amount
- `WinStatus`: Binary indicator (0/1) for bid success

Place your CSV file in the same directory as this notebook.

# Bid Recommendation Model

This notebook implements a machine learning model for bid recommendations. 

## Setup Requirements
1. Python 3.7+
2. Required packages: pandas, numpy, scikit-learn, xgboost
3. Data file: 'bid_data.csv' in the same directory

## Instructions
1. Run cells in order
2. Make sure bid_data.csv contains these columns:
   - BidDate (datetime)
   - BidAmount (numeric)
   - WinStatus (1/0)
   - ProjectType (categorical)
   - Location (categorical)
   - ClientType (categorical)

## Important Notes
- The model uses time-based cross-validation
- All amounts are in USD
- Missing values will be handled automatically
- Models and artifacts will be saved in ./models directory

In [152]:
# Standard libraries
import os
import sys
import json
import pickle
import warnings
from pathlib import Path
from datetime import datetime, timedelta

# Data manipulation
import numpy as np
import pandas as pd

# Machine learning
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.isotonic import IsotonicRegression

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

Libraries imported successfully!


In [153]:
# Create sample data for testing
def create_sample_data(n_samples=1000):
    """Create a sample dataset for testing the bid recommendation system"""
    try:
        print("Creating sample dataset...")
        
        # Set random seed for reproducibility
        np.random.seed(42)
        
        # Generate dates
        start_date = pd.Timestamp('2024-01-01')
        dates = pd.date_range(start=start_date, periods=n_samples, freq='D')
        
        # Create project types
        project_types = ['Commercial', 'Residential', 'Industrial', 'Infrastructure']
        locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        client_types = ['Government', 'Private', 'Corporate', 'Non-Profit']
        
        # Generate sample data
        df = pd.DataFrame({
            'BidDate': dates,
            'ProjectType': np.random.choice(project_types, size=n_samples),
            'Location': np.random.choice(locations, size=n_samples),
            'ClientType': np.random.choice(client_types, size=n_samples),
            'BidAmount': np.random.lognormal(mean=11, sigma=1, size=n_samples),  # Log-normal for realistic bid amounts
            'EstimatedCost': np.random.lognormal(mean=10.8, sigma=1, size=n_samples),
            'CompetitorCount': np.random.randint(1, 8, size=n_samples)
        })
        
        # Generate win status based on a simple model
        prob_win = 1 / (1 + np.exp(-(
            -0.3  # Base rate
            + 0.2 * (df['BidAmount'] < df['EstimatedCost']).astype(int)  # Lower bids more likely to win
            - 0.1 * df['CompetitorCount']  # More competitors reduce win chance
            + np.random.normal(0, 0.5, n_samples)  # Random noise
        )))
        df['WinStatus'] = (np.random.random(n_samples) < prob_win).astype(int)
        
        # Save to CSV
        csv_path = 'sample_bid_data.csv'
        df.to_csv(csv_path, index=False)
        print(f"\nSample data saved to {csv_path}")
        
        # Print statistics
        print("\nDataset Statistics:")
        print(f"Number of records: {len(df):,}")
        print(f"Date range: {df['BidDate'].min()} to {df['BidDate'].max()}")
        print(f"Average bid amount: ${df['BidAmount'].mean():,.2f}")
        print(f"Win rate: {(df['WinStatus'].mean() * 100):.1f}%")
        
        return df
        
    except Exception as e:
        print(f"Error creating sample data: {str(e)}")
        raise

# Create sample dataset
try:
    df = create_sample_data(n_samples=1000)
    print("\nSample data created successfully!")
except Exception as e:
    print(f"Failed to create sample data: {str(e)}")
    raise

Creating sample dataset...

Sample data saved to sample_bid_data.csv

Dataset Statistics:
Number of records: 1,000

Sample data saved to sample_bid_data.csv

Dataset Statistics:
Number of records: 1,000
Date range: 2024-01-01 00:00:00 to 2026-09-26 00:00:00
Average bid amount: $99,880.61
Win rate: 36.9%

Sample data created successfully!
Date range: 2024-01-01 00:00:00 to 2026-09-26 00:00:00
Average bid amount: $99,880.61
Win rate: 36.9%

Sample data created successfully!


In [154]:
# Configuration and Path Setup
def setup_project():
    """Setup project configuration and ensure all prerequisites are met"""
    try:
        print("Setting up project configuration...")
        
        # 1. Create configuration dictionary
        config = {
            'data': {
                'required_columns': ['BidDate', 'BidAmount', 'WinStatus'],
                'optional_columns': ['ProjectType', 'Location', 'ClientType', 'EstimatedCost', 'CompetitorCount']
            },
            'model': {
                'random_state': 42,
                'cv_folds': 5,
                'test_size': 0.2
            },
            'paths': {}
        }
        
        # 2. Setup directory structure
        base_dir = Path().absolute()
        directories = {
            'models': base_dir / 'models',
            'data': base_dir / 'data',
            'config': base_dir / 'config',
            'deploy': base_dir / 'deploy'
        }
        
        # Create directories
        for name, path in directories.items():
            path.mkdir(parents=True, exist_ok=True)
            config['paths'][name] = path
            print(f"Created directory: {path}")
        
        # 3. Define file paths
        config['paths'].update({
            'model_file': directories['models'] / 'xgb_regressor.pkl',
            'encoders_file': directories['models'] / 'encoders.pkl',
            'metadata_file': directories['config'] / 'metadata.json',
            'predictions_file': directories['deploy'] / 'predictions.csv'
        })
        
        # 4. Check for sample data
        if not (base_dir / 'sample_bid_data.csv').exists():
            print("\nNo sample data found. You can create it using:")
            print("df = create_sample_data(n_samples=1000)")
        
        # 5. Save configuration
        config_file = directories['config'] / 'config.json'
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=4, default=str)
        
        print("\nConfiguration saved to:", config_file)
        return config
    
    except Exception as e:
        print(f"Error setting up project: {str(e)}")
        raise

# Setup project configuration
try:
    config = setup_project()
    print("\nProject setup completed successfully!")
except Exception as e:
    print(f"Failed to setup project: {str(e)}")
    raise

Setting up project configuration...
Created directory: c:\Users\prash\bid--recommendation\GSS Bid Models\models
Created directory: c:\Users\prash\bid--recommendation\GSS Bid Models\data
Created directory: c:\Users\prash\bid--recommendation\GSS Bid Models\config
Created directory: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy

Configuration saved to: c:\Users\prash\bid--recommendation\GSS Bid Models\config\config.json

Project setup completed successfully!


# Bid Fee Prediction Model

This notebook implements a machine learning model to predict bid fees based on historical data.

## Features Used
- Property characteristics (Type, Location, Size)
- Market indicators (Population, Income, Business stats)
- Historical bid patterns
- ZIP code patterns

## Model Components
1. Bid Fee Regressor: Predicts optimal bid amount
2. Win Probability Classifier: Estimates chance of winning (if data available)

## Key Requirements
- Input data must include: ZipCode, BidFee (for training)
- Optional but valuable: PropertyType, Market metrics, WinProbability

## Usage
1. Run cells in order
2. Models will be saved to `models/` directory
3. Use `BidPrediction` class for inference

In [158]:
# Data Loading and Validation
def load_and_validate_data():
    """Load and validate bid data"""
    try:
        # First check current directory for bid data files
        data_files = [f for f in os.listdir() if f.endswith('.csv')]
        
        if not data_files:
            raise FileNotFoundError("No CSV files found in the current directory")
            
        print("Available data files:")
        for i, file in enumerate(data_files, 1):
            print(f"{i}. {file}")
            
        # If multiple files found, use the first one
        csv_path = data_files[0]
        print(f"\nLoading data from: {csv_path}")
        
        # Load data with error handling
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            raise ValueError(f"The file {csv_path} is empty")
        except pd.errors.ParserError:
            raise ValueError(f"Error parsing {csv_path}. Make sure it's a valid CSV file")
            
        # Display initial info
        print("\nInitial dataframe info:")
        print(df.info())
        
        # Check columns
        expected_cols = ['BidDate', 'BidAmount', 'WinStatus']
        missing_cols = [col for col in expected_cols if col not in df.columns]
        
        if missing_cols:
            print("\nWarning: Missing expected columns:", missing_cols)
            print("Available columns:", df.columns.tolist())
            
            # Try to identify similar column names
            all_cols = df.columns.str.lower()
            for missing in missing_cols:
                similar = [col for col in df.columns if missing.lower() in col.lower()]
                if similar:
                    print(f"Possible matches for {missing}:", similar)
            
            raise ValueError(f"Required columns missing: {missing_cols}")
        
        # Convert date column
        try:
            df['BidDate'] = pd.to_datetime(df['BidDate'])
        except Exception as e:
            print("Error converting BidDate column:", str(e))
            print("Sample values from BidDate column:", df['BidDate'].head())
            raise ValueError("Could not parse BidDate column")
        
        # Sort by date
        df = df.sort_values('BidDate').reset_index(drop=True)
        
        # Check for missing values
        missing = df[expected_cols].isnull().sum()
        if missing.any():
            print("\nWarning: Missing values detected:")
            print(missing[missing > 0])
            
        # Basic data validation
        if len(df) == 0:
            raise ValueError("DataFrame is empty after loading")
            
        # Print basic statistics
        print("\nDataset Statistics:")
        print(f"Number of records: {len(df):,}")
        print(f"Date range: {df['BidDate'].min()} to {df['BidDate'].max()}")
        print(f"Average bid amount: ${df['BidAmount'].mean():,.2f}")
        print(f"Win rate: {(df['WinStatus'].mean() * 100):.1f}%")
        
        # Save the path for future reference

        csv_path = os.path.abspath(csv_path)
        
        return df
    
    except Exception as e:
        print(f"\nError in data loading: {str(e)}")
        print("\nPlease ensure:")
        print("1. Your CSV file is in the current directory")
        print("2. The file contains the required columns: BidDate, BidAmount, WinStatus")
        print("3. The BidDate column contains valid dates")
        print("4. The BidAmount column contains numeric values")
        print("5. The WinStatus column contains binary values (0/1)")
        raise

# Load and validate the data
try:
    df = load_and_validate_data()
    print("\nData loaded and validated successfully!")
except Exception as e:
    print(f"\nFailed to load data: {str(e)}")
    print("\nPlease fix the data issues and try again.")
    raise

Available data files:
1. sample_bid_data.csv

Loading data from: sample_bid_data.csv

Initial dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   BidDate          1000 non-null   object 
 1   ProjectType      1000 non-null   object 
 2   Location         1000 non-null   object 
 3   ClientType       1000 non-null   object 
 4   BidAmount        1000 non-null   float64
 5   EstimatedCost    1000 non-null   float64
 6   CompetitorCount  1000 non-null   int64  
 7   WinStatus        1000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB
None

Dataset Statistics:
Number of records: 1,000
Date range: 2024-01-01 00:00:00 to 2026-09-26 00:00:00
Average bid amount: $99,880.61
Win rate: 36.9%

Data loaded and validated successfully!


In [159]:
# Feature Engineering - Part 1: Categorical Features
def process_categorical_features(df, categorical_cols=None):
    """Process categorical features with label encoding"""
    try:
        if categorical_cols is None:
            # Identify categorical columns (customize this list based on your data)
            categorical_cols = ['ProjectType', 'Location', 'ClientType']
            
        encoders = {}
        df_encoded = df.copy()
        
        for col in categorical_cols:
            if col in df.columns:
                print(f"\nProcessing {col}:")
                # Print value counts
                print(df[col].value_counts().head())
                
                # Create and fit encoder
                encoders[col] = LabelEncoder()
                df_encoded[col] = encoders[col].fit_transform(df[col].fillna('MISSING'))
                
                # Print mapping for verification
                unique_values = df[col].unique()
                encoded_values = encoders[col].transform(unique_values)
                mapping = dict(zip(unique_values, encoded_values))
                print(f"\n{col} Mapping (first 5):")
                for k, v in list(mapping.items())[:5]:
                    print(f"{k} -> {v}")
                    
        return df_encoded, encoders, categorical_cols
    
    except Exception as e:
        print(f"Error processing categorical features: {str(e)}")
        raise

# Process categorical features
try:
    df_encoded, encoders, categorical_cols = process_categorical_features(df)
    print("\nCategorical features processed successfully!")
except Exception as e:
    print(f"Failed to process categorical features: {str(e)}")
    raise


Processing ProjectType:
ProjectType
Infrastructure    280
Commercial        258
Industrial        232
Residential       230
Name: count, dtype: int64

ProjectType Mapping (first 5):
Industrial -> 1
Infrastructure -> 2
Commercial -> 0
Residential -> 3

Processing Location:
Location
New York       213
Phoenix        209
Chicago        205
Los Angeles    202
Houston        171
Name: count, dtype: int64

Location Mapping (first 5):
New York -> 3
Phoenix -> 4
Houston -> 1
Chicago -> 0
Los Angeles -> 2

Processing ClientType:
ClientType
Private       262
Government    258
Non-Profit    251
Corporate     229
Name: count, dtype: int64

ClientType Mapping (first 5):
Private -> 3
Corporate -> 0
Non-Profit -> 2
Government -> 1

Categorical features processed successfully!


In [160]:
# Feature Engineering - Part 2: Time-based Features
def create_time_features(df):
    """Create time-based features and rolling statistics"""
    try:
        df_time = df.copy()
        
        # Basic time features
        df_time['Year'] = df_time['BidDate'].dt.year
        df_time['Month'] = df_time['BidDate'].dt.month
        df_time['DayOfWeek'] = df_time['BidDate'].dt.dayofweek
        df_time['Quarter'] = df_time['BidDate'].dt.quarter
        
        # Rolling statistics
        windows = [7, 30, 90]  # Days for rolling windows
        for window in windows:
            print(f"\nCalculating {window}-day rolling features...")
            
            # Rolling mean bid amount
            df_time[f'RollingMeanBid_{window}d'] = (
                df_time['BidAmount']
                .rolling(window=window, min_periods=1)
                .mean()
            )
            
            # Rolling win rate
            df_time[f'RollingWinRate_{window}d'] = (
                df_time['WinStatus']
                .rolling(window=window, min_periods=1)
                .mean()
            )
            
            # Rolling bid count
            df_time[f'RollingBidCount_{window}d'] = (
                df_time['BidAmount']
                .rolling(window=window, min_periods=1)
                .count()
            )
        
        # Fill any NaN values from rolling calculations
        rolling_cols = [col for col in df_time.columns if 'Rolling' in col]
        df_time[rolling_cols] = df_time[rolling_cols].fillna(method='bfill').fillna(0)
        
        print("\nTime features created:")
        print("\nBasic time features:", ['Year', 'Month', 'DayOfWeek', 'Quarter'])
        print("\nRolling features:", rolling_cols)
        
        return df_time
    
    except Exception as e:
        print(f"Error creating time features: {str(e)}")
        raise

# Create time-based features
try:
    df_features = create_time_features(df_encoded)
    print("\nTime-based features created successfully!")
except Exception as e:
    print(f"Failed to create time features: {str(e)}")
    raise


Calculating 7-day rolling features...

Calculating 30-day rolling features...

Calculating 90-day rolling features...

Time features created:

Basic time features: ['Year', 'Month', 'DayOfWeek', 'Quarter']

Rolling features: ['RollingMeanBid_7d', 'RollingWinRate_7d', 'RollingBidCount_7d', 'RollingMeanBid_30d', 'RollingWinRate_30d', 'RollingBidCount_30d', 'RollingMeanBid_90d', 'RollingWinRate_90d', 'RollingBidCount_90d']

Time-based features created successfully!


In [161]:
# Feature Selection and Matrix Preparation
def prepare_feature_matrices(df, categorical_cols):
    """Prepare feature matrices for training"""
    try:
        # Define feature columns
        time_cols = ['Year', 'Month', 'DayOfWeek', 'Quarter']
        rolling_cols = [col for col in df.columns if 'Rolling' in col]
        
        # Combine all feature columns
        feature_cols = categorical_cols + time_cols + rolling_cols
        print("\nSelected features:", feature_cols)
        
        # Create feature matrix X
        X = df[feature_cols].copy()
        
        # Create target variables
        y_reg = df['BidAmount']  # For regression (bid amount)
        
        # Basic feature statistics
        print("\nFeature Statistics:")
        print(X.describe().round(2))
        
        # Check for any remaining missing values
        missing = X.isnull().sum()
        if missing.any():
            print("\nWarning: Missing values detected in features:")
            print(missing[missing > 0])
            
        # Simple correlation analysis with target
        correlations = pd.DataFrame({
            'feature': feature_cols,
            'correlation': [X[col].corr(y_reg) for col in feature_cols]
        }).sort_values('correlation', key=abs, ascending=False)
        
        print("\nTop 10 feature correlations with bid amount:")
        print(correlations.head(10))
        
        return X, y_reg, feature_cols
    
    except Exception as e:
        print(f"Error preparing feature matrices: {str(e)}")
        raise

# Prepare feature matrices
try:
    X, y_reg, feature_cols = prepare_feature_matrices(df_features, categorical_cols)
    print("\nFeature matrices prepared successfully!")
except Exception as e:
    print(f"Failed to prepare feature matrices: {str(e)}")
    raise


Selected features: ['ProjectType', 'Location', 'ClientType', 'Year', 'Month', 'DayOfWeek', 'Quarter', 'RollingMeanBid_7d', 'RollingWinRate_7d', 'RollingBidCount_7d', 'RollingMeanBid_30d', 'RollingWinRate_30d', 'RollingBidCount_30d', 'RollingMeanBid_90d', 'RollingWinRate_90d', 'RollingBidCount_90d']

Feature Statistics:
       ProjectType  Location  ClientType     Year    Month  DayOfWeek  \
count      1000.00   1000.00     1000.00  1000.00  1000.00     1000.0   
mean          1.48      2.05        1.55  2024.90     6.10        3.0   
std           1.11      1.43        1.11     0.79     3.31        2.0   
min           0.00      0.00        0.00  2024.00     1.00        0.0   
25%           0.00      1.00        1.00  2024.00     3.00        1.0   
50%           2.00      2.00        2.00  2025.00     6.00        3.0   
75%           2.00      3.00        3.00  2026.00     9.00        5.0   
max           3.00      4.00        3.00  2026.00    12.00        6.0   

       Quarter  Roll

In [162]:
# Model Training and Cross-validation
def train_model(X, y, feature_cols):
    """Train XGBoost regressor with cross-validation"""
    try:
        # Initialize time-series cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        cv_scores = {
            'rmse': [],
            'mae': [],
            'r2': []
        }
        
        # Initialize model with robust parameters
        model = XGBRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=6,
            min_child_weight=2,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )
        
        print("Starting cross-validation...")
        
        # Perform cross-validation
        for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            
            # Ensure non-negative target values
            y_train = y_train.clip(lower=0)
            y_test = y_test.clip(lower=0)
            
            print(f"\nFold {fold}:")
            print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
            
            # Train model
            model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                verbose=False
            )
            
            # Make predictions
            y_pred = model.predict(X_test)
            y_pred = np.maximum(y_pred, 0)  # Ensure non-negative predictions
            
            # Calculate metrics
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            cv_scores['rmse'].append(rmse)
            cv_scores['mae'].append(mae)
            cv_scores['r2'].append(r2)
            
            print(f"RMSE: ${rmse:,.2f}")
            print(f"MAE: ${mae:,.2f}")
            print(f"R2: {r2:.3f}")
        
        # Print average scores
        print("\nCross-validation Average Results:")
        print(f"Average RMSE: ${np.mean(cv_scores['rmse']):,.2f} ± ${np.std(cv_scores['rmse']):,.2f}")
        print(f"Average MAE: ${np.mean(cv_scores['mae']):,.2f} ± ${np.std(cv_scores['mae']):,.2f}")
        print(f"Average R2: {np.mean(cv_scores['r2']):.3f} ± {np.std(cv_scores['r2']):.3f}")
        
        return model, cv_scores
    
    except Exception as e:
        print(f"Error in model training: {str(e)}")
        raise

# Train the model
try:
    model, cv_scores = train_model(X, y_reg, feature_cols)
    print("\nModel training completed successfully!")
except Exception as e:
    print(f"Failed to train model: {str(e)}")
    raise

Starting cross-validation...

Fold 1:
Train size: 170, Test size: 166
RMSE: $152,896.02
MAE: $93,466.27
R2: -0.063

Fold 2:
Train size: 336, Test size: 166
RMSE: $152,896.02
MAE: $93,466.27
R2: -0.063

Fold 2:
Train size: 336, Test size: 166
RMSE: $95,308.43
MAE: $64,744.55
R2: -0.215

Fold 3:
Train size: 502, Test size: 166
RMSE: $95,308.43
MAE: $64,744.55
R2: -0.215

Fold 3:
Train size: 502, Test size: 166
RMSE: $146,261.13
MAE: $84,980.97
R2: 0.133

Fold 4:
Train size: 668, Test size: 166
RMSE: $146,261.13
MAE: $84,980.97
R2: 0.133

Fold 4:
Train size: 668, Test size: 166
RMSE: $103,259.17
MAE: $71,340.92
R2: 0.009

Fold 5:
Train size: 834, Test size: 166
RMSE: $103,259.17
MAE: $71,340.92
R2: 0.009

Fold 5:
Train size: 834, Test size: 166
RMSE: $127,446.93
MAE: $82,243.77
R2: -0.014

Cross-validation Average Results:
Average RMSE: $125,034.34 ± $22,761.63
Average MAE: $79,355.30 ± $10,166.76
Average R2: -0.030 ± 0.113

Model training completed successfully!
RMSE: $127,446.93
MAE: $8

In [163]:
# Model Analysis and Diagnostics
def analyze_model(model, X, y, feature_cols):
    """Analyze model performance and generate diagnostics"""
    try:
        print("Analyzing model performance...")
        
        # Generate predictions on full dataset
        y_pred = model.predict(X)
        y_pred = np.maximum(y_pred, 0)  # Ensure non-negative predictions
        
        # Calculate error metrics
        mse = mean_squared_error(y, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        
        print("\nOverall Performance Metrics:")
        print(f"RMSE: ${rmse:,.2f}")
        print(f"MAE: ${mae:,.2f}")
        print(f"R2 Score: {r2:.3f}")
        
        # Feature importance analysis
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nTop 10 Most Important Features:")
        print(importance_df.head(10))
        
        # Error analysis
        errors = y - y_pred
        error_percentiles = np.percentile(np.abs(errors), [25, 50, 75, 90, 95, 99])
        
        print("\nError Distribution:")
        print(f"25th percentile: ${error_percentiles[0]:,.2f}")
        print(f"Median error: ${error_percentiles[1]:,.2f}")
        print(f"75th percentile: ${error_percentiles[2]:,.2f}")
        print(f"90th percentile: ${error_percentiles[3]:,.2f}")
        print(f"95th percentile: ${error_percentiles[4]:,.2f}")
        print(f"99th percentile: ${error_percentiles[5]:,.2f}")
        
        # Prediction calibration analysis
        pred_ranges = pd.qcut(y_pred, q=10)
        calibration_df = pd.DataFrame({
            'predicted': y_pred,
            'actual': y
        }).groupby(pred_ranges).agg({
            'predicted': ['mean', 'count'],
            'actual': 'mean'
        })
        
        print("\nPrediction Calibration Analysis:")
        print(calibration_df)
        
        # Save analysis results
        analysis_results = {
            'metrics': {
                'rmse': rmse,
                'mae': mae,
                'r2': r2
            },
            'feature_importance': importance_df,
            'error_percentiles': error_percentiles,
            'calibration': calibration_df
        }
        
        return analysis_results
    
    except Exception as e:
        print(f"Error in model analysis: {str(e)}")
        raise

# Analyze the model
try:
    analysis_results = analyze_model(model, X, y_reg, feature_cols)
    print("\nModel analysis completed successfully!")
except Exception as e:
    print(f"Failed to analyze model: {str(e)}")
    raise

Analyzing model performance...

Overall Performance Metrics:
RMSE: $60,072.46
MAE: $33,897.30
R2 Score: 0.778

Top 10 Most Important Features:
               feature  importance
0          ProjectType    0.122013
7    RollingMeanBid_7d    0.109138
5            DayOfWeek    0.103015
13  RollingMeanBid_90d    0.070718
11  RollingWinRate_30d    0.070135
2           ClientType    0.065170
4                Month    0.063771
10  RollingMeanBid_30d    0.062399
1             Location    0.061835
8    RollingWinRate_7d    0.060781

Error Distribution:
25th percentile: $9,506.50
Median error: $21,090.10
75th percentile: $40,710.43
90th percentile: $75,801.98
95th percentile: $99,934.75
99th percentile: $192,752.20

Prediction Calibration Analysis:
                               predicted               actual
                                    mean count           mean
(3214.966, 41457.214]       33527.226562   100   20763.656998
(41457.214, 50317.274]      46242.910156   100   33251.537094
(503

In [166]:
# Model Saving and Deployment Setup
def save_model_artifacts(model, config, artifacts):
    """Save model and related artifacts"""
    try:
        print("Saving model artifacts...")
        
        # Create deployment directory if it doesn't exist
        config['deploy_dir'].mkdir(parents=True, exist_ok=True)
        
        # Save model
        with open(config['model_path'], 'wb') as f:
            pickle.dump(model, f)
        print(f"Model saved to: {config['model_path']}")
        
        # Save encoders
        with open(config['encoders_path'], 'wb') as f:
            pickle.dump(artifacts['encoders'], f)
        print(f"Encoders saved to: {config['encoders_path']}")
        
        # Save metadata
        metadata = {
            'feature_cols': artifacts['feature_cols'],
            'categorical_cols': artifacts['categorical_cols'],
            'metrics': artifacts['analysis_results']['metrics'],
            'timestamp': datetime.now().isoformat(),
            'python_version': sys.version,
            'xgboost_version': xgb.__version__,
            'sklearn_version': pd.__version__
        }
        
        with open(config['metadata_path'], 'w') as f:
            json.dump(metadata, f, indent=4)
        print(f"Metadata saved to: {config['metadata_path']}")
        
        return True
    
    except Exception as e:
        print(f"Error saving model artifacts: {str(e)}")
        raise

# Create artifacts dictionary
artifacts = {
    'model': model,
    'encoders': encoders,
    'feature_cols': feature_cols,
    'categorical_cols': categorical_cols,
    'cv_scores': cv_scores,
    'analysis_results': analysis_results
}

# Save model and artifacts
try:
    # Build a deployment-style config expected by save_model_artifacts
    try:
        deploy_base = Path(config.get('paths', {}).get('deploy', Path('deployment')))
    except Exception:
        deploy_base = Path('deployment')

    models_dir = deploy_base / 'models'
    metadata_dir = deploy_base / 'config'

    # Ensure directories exist
    models_dir.mkdir(parents=True, exist_ok=True)
    metadata_dir.mkdir(parents=True, exist_ok=True)

    # Prepare config with the exact keys save_model_artifacts expects
    save_config = {
        'deploy_dir': deploy_base,
        'model_path': models_dir / 'bid_fee_model.pkl',
        'encoders_path': models_dir / 'encoders.pkl',
        'metadata_path': metadata_dir / 'metadata.json'
    }

    # Call saver
    success = save_model_artifacts(model, save_config, artifacts)
    if success:
        print("\nModel and artifacts saved successfully!")
        print("\nModel is ready for deployment with the following files:")
        # Print files and keep backward-compatible keys in the original config
        print(f"- Model: {save_config['model_path']}")
        print(f"- Encoders: {save_config['encoders_path']}")
        print(f"- Metadata: {save_config['metadata_path']}")
        # update original config dict so subsequent code referencing `config[...]` won't fail
        if isinstance(config, dict):
            config['model_path'] = save_config['model_path']
            config['encoders_path'] = save_config['encoders_path']
            config['metadata_path'] = save_config['metadata_path']
        print(f"- Encoders: {config['encoders_path']}")
        print(f"- Metadata: {config['metadata_path']}")
except Exception as e:
    print(f"Failed to save model artifacts: {str(e)}")
    raise

Saving model artifacts...
Model saved to: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\models\bid_fee_model.pkl
Encoders saved to: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\models\encoders.pkl
Metadata saved to: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\config\metadata.json

Model and artifacts saved successfully!

Model is ready for deployment with the following files:
- Model: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\models\bid_fee_model.pkl
- Encoders: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\models\encoders.pkl
- Metadata: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\config\metadata.json
- Encoders: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\models\encoders.pkl
- Metadata: c:\Users\prash\bid--recommendation\GSS Bid Models\deploy\config\metadata.json


# Bid Recommendation Model

This notebook implements a machine learning pipeline for bid recommendations, including:
1. Data loading and preprocessing
2. Feature engineering
3. Model training (regressor for bid amount, classifier for win probability)
4. Model evaluation
5. Production deployment helpers

## Setup Instructions
1. Install required packages (first cell)
2. Optional: Set up FRED API key for economic indicators
3. Run cells in order
4. Models will be saved to `models/` directory

## Notes
- Uses time-series cross-validation
- Handles missing values and outliers
- Includes feature importance analysis
- Provides production-ready inference class

In [167]:
# Cell 1: Install and import dependencies
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn fredapi shap --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

plt.style.use("seaborn-v0_8-whitegrid")



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Cell 2: Load CSV
csv_path = r"D:\Global stat solutions\GSS\Bid Recommendation System\Prerequisites\bidrecommendation.csv"  # <-- yahan apne dataset ka naam daal
df = pd.read_csv(csv_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


Dataset shape: (156733, 51)
Columns: ['BidId', 'BidFileNumber', 'BidName', 'BidDate', 'Bid_DueDate', 'BidFee', 'TargetTime', 'WinProbability', 'BidStatusName', 'Bid_JobPurpose', 'Bid_Deliverable', 'Market', 'Submarket', 'BusinessSegment', 'BusinessSegmentDetail', 'DistanceInMiles', 'Bid_Property_Type', 'PropertyId', 'OfficeId', 'PropertyName', 'PropertyType', 'SubType', 'PropertyCity', 'PropertyState', 'RooftopLongitude', 'RooftopLatitude', 'ZipCode', 'JobCount', 'IECount', 'LeaseCount', 'SaleCount', 'MarketOrientation', 'AddressDisplayCalc', 'GrossBuildingAreaRange', 'YearBuiltRange', 'OfficeCode', 'OfficeCompanyName', 'OfficeLocation', 'JobId', 'JobName', 'JobStatus', 'PopulationEstimate', 'AverageHouseValue', 'IncomePerHousehold', 'MedianAge', 'DeliveryTotal', 'NumberofBusinesses', 'NumberofEmployees', 'ZipPopulation', 'BidCompanyName', 'BidCompanyType']


Unnamed: 0,BidId,BidFileNumber,BidName,BidDate,Bid_DueDate,BidFee,TargetTime,WinProbability,BidStatusName,Bid_JobPurpose,...,PopulationEstimate,AverageHouseValue,IncomePerHousehold,MedianAge,DeliveryTotal,NumberofBusinesses,NumberofEmployees,ZipPopulation,BidCompanyName,BidCompanyType
0,225469,B179-2025-000742,36 US-87,9/7/2025,,,,,Active,,...,5225.0,285400.0,68767.0,48.700001,2970.0,153.0,1615.0,5868.0,Jefferson Bank,Banks / Credit Institutions
1,225471,B185-2025-000244,Chase Bank Ground Lease,9/7/2025,,,,,Active,,...,24723.0,481100.0,87434.0,36.599998,12443.0,1086.0,12795.0,23847.0,Integra Realty Resources - Dallas,Professional Service (PS)
2,225470,B179-2025-000743,Winston Hills Shopping Center,9/7/2025,,,,,Active,,...,44678.0,159900.0,49050.0,36.099998,18961.0,771.0,13868.0,40589.0,Texas Partners Bank,Banks / Credit Institutions
3,225383,B130-2025-000590,Vacant Residential Land,9/5/2025,9/15/2025,3700.0,10.0,,Won,,...,3900.0,177400.0,63476.0,44.400002,2276.0,70.0,299.0,3834.0,One Florida Bank,Banks / Credit Institutions
4,225385,B142-2025-000073,Somerset Apartments,9/5/2025,,4000.0,30.0,,Active,,...,35902.0,94000.0,45280.0,34.400002,17280.0,543.0,12051.0,33702.0,,


In [None]:
# Cell 3: Convert BidDate and extract time features
df['BidDate'] = pd.to_datetime(df['BidDate'], errors='coerce')
df = df.dropna(subset=['BidDate']).reset_index(drop=True)

df['BidWeek'] = df['BidDate'].dt.isocalendar().week
df['BidMonth'] = df['BidDate'].dt.month
df['BidYear'] = df['BidDate'].dt.year
df['BidWeekStart'] = df['BidDate'].dt.to_period('W').apply(lambda r: r.start_time)

print("Date range:", df['BidDate'].min(), "to", df['BidDate'].max())
df[['BidDate','BidWeek','BidMonth','BidYear']].head()


Date range: 2018-01-02 00:00:00 to 2025-09-07 00:00:00


Unnamed: 0,BidDate,BidWeek,BidMonth,BidYear
0,2025-09-07,36,9,2025
1,2025-09-07,36,9,2025
2,2025-09-07,36,9,2025
3,2025-09-05,36,9,2025
4,2025-09-05,36,9,2025


In [None]:
# Cell 4: Aggregate median BidFee per ZIP-week
agg = df.groupby(['ZipCode','BidWeekStart']).agg(
    median_BidFee=('BidFee','median'),
    week_win_rate=('WinProbability','mean')
).reset_index()

print("Weekly agg shape:", agg.shape)
agg.head()


Weekly agg shape: (68067, 4)


Unnamed: 0,ZipCode,BidWeekStart,median_BidFee,week_win_rate
0,0,2018-02-12,4200.0,
1,0,2018-02-19,4800.0,
2,0,2018-02-26,5000.0,
3,0,2018-03-26,3250.0,
4,0,2018-04-09,1612.5,


In [None]:
# Cell 5: Lag and Rolling Features
for lag in [1,2,3,4]:
    agg[f'lag_{lag}'] = agg.groupby('ZipCode')['median_BidFee'].shift(lag)

agg['roll_4w'] = (agg.groupby('ZipCode')['median_BidFee']
                  .shift(1).rolling(window=4,min_periods=1).mean()
                  .reset_index(level=0,drop=True))

agg = agg.dropna(subset=['lag_1','lag_2','lag_3'])
print("After lag creation:", agg.shape)
agg.head()


After lag creation: (48470, 9)


Unnamed: 0,ZipCode,BidWeekStart,median_BidFee,week_win_rate,lag_1,lag_2,lag_3,lag_4,roll_4w
3,0,2018-03-26,3250.0,,5000.0,4800.0,4200.0,,4666.666667
4,0,2018-04-09,1612.5,,3250.0,5000.0,4800.0,4200.0,4312.5
5,0,2018-05-21,4668.75,,1612.5,3250.0,5000.0,4800.0,3665.625
6,0,2018-07-09,3500.0,,4668.75,1612.5,3250.0,5000.0,3632.8125
7,0,2018-07-30,2000.0,,3500.0,4668.75,1612.5,3250.0,3257.8125


In [None]:
# Feature Engineering

print("Starting feature engineering...")

# 0. Clean target variable
print("\nCleaning target variable...")
if 'BidFee' in df.columns:
    print(f"Initial BidFee range: ${df['BidFee'].min():.2f} to ${df['BidFee'].max():.2f}")
    print(f"NaN values in BidFee: {df['BidFee'].isna().sum()}")
    print(f"Infinite values in BidFee: {np.isinf(df['BidFee']).sum()}")
    
    # Remove rows with invalid BidFee
    df = df[~df['BidFee'].isna()]
    df = df[~np.isinf(df['BidFee'])]
    df = df[df['BidFee'] >= 0]  # Remove negative fees
    df = df[df['BidFee'] <= df['BidFee'].quantile(0.99)]  # Remove extreme outliers
    
    print(f"\nAfter cleaning:")
    print(f"BidFee range: ${df['BidFee'].min():.2f} to ${df['BidFee'].max():.2f}")
    print(f"Number of rows: {len(df)}")

print("\nAvailable columns:", ', '.join(df.columns))

# 1. Handle categorical columns
print("\nEncoding categorical columns...")
categorical_cols = [
    'ZipCode', 
    'PropertyType',
    'SubType',
    'PropertyState',
    'Market',
    'Submarket',
    'BusinessSegment',
    'BidCompanyType'
]
cat_cols = [col for col in categorical_cols if col in df.columns]
print(f"Processing categorical columns: {cat_cols}")

encoders = {}
for col in cat_cols:
    if col in df.columns:
        encoders[col] = LabelEncoder()
        df[f'{col}_encoded'] = encoders[col].fit_transform(df[col].astype(str).fillna('MISSING'))

# 2. Create time-based features
print("\nCreating time-based features...")
if 'BidDate' in df.columns:
    df['BidDate'] = pd.to_datetime(df['BidDate'])
    df['Year'] = df['BidDate'].dt.year
    df['Month'] = df['BidDate'].dt.month
    df['Week'] = df['BidDate'].dt.isocalendar().week
    df['DayOfWeek'] = df['BidDate'].dt.dayofweek

# 3. Calculate market-based features
print("\nCalculating market-based features...")
market_cols = [
    'PopulationEstimate',
    'AverageHouseValue',
    'IncomePerHousehold',
    'MedianAge',
    'NumberofBusinesses',
    'NumberofEmployees',
    'ZipPopulation'
]

for col in market_cols:
    if col in df.columns:
        # Calculate ZIP-wise statistics
        df[f'{col}_zip_ratio'] = df[col] / df.groupby('ZipCode')[col].transform('mean')

# 4. Calculate rolling statistics by ZIP code
print("\nCalculating rolling statistics...")
if 'BidDate' in df.columns and 'ZipCode' in df.columns:
    df = df.sort_values(['ZipCode', 'BidDate'])
    
    for window in [7, 30, 90]:  # Days-based windows
        # Bid fee statistics
        if 'BidFee' in df.columns:
            df[f'rolling_{window}d_mean_fee'] = df.groupby('ZipCode')['BidFee'].transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
            df[f'rolling_{window}d_median_fee'] = df.groupby('ZipCode')['BidFee'].transform(
                lambda x: x.rolling(window, min_periods=1).median()
            )
        
        # Win probability statistics if available
        if 'WinProbability' in df.columns:
            df[f'rolling_{window}d_win_rate'] = df.groupby('ZipCode')['WinProbability'].transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )

# 5. Business type and property features
print("\nCreating business and property features...")
if 'BidCompanyType' in df.columns:
    # Company type frequency encoding
    type_freq = df['BidCompanyType'].value_counts(normalize=True).to_dict()
    df['company_type_freq'] = df['BidCompanyType'].map(type_freq)

if 'PropertyType' in df.columns:
    # Property type frequency encoding
    prop_freq = df['PropertyType'].value_counts(normalize=True).to_dict()
    df['property_type_freq'] = df['PropertyType'].map(prop_freq)

# 6. Distance and location features
if 'DistanceInMiles' in df.columns:
    df['distance_bucket'] = pd.qcut(df['DistanceInMiles'], q=5, labels=['VeryClose', 'Close', 'Medium', 'Far', 'VeryFar'])
    df['distance_bucket_encoded'] = LabelEncoder().fit_transform(df['distance_bucket'])

# 7. Handle missing values
print("\nHandling missing values...")
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df[col].isna().any():
        # Fill with median by group if possible
        try:
            df[col] = df.groupby('ZipCode')[col].transform(
                lambda x: x.fillna(x.median())
            )
        except:
            df[col] = df[col].fillna(df[col].median())

# 8. Prepare feature matrix
print("\nPreparing feature matrix...")
base_features = [
    # Encoded categoricals
    *[f'{col}_encoded' for col in cat_cols],
    
    # Time features
    'Year', 'Month', 'Week', 'DayOfWeek',
    
    # Market features
    *[f'{col}_zip_ratio' for col in market_cols if f'{col}_zip_ratio' in df.columns],
    
    # Rolling stats
    *[col for col in df.columns if col.startswith('rolling_')],
    
    # Business and property features
    'company_type_freq', 'property_type_freq',
    
    # Original numeric features
    'DistanceInMiles', 'JobCount', 'IECount', 'LeaseCount', 'SaleCount',
    'PopulationEstimate', 'AverageHouseValue', 'IncomePerHousehold',
    'MedianAge', 'NumberofBusinesses', 'NumberofEmployees'
]

# Only use features that exist
feature_cols = [col for col in base_features if col in df.columns]
print("\nAvailable features:", ', '.join(feature_cols))

# Create feature matrix
X = df[feature_cols].copy()
print(f"\nFeature matrix shape: {X.shape}")

# 9. Prepare target variables
print("\nPreparing target variables...")
if 'BidFee' in df.columns:
    target_reg = 'BidFee'
    y_reg = df[target_reg]
    print(f"Regression target: {target_reg}")
    print(f"Target range: ${y_reg.min():.2f} to ${y_reg.max():.2f}")
else:
    raise ValueError("BidFee column not found - required for training!")

# Classifier target if available
if 'WinProbability' in df.columns:
    y_clf = (df['WinProbability'] > df['WinProbability'].median()).astype(int)
    print("Classification target available")
else:
    y_clf = None
    print("No classification target available")

# Save variables to global scope
globals().update({
    'X': X,
    'y_reg': y_reg,
    'y_clf': y_clf,
    'target_reg': target_reg,
    'feature_cols': feature_cols,
    'encoders': encoders,
    'cat_cols': cat_cols
})

print("\nFeature engineering complete.")

Starting feature engineering...

Cleaning target variable...
Initial BidFee range: $0.00 to $100000000.00
NaN values in BidFee: 2861
Infinite values in BidFee: 0

After cleaning:
BidFee range: $0.00 to $15000.00
Number of rows: 152447

Available columns: BidId, BidFileNumber, BidName, BidDate, Bid_DueDate, BidFee, TargetTime, WinProbability, BidStatusName, Bid_JobPurpose, Bid_Deliverable, Market, Submarket, BusinessSegment, BusinessSegmentDetail, DistanceInMiles, Bid_Property_Type, PropertyId, OfficeId, PropertyName, PropertyType, SubType, PropertyCity, PropertyState, RooftopLongitude, RooftopLatitude, ZipCode, JobCount, IECount, LeaseCount, SaleCount, MarketOrientation, AddressDisplayCalc, GrossBuildingAreaRange, YearBuiltRange, OfficeCode, OfficeCompanyName, OfficeLocation, JobId, JobName, JobStatus, PopulationEstimate, AverageHouseValue, IncomePerHousehold, MedianAge, DeliveryTotal, NumberofBusinesses, NumberofEmployees, ZipPopulation, BidCompanyName, BidCompanyType, BidWeek, BidMon

In [None]:
# Model Training and Validation

print("Starting model training...")

# 0. Data validation
print("\nValidating input data...")
if len(X) == 0:
    raise ValueError("Empty feature matrix")
if len(y_reg) == 0:
    raise ValueError("Empty target variable")
if X.isna().any().any():
    print("Warning: NaN values found in features. Filling with 0...")
    X = X.fillna(0)
if y_reg.isna().any():
    print("Warning: NaN values found in target. Removing...")
    valid_idx = ~y_reg.isna()
    X = X.loc[valid_idx]
    y_reg = y_reg.loc[valid_idx]

print(f"Training with {len(X)} samples and {X.shape[1]} features")
print(f"Target range: ${y_reg.min():.2f} to ${y_reg.max():.2f}")

# 1. Define time-based cross-validation
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = {
    'rmse': [],
    'mae': [],
    'r2': []
}

# 2. Initialize model with robust parameters
xgb_reg = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Sort data by date for time-based splitting
if 'BidDate' in df.columns:
    sort_idx = df['BidDate'].sort_values().index
    X = X.loc[sort_idx]
    y_reg = y_reg.loc[sort_idx]

print("\nPerforming cross-validation...")
for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
    try:
        # Split data
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_reg.iloc[train_idx], y_reg.iloc[test_idx]
        
        # Ensure positive target values
        y_train = y_train.clip(lower=0)
        y_test = y_test.clip(lower=0)
        
        # Train model
        xgb_reg.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=False
        )
        
        # Make predictions
        y_pred = xgb_reg.predict(X_test)
        
        # Ensure non-negative predictions
        y_pred = np.maximum(y_pred, 0)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        cv_scores['rmse'].append(rmse)
        cv_scores['mae'].append(mae)
        cv_scores['r2'].append(r2)
        
        print(f"\nFold {fold} Results:")
        print(f"RMSE: ${rmse:.2f}")
        print(f"MAE: ${mae:.2f}")
        print(f"R2 Score: {r2:.3f}")
    
    except Exception as e:
        print(f"Error in fold {fold}: {str(e)}")
        continue

if len(cv_scores['rmse']) == 0:
    raise ValueError("All cross-validation folds failed")

# Print average cross-validation scores
print("\nCross-validation Average Results:")
print(f"Average RMSE: ${np.mean(cv_scores['rmse']):.2f} ± ${np.std(cv_scores['rmse']):.2f}")
print(f"Average MAE: ${np.mean(cv_scores['mae']):.2f} ± ${np.std(cv_scores['mae']):.2f}")
print(f"Average R2: {np.mean(cv_scores['r2']):.3f} ± {np.std(cv_scores['r2']):.3f}")

# Train final model on full dataset
print("\nTraining final model on full dataset...")
try:
    # Ensure positive target values
    y_reg_clean = y_reg.clip(lower=0)
    
    # Train final model
    xgb_reg.fit(
        X, y_reg_clean,
        eval_set=[(X, y_reg_clean)],
        verbose=False
    )
    
    # Feature importance
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': xgb_reg.feature_importances_
    })
    importance_df = importance_df.sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))
    
    # Calculate error distribution
    y_pred_full = xgb_reg.predict(X)
    y_pred_full = np.maximum(y_pred_full, 0)  # Ensure non-negative predictions
    errors = y_reg_clean - y_pred_full
    error_percentiles = np.percentile(np.abs(errors), [25, 50, 75, 90, 95, 99])
    
    print("\nError Distribution:")
    print(f"25th percentile: ${error_percentiles[0]:.2f}")
    print(f"Median error: ${error_percentiles[1]:.2f}")
    print(f"75th percentile: ${error_percentiles[2]:.2f}")
    print(f"90th percentile: ${error_percentiles[3]:.2f}")
    print(f"95th percentile: ${error_percentiles[4]:.2f}")
    print(f"99th percentile: ${error_percentiles[5]:.2f}")
    
    # Calculate prediction calibration
    pred_ranges = pd.qcut(y_pred_full, q=10)
    calibration_df = pd.DataFrame({
        'predicted': y_pred_full,
        'actual': y_reg_clean
    }).groupby(pred_ranges).agg({
        'predicted': ['mean', 'count'],
        'actual': 'mean'
    })
    
    print("\nPrediction Calibration:")
    print(calibration_df)
    
    # Save model artifacts
    artifacts = {
        'model': xgb_reg,
        'feature_cols': feature_cols,
        'encoders': encoders,
        'cat_cols': cat_cols,
        'cv_scores': cv_scores,
        'feature_importance': importance_df,
        'error_percentiles': error_percentiles,
        'preprocessing_params': {
            'categorical_cols': categorical_cols,
            'market_cols': market_cols
        }
    }
    
    # Save variables to global scope
    globals().update({
        'xgb_reg': xgb_reg,
        'artifacts': artifacts,
        'importance_df': importance_df,
        'cv_scores': cv_scores,
        'error_percentiles': error_percentiles
    })
    
    print("\nModel training completed successfully!")
    
except Exception as e:
    raise RuntimeError(f"Error training final model: {str(e)}")

Starting model training...

Validating input data...
Training with 152447 samples and 41 features
Target range: $0.00 to $15000.00



Performing cross-validation...

Fold 1 Results:
RMSE: $1341.86
MAE: $826.27
R2 Score: 0.470

Fold 1 Results:
RMSE: $1341.86
MAE: $826.27
R2 Score: 0.470

Fold 2 Results:
RMSE: $1415.53
MAE: $903.73
R2 Score: 0.416

Fold 2 Results:
RMSE: $1415.53
MAE: $903.73
R2 Score: 0.416

Fold 3 Results:
RMSE: $1475.95
MAE: $961.08
R2 Score: 0.377

Fold 3 Results:
RMSE: $1475.95
MAE: $961.08
R2 Score: 0.377

Fold 4 Results:
RMSE: $1378.92
MAE: $845.23
R2 Score: 0.422

Fold 4 Results:
RMSE: $1378.92
MAE: $845.23
R2 Score: 0.422

Fold 5 Results:
RMSE: $1372.13
MAE: $841.97
R2 Score: 0.437

Cross-validation Average Results:
Average RMSE: $1396.88 ± $45.96
Average MAE: $875.66 ± $50.17
Average R2: 0.424 ± 0.030

Training final model on full dataset...

Fold 5 Results:
RMSE: $1372.13
MAE: $841.97
R2 Score: 0.437

Cross-validation Average Results:
Average RMSE: $1396.88 ± $45.96
Average MAE: $875.66 ± $50.17
Average R2: 0.424 ± 0.030

Training final model on full dataset...

Top 10 Most Important Feature

In [None]:
# Production Deployment Setup

import joblib
import os
import json
import datetime
from pathlib import Path

# Create deployment directories
deploy_dir = Path('deployment')
model_dir = deploy_dir / 'models'
config_dir = deploy_dir / 'config'

for d in [deploy_dir, model_dir, config_dir]:
    d.mkdir(parents=True, exist_ok=True)

print("Setting up production deployment...")

# 1. Save model and artifacts
print("\nSaving model artifacts...")

# Save main model
model_path = model_dir / 'bid_fee_model.joblib'
joblib.dump(artifacts['model'], model_path)
print(f"Model saved to {model_path}")

# Save encoders and configuration
config = {
    'feature_cols': artifacts['feature_cols'],
    'categorical_cols': artifacts['preprocessing_params']['categorical_cols'],
    'market_cols': artifacts['preprocessing_params']['market_cols'],
    'model_info': {
        'type': type(artifacts['model']).__name__,
        'version': '1.0.0',
        'created_at': datetime.datetime.now().isoformat(),
        'python_version': platform.python_version(),
        'package_versions': {
            'xgboost': xgb.__version__,
            'scikit-learn': sklearn.__version__,
            'pandas': pd.__version__,
            'numpy': np.__version__
        }
    },
    'performance': {
        'cv_scores': {
            'rmse_mean': float(np.mean(artifacts['cv_scores']['rmse'])),
            'rmse_std': float(np.std(artifacts['cv_scores']['rmse'])),
            'mae_mean': float(np.mean(artifacts['cv_scores']['mae'])),
            'mae_std': float(np.std(artifacts['cv_scores']['mae'])),
            'r2_mean': float(np.mean(artifacts['cv_scores']['r2'])),
            'r2_std': float(np.std(artifacts['cv_scores']['r2']))
        },
        'error_percentiles': {
            'p25': float(artifacts['error_percentiles'][0]),
            'p50': float(artifacts['error_percentiles'][1]),
            'p75': float(artifacts['error_percentiles'][2]),
            'p90': float(artifacts['error_percentiles'][3]),
            'p95': float(artifacts['error_percentiles'][4]),
            'p99': float(artifacts['error_percentiles'][5])
        }
    },
    'feature_importance': artifacts['feature_importance'].to_dict(orient='records')[:10]  # Top 10 features
}

# Save configuration
config_path = config_dir / 'model_config.json'
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"Model configuration saved to {config_path}")

# Save encoders
encoders_path = model_dir / 'encoders.joblib'
joblib.dump(artifacts['encoders'], encoders_path)
print(f"Encoders saved to {encoders_path}")

# 2. Create prediction class for production
print("\nCreating prediction class...")

class BidPrediction:
    """Production-ready bid fee prediction class"""
    
    def __init__(self, model_dir: str = 'models'):
        self.model_dir = Path(model_dir)
        self._load_artifacts()
        
    def _load_artifacts(self):
        """Load model and related artifacts"""
        try:
            # Load model
            self.model = joblib.load(self.model_dir / 'bid_fee_model.joblib')
            
            # Load encoders
            self.encoders = joblib.load(self.model_dir / 'encoders.joblib')
            
            # Load configuration
            with open(config_dir / 'model_config.json', 'r') as f:
                self.config = json.load(f)
            
            self.feature_cols = self.config['feature_cols']
            self.categorical_cols = self.config['categorical_cols']
            self.market_cols = self.config['market_cols']
            
        except Exception as e:
            raise RuntimeError(f"Failed to load model artifacts: {str(e)}")
    
    def _prepare_features(self, data: dict) -> pd.DataFrame:
        """Prepare features for prediction"""
        # Convert to DataFrame
        df = pd.DataFrame([data])
        
        # Process date features
        if 'BidDate' in df.columns:
            df['BidDate'] = pd.to_datetime(df['BidDate'])
            df['Year'] = df['BidDate'].dt.year
            df['Month'] = df['BidDate'].dt.month
            df['Week'] = df['BidDate'].dt.isocalendar().week
            df['DayOfWeek'] = df['BidDate'].dt.dayofweek
        
        # Process categorical columns
        for col in self.categorical_cols:
            if col in df.columns and col in self.encoders:
                df[f'{col}_encoded'] = df[col].astype(str).map(
                    lambda x: self.encoders[col].transform([x])[0] 
                    if x in self.encoders[col].classes_ else -1
                )
        
        # Calculate market-based features
        for col in self.market_cols:
            if col in df.columns:
                df[f'{col}_zip_ratio'] = 1.0  # Default for single predictions
        
        # Ensure all feature columns exist
        for col in self.feature_cols:
            if col not in df.columns:
                df[col] = 0
        
        return df[self.feature_cols]
    
    def predict(self, data: dict) -> dict:
        """
        Make a bid fee prediction
        
        Args:
            data: Dictionary containing bid opportunity data
            
        Returns:
            Dictionary with prediction and confidence info
        """
        try:
            # Validate required fields
            required_fields = {'ZipCode', 'PropertyType', 'BidDate'}
            missing = required_fields - set(data.keys())
            if missing:
                raise ValueError(f"Missing required fields: {missing}")
            
            # Prepare features
            X = self._prepare_features(data)
            
            # Make prediction
            pred = float(self.model.predict(X)[0])
            
            # Get prediction probabilities
            pred_proba = self.model.predict_proba(X)[0] if hasattr(self.model, 'predict_proba') else None
            confidence = float(pred_proba.max()) if pred_proba is not None else 0.8
            
            # Get error bounds from config
            error_percentiles = self.config['performance']['error_percentiles']
            
            return {
                'predicted_fee': pred,
                'confidence_score': confidence,
                'prediction_interval': {
                    'lower_bound': max(0, pred - error_percentiles['p90']),
                    'upper_bound': pred + error_percentiles['p90']
                },
                'model_version': self.config['model_info']['version'],
                'timestamp': datetime.datetime.now().isoformat()
            }
            
        except Exception as e:
            raise RuntimeError(f"Prediction failed: {str(e)}")

# 3. Test the production class
print("\nTesting prediction class...")

predictor = BidPrediction('deployment/models')

# Test with sample data
sample_data = {
    'ZipCode': '12345',
    'PropertyType': 'Office',
    'BidDate': '2025-10-23',
    'DistanceInMiles': 10.5,
    'Market': 'NYC',
    'PopulationEstimate': 50000,
    'AverageHouseValue': 500000
}

try:
    result = predictor.predict(sample_data)
    print("\nTest prediction successful!")
    print(f"Predicted bid fee: ${result['predicted_fee']:.2f}")
    print(f"Confidence score: {result['confidence_score']:.2%}")
    print(f"90% Prediction interval: ${result['prediction_interval']['lower_bound']:.2f} to ${result['prediction_interval']['upper_bound']:.2f}")
    
except Exception as e:
    print(f"Test prediction failed: {str(e)}")

# Save predictor to global scope
globals().update({
    'predictor': predictor
})

Setting up production deployment...

Saving model artifacts...
Model saved to deployment\models\bid_fee_model.joblib
Model configuration saved to deployment\config\model_config.json
Encoders saved to deployment\models\encoders.joblib

Creating prediction class...

Testing prediction class...

Test prediction successful!
Predicted bid fee: $2678.39
Confidence score: 80.00%
90% Prediction interval: $911.13 to $4445.64

Test prediction successful!
Predicted bid fee: $2678.39
Confidence score: 80.00%
90% Prediction interval: $911.13 to $4445.64


In [None]:
# Model Validation and Testing

print("Running comprehensive model validation...")

# 1. Test data representation
print("\nValidating data representation...")
test_cases = [
    {
        'case': 'Basic Office Property',
        'data': {
            'ZipCode': '12345',
            'PropertyType': 'Office',
            'BidDate': '2025-10-23',
            'DistanceInMiles': 10.5,
            'Market': 'NYC'
        }
    },
    {
        'case': 'Complex Property with Market Data',
        'data': {
            'ZipCode': '90210',
            'PropertyType': 'Retail',
            'BidDate': '2025-10-23',
            'DistanceInMiles': 5.2,
            'Market': 'LA',
            'PopulationEstimate': 75000,
            'AverageHouseValue': 1500000,
            'IncomePerHousehold': 120000
        }
    },
    {
        'case': 'Edge Case with Missing Fields',
        'data': {
            'ZipCode': '54321',
            'PropertyType': 'Industrial',
            'BidDate': '2025-10-23',
            'DistanceInMiles': 15.7
        }
    }
]

print("\nTesting different property scenarios...")
for test in test_cases:
    print(f"\nTesting {test['case']}:")
    try:
        result = predictor.predict(test['data'])
        print(f"Prediction: ${result['predicted_fee']:.2f}")
        print(f"Confidence: {result['confidence_score']:.2%}")
        print(f"Range: ${result['prediction_interval']['lower_bound']:.2f} to ${result['prediction_interval']['upper_bound']:.2f}")
    except Exception as e:
        print(f"Error: {str(e)}")

# 2. Test error handling
print("\nTesting error handling...")
error_cases = [
    {
        'case': 'Missing Required Field',
        'data': {'PropertyType': 'Office', 'BidDate': '2025-10-23'}
    },
    {
        'case': 'Invalid Date Format',
        'data': {
            'ZipCode': '12345',
            'PropertyType': 'Office',
            'BidDate': 'invalid-date'
        }
    },
    {
        'case': 'Unknown Property Type',
        'data': {
            'ZipCode': '12345',
            'PropertyType': 'Unknown',
            'BidDate': '2025-10-23'
        }
    }
]

for test in error_cases:
    print(f"\nTesting {test['case']}:")
    try:
        result = predictor.predict(test['data'])
        print("Warning: Expected error but got result:")
        print(result)
    except Exception as e:
        print(f"Expected error caught: {str(e)}")

# 3. Validate model artifacts
print("\nValidating model artifacts...")
required_files = [
    'deployment/models/bid_fee_model.joblib',
    'deployment/models/encoders.joblib',
    'deployment/config/model_config.json'
]

for file in required_files:
    if os.path.exists(file):
        print(f"✓ {file} exists")
        print(f"  Size: {os.path.getsize(file) / 1024:.1f} KB")
        print(f"  Modified: {datetime.datetime.fromtimestamp(os.path.getmtime(file)).isoformat()}")
    else:
        print(f"✗ Missing required file: {file}")

# 4. Load and validate configuration
try:
    with open('deployment/config/model_config.json', 'r') as f:
        config = json.load(f)
    print("\nModel configuration loaded successfully:")
    print(f"Version: {config['model_info']['version']}")
    print(f"Created: {config['model_info']['created_at']}")
    print("\nModel Performance:")
    print(f"Average RMSE: ${config['performance']['cv_scores']['rmse_mean']:.2f}")
    print(f"Average MAE: ${config['performance']['cv_scores']['mae_mean']:.2f}")
    print(f"Average R²: {config['performance']['cv_scores']['r2_mean']:.3f}")
except Exception as e:
    print(f"Error validating configuration: {str(e)}")

print("\nValidation complete!")

Running comprehensive model validation...

Validating data representation...

Testing different property scenarios...

Testing Basic Office Property:
Prediction: $2580.26
Confidence: 80.00%
Range: $813.01 to $4347.52

Testing Complex Property with Market Data:
Prediction: $2381.43
Confidence: 80.00%
Range: $614.18 to $4148.69

Testing Edge Case with Missing Fields:
Prediction: $2813.36Prediction: $2813.36
Confidence: 80.00%
Range: $1046.11 to $4580.61

Testing error handling...

Testing Missing Required Field:
Expected error caught: Prediction failed: Missing required fields: {'ZipCode'}

Testing Invalid Date Format:
Expected error caught: Prediction failed: Unknown datetime string format, unable to parse: invalid-date, at position 0

Testing Unknown Property Type:
{'predicted_fee': 3446.03662109375, 'confidence_score': 0.8, 'prediction_interval': {'lower_bound': 1678.7840820312501, 'upper_bound': 5213.28916015625}, 'model_version': '1.0.0', 'timestamp': '2025-10-23T13:51:56.845221'}



In [None]:
# Helper function for finding optimal fee
def find_optimal_fee(opportunity_row, base_multiplier=2.0, steps=100):
    """
    Find optimal bid fee for an opportunity
    
    Args:
        opportunity_row: pandas Series with features
        base_multiplier: How much above/below base fee to search
        steps: Number of fee values to evaluate
    
    Returns:
        dict with optimal fee and expected value
    """
    # Get base fee for scaling
    base_fee = opportunity_row['BaseFee']
    if np.isnan(base_fee) or base_fee <= 0:
        base_fee = df['BaseFee'].median()  # Fallback to median
        
    # Generate fee range to evaluate
    min_fee = max(1.0, base_fee / base_multiplier)  # Minimum $1 fee
    max_fee = base_fee * base_multiplier
    fee_range = np.linspace(min_fee, max_fee, steps)
    
    # Initialize arrays for results
    win_probs = np.zeros_like(fee_range)
    evs = np.zeros_like(fee_range)
    
    # Create feature matrix for each fee
    X_eval = pd.concat([
        pd.DataFrame([opportunity_row.to_dict()] * len(fee_range))
        .assign(BidFee=fee_range)
    ], axis=1)
    
    # Get win probabilities if classifier exists
    if clf is not None:
        win_probs = clf.predict_proba(X_eval)[:,1]
    else:
        win_probs = np.full_like(fee_range, fallback_winprob)
        
    # Calculate expected values
    evs = win_probs * fee_range
    
    # Find optimal point
    best_idx = np.argmax(evs)
    optimal_fee = fee_range[best_idx]
    optimal_ev = evs[best_idx]
    optimal_prob = win_probs[best_idx]
    
    return {
        'best_fee': optimal_fee,
        'best_ev': optimal_ev,
        'win_prob': optimal_prob,
        'fee_grid': fee_range,
        'win_probs': win_probs,
        'evs': evs
    }

In [None]:
# Build feature matrix X and targets
drop_cols = ['ZipCode','BidWeekStart']
target_reg = 'median_BidFee'
features = [c for c in agg.columns if c not in drop_cols + [target_reg]]
features = sorted(features)
print('Feature count:', len(features))

# Prepare matrix X and target y
X = agg[features].copy()
y_reg = agg[target_reg].copy()

# Build classifier labels if week_win_rate exists
if 'week_win_rate' in agg.columns and agg['week_win_rate'].notna().sum() > 0:
    threshold = agg['week_win_rate'].median()
    y_clf = (agg['week_win_rate'] > threshold).astype(int)
    print('Using week_win_rate median threshold:', threshold)
else:
    y_clf = None
    print('No week_win_rate available for classifier')

# Frequency-encode any remaining object columns
encoders = {}
for c in X.columns:
    if X[c].dtype == 'object' or X[c].dtype.name == 'category':
        vc = X[c].astype(str).fillna('MISSING').value_counts(normalize=True)
        encoders[c] = vc.to_dict()
        X[c] = X[c].astype(str).fillna('MISSING').map(encoders[c]).fillna(0.0)

# Fill numeric nans with medians
train_medians = X.median()
X = X.fillna(train_medians)

# Persist pieces for downstream cells
globals().update({
    'features': features,
    'encoders': encoders,
    'train_medians': train_medians,
    'agg': agg,
    'X': X,
    'y_reg': y_reg,
    'y_clf': y_clf
})

Feature count: 6
No week_win_rate available for classifier


In [None]:
# Production Setup: Save models and create inference wrapper
import joblib
import os

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save regressor model
joblib.dump(model_full, 'models/regressor.joblib')
print("Saved regressor model")

# Save classifier if available
if 'clf' in globals() and clf is not None:
    joblib.dump(clf, 'models/classifier.joblib')
    print("Saved classifier model")
    fallback_winprob = 0.5  # default win probability when classifier unavailable
else:
    fallback_winprob = y_clf.mean() if y_clf is not None else 0.5

# Save artifacts for inference
artifacts = {
    'feature_names': list(X.columns),
    'target_reg': target_reg,
    'has_classifier': 'clf' in globals() and clf is not None,
    'fallback_winprob': fallback_winprob
}
joblib.dump(artifacts, 'models/bid_recommendation_artifacts.joblib')
print("\nSaved model artifacts to models/bid_recommendation_artifacts.joblib")

# Create inference wrapper class
class BidPrediction:
    def __init__(self):
        self.regressor = joblib.load('models/regressor.joblib')
        try:
            self.classifier = joblib.load('models/classifier.joblib')
            self.has_classifier = True
        except:
            self.classifier = None
            self.has_classifier = False
        
        # Load artifacts
        self.artifacts = joblib.load('models/bid_recommendation_artifacts.joblib')
        self.feature_names = self.artifacts['feature_names']
        self.target_reg = self.artifacts['target_reg']
        self.fallback_winprob = self.artifacts['fallback_winprob']
    
    def validate_features(self, data):
        """Validate input features"""
        missing = set(self.feature_names) - set(data.columns)
        if missing:
            raise ValueError(f"Missing features: {missing}")
        extra = set(data.columns) - set(self.feature_names)
        if extra:
            raise ValueError(f"Unexpected features: {extra}")
            
        return data[self.feature_names]
    
    def predict(self, data):
        """
        Make prediction for new data
        Args:
            data: pandas DataFrame with required features
        Returns:
            dict with predictions
        """
        # Validate features
        X_new = self.validate_features(data)
        
        # Clean features
        X_new = X_new.replace([np.inf, -np.inf], np.nan)
        X_new = X_new.fillna(X_new.median())
        
        # Make predictions
        bid_pred = float(self.regressor.predict(X_new)[0])
        result = {'predicted_bid': bid_pred}
        
        # Add win probability if classifier exists
        if self.has_classifier:
            win_prob = float(self.classifier.predict_proba(X_new)[:,1][0])
            result['win_probability'] = win_prob
        else:
            result['win_probability'] = self.fallback_winprob
            
        return result

# Print example usage & validation
print("\nExample usage & validation:\n")

# 1. Create test data
zip_sample = df['ZipCode'].sample(n=1).iloc[0]  # random ZIP
print(f"Testing with ZIP code: {zip_sample}")

sample_opportunity = df[df['ZipCode']==zip_sample].sort_values('BidWeekStart').iloc[-1]

# 2. Get predictions
predictor = BidPrediction()
prediction = predictor.predict(pd.DataFrame([sample_opportunity]))

print("\nPredictions:")
print(f"Predicted bid: ${prediction['predicted_bid']:.2f}")
print(f"Win probability: {prediction['win_probability']:.1%}")

# 3. Get bid recommendation with EV optimization
recommendations = find_optimal_fee(sample_opportunity)

print("\nRecommendations:")
print(f"Optimal bid fee: ${recommendations['best_fee']:.2f}")
print(f"Expected value: ${recommendations['best_ev']:.2f}")
print(f"Win probability at optimal fee: {recommendations['win_prob']:.1%}")

Saved regressor model

Saved model artifacts to models/bid_recommendation_artifacts.joblib

Example usage & validation:

Testing with ZIP code: 64503


ValueError: Missing features: {'roll_4w', 'lag_2', 'lag_1', 'week_win_rate', 'lag_3', 'lag_4'}

In [None]:
# Model Training with Error Handling

def safe_train_model(X, y, model_type='regressor'):
    """Train a model with proper error handling and validation"""
    
    # 1. Validate inputs
    if X is None or y is None:
        raise ValueError("X and y must not be None")
    
    if len(X) != len(y):
        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")
    
    # 2. Clean target variable
    y = pd.Series(y)
    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())
    
    # 3. Clean features
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    
    # 4. Create model
    if model_type == 'regressor':
        model = XGBRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            random_state=42,
            n_jobs=-1
        )
    else:
        model = XGBClassifier(
            n_estimators=200,
            learning_rate=0.08,
            max_depth=4,
            scale_pos_weight=1.0,
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1
        )
    
    # 5. Fit model with error handling
    try:
        model.fit(X, y)
        return model
    except Exception as e:
        print(f"Error training {model_type}: {str(e)}")
        return None

# Train regressor with time-series CV
print("\nTraining Regressor...")

try:
    # Clean target variable
    y_reg = y_reg.replace([np.inf, -np.inf], np.nan)
    y_reg = y_reg.fillna(y_reg.median())
    
    tscv = TimeSeriesSplit(n_splits=5)
    rmse_list = []
    mae_list = []
    
    for fold, (tr, te) in enumerate(tscv.split(X), 1):
        try:
            Xtr, Xte = X.iloc[tr], X.iloc[te]
            ytr, yte = y_reg.iloc[tr], y_reg.iloc[te]
            
            model = safe_train_model(Xtr, ytr, 'regressor')
            if model is None:
                continue
                
            preds = model.predict(Xte)
            rmse = mean_squared_error(yte, preds, squared=False)
            mae = mean_absolute_error(yte, preds)
            rmse_list.append(rmse)
            mae_list.append(mae)
            print(f'Fold {fold}: RMSE={rmse:.3f}, MAE={mae:.3f}')
        
        except Exception as e:
            print(f"Error in fold {fold}: {str(e)}")
            continue
    
    if rmse_list:
        print(f'\nAverage RMSE: {np.mean(rmse_list):.3f}')
        print(f'Average MAE: {np.mean(mae_list):.3f}')
    else:
        print("No successful folds")
        
    # Train final regressor on all data
    print("\nTraining final model...")
    model_full = safe_train_model(X, y_reg, 'regressor')
    if model_full is not None:
        globals()['model_full'] = model_full
        print("Final regressor trained successfully")
    else:
        print("Failed to train final regressor")
        
except Exception as e:
    print(f"Error in regressor training: {str(e)}")
    
# Train classifier if we have labels
if y_clf is not None and not y_clf.isna().all():
    print("\nTraining Classifier...")
    
    try:
        X_clf = X.copy()
        y_clf = y_clf.fillna(0)  # Assume no win for missing values
        
        try:
            Xtr, Xte, ytr, yte = train_test_split(
                X_clf, y_clf,
                test_size=0.2,
                random_state=42,
                stratify=y_clf
            )
        except Exception as e:
            print("Stratified split failed, using random split:", e)
            Xtr, Xte, ytr, yte = train_test_split(
                X_clf, y_clf,
                test_size=0.2,
                random_state=42
            )
        
        clf = safe_train_model(Xtr, ytr, 'classifier')
        if clf is not None:
            yproba = clf.predict_proba(Xte)[:,1]
            try:
                auc = roc_auc_score(yte, yproba)
                print(f'ROC-AUC Score: {auc:.3f}')
            except Exception as e:
                print("Could not compute AUC:", e)
            
            globals()['clf'] = clf
            print("Classifier trained successfully")
        else:
            print("Failed to train classifier")
            globals()['clf'] = None
            
    except Exception as e:
        print(f"Error in classifier training: {str(e)}")
        globals()['clf'] = None
else:
    print("\nNo labels for classifier training")
    globals()['clf'] = None


Training Regressor...


XGBoostError: [12:33:39] C:\actions-runner\_work\xgboost\xgboost\src\data\data.cc:550: Check failed: valid: Label contains NaN, infinity or a value too large.

In [None]:
# Production Setup: Save models and create inference wrapper
import joblib
import os

try:
    # Create models directory if it doesn't exist
    os.makedirs('models', exist_ok=True)
    
    # Validate models exist
    if 'model_full' not in globals() or model_full is None:
        raise ValueError("Regressor model not available")
        
    # Save regressor model
    joblib.dump(model_full, 'models/regressor.joblib')
    print("Saved regressor model")
    
    # Save classifier if available
    if 'clf' in globals() and clf is not None:
        joblib.dump(clf, 'models/classifier.joblib')
        print("Saved classifier model")
        fallback_winprob = 0.5
    else:
        fallback_winprob = y_clf.mean() if y_clf is not None else 0.5
    
    # Save artifacts for inference
    artifacts = {
        'feature_names': list(X.columns),
        'target_reg': target_reg,
        'has_classifier': 'clf' in globals() and clf is not None,
        'fallback_winprob': fallback_winprob,
        'categorical_columns': cat_cols,
        'encoders': encoders,
        'feature_statistics': {
            col: {'median': X[col].median()} 
            for col in X.columns
        }
    }
    joblib.dump(artifacts, 'models/bid_recommendation_artifacts.joblib')
    print("\nSaved model artifacts")
    
    class BidPrediction:
        """Production inference class with robust error handling"""
        
        def __init__(self, models_dir='models'):
            """Load models and artifacts"""
            try:
                # Load regressor
                self.regressor = joblib.load(f'{models_dir}/regressor.joblib')
                
                # Try to load classifier
                try:
                    self.classifier = joblib.load(f'{models_dir}/classifier.joblib')
                    self.has_classifier = True
                except:
                    self.classifier = None
                    self.has_classifier = False
                
                # Load artifacts
                self.artifacts = joblib.load(f'{models_dir}/bid_recommendation_artifacts.joblib')
                self.feature_names = self.artifacts['feature_names']
                self.target_reg = self.artifacts['target_reg']
                self.fallback_winprob = self.artifacts['fallback_winprob']
                self.encoders = self.artifacts['encoders']
                self.feature_stats = self.artifacts['feature_statistics']
                
            except Exception as e:
                raise RuntimeError(f"Error initializing BidPrediction: {str(e)}")
        
        def preprocess_features(self, data):
            """Preprocess features with error handling"""
            try:
                # Validate required columns
                missing_cols = set(self.artifacts['categorical_columns']) - set(data.columns)
                if missing_cols:
                    raise ValueError(f"Missing required columns: {missing_cols}")
                
                # Create feature matrix
                X_new = data.copy()
                
                # Encode categorical columns
                for col, encoder in self.encoders.items():
                    if col in data.columns:
                        X_new[f'{col}_encoded'] = encoder.transform(data[col].astype(str))
                
                # Select features in correct order
                X_new = X_new[self.feature_names]
                
                # Handle missing values using saved statistics
                for col in X_new.columns:
                    if X_new[col].isna().any():
                        X_new[col] = X_new[col].fillna(self.feature_stats[col]['median'])
                
                return X_new
                
            except Exception as e:
                raise ValueError(f"Error preprocessing features: {str(e)}")
        
        def predict(self, data):
            """
            Make prediction for new data with comprehensive error handling
            
            Args:
                data: pandas DataFrame with required features
                
            Returns:
                dict with predictions and metadata
            """
            try:
                # Validate input
                if not isinstance(data, pd.DataFrame):
                    raise ValueError("Input must be a pandas DataFrame")
                
                # Preprocess features
                X_new = self.preprocess_features(data)
                
                # Make predictions
                bid_pred = float(self.regressor.predict(X_new)[0])
                result = {
                    'predicted_bid': bid_pred,
                    'features_used': list(X_new.columns)
                }
                
                # Add win probability if classifier exists
                if self.has_classifier:
                    win_prob = float(self.classifier.predict_proba(X_new)[:,1][0])
                    result['win_probability'] = win_prob
                else:
                    result['win_probability'] = self.fallback_winprob
                    result['win_probability_note'] = 'Using fallback probability'
                
                return result
                
            except Exception as e:
                raise RuntimeError(f"Error making prediction: {str(e)}")
    
    # Test inference wrapper
    print("\nTesting inference wrapper...")
    predictor = BidPrediction()
    
    # Create test data
    test_data = df.iloc[[0]]
    print("\nTest data shape:", test_data.shape)
    
    # Make prediction
    prediction = predictor.predict(test_data)
    print("\nTest prediction successful:")
    for k, v in prediction.items():
        print(f"{k}: {v}")
        
except Exception as e:
    print(f"Error in production setup: {str(e)}")
    raise

In [None]:
# Model Interpretability with SHAP values
import shap

print("Calculating SHAP values for regressor...")
explainer = shap.TreeExplainer(model_full)
shap_values = explainer.shap_values(X)

# Plot feature importance
plt.figure(figsize=(12,6))
shap.summary_plot(shap_values, X, show=False)
plt.title('Feature Importance (SHAP)')
plt.tight_layout()
plt.show()

# Plot detailed SHAP values for first prediction
plt.figure(figsize=(12,6))
shap.force_plot(
    explainer.expected_value, 
    shap_values[0,:],
    X.iloc[0,:],
    matplotlib=True,
    show=False
)
plt.title('SHAP Force Plot for First Prediction')
plt.tight_layout()
plt.show()

if 'clf' in globals():
    print("\nCalculating SHAP values for classifier...")
    explainer_clf = shap.TreeExplainer(clf)
    shap_values_clf = explainer_clf.shap_values(X)
    
    plt.figure(figsize=(12,6))
    shap.summary_plot(shap_values_clf[1], X, show=False)
    plt.title('Feature Importance for Win Probability (SHAP)')
    plt.tight_layout()
    plt.show()

In [None]:
# Final Validation

def validate_production_setup():
    """Comprehensive validation of the production setup"""
    try:
        print("Running final validation...\n")
        checks = []
        
        # 1. Check models directory exists
        models_dir_exists = os.path.exists('models')
        checks.append(('Models directory exists', models_dir_exists))
        
        # 2. Check model files exist
        model_files = {
            'regressor.joblib': False,
            'classifier.joblib': False,
            'bid_recommendation_artifacts.joblib': False
        }
        for file in model_files:
            model_files[file] = os.path.exists(os.path.join('models', file))
            checks.append((f'{file} exists', model_files[file]))
        
        # 3. Test model loading
        try:
            predictor = BidPrediction()
            checks.append(('Model loading', True))
        except Exception as e:
            checks.append(('Model loading', False))
            print(f"Model loading error: {str(e)}")
        
        # 4. Test inference
        if df is not None and len(df) > 0:
            try:
                test_data = df.iloc[[0]]
                prediction = predictor.predict(test_data)
                checks.append(('Inference test', True))
                checks.append(('Prediction contains bid', 'predicted_bid' in prediction))
                checks.append(('Prediction contains probability', 'win_probability' in prediction))
            except Exception as e:
                checks.append(('Inference test', False))
                print(f"Inference error: {str(e)}")
        
        # Print results
        print("Validation Results:")
        for check, status in checks:
            print(f"{check}: {'✓' if status else '✗'}")
        
        # Overall status
        success = all(status for _, status in checks)
        print(f"\nOverall validation: {'Passed' if success else 'Failed'}")
        
        return success
        
    except Exception as e:
        print(f"Validation error: {str(e)}")
        return False

# Run validation
validation_success = validate_production_setup()

if validation_success:
    print("\nModel is production-ready! ✨")
    print("\nUsage example:")
    print("```python")
    print("from bid_prediction import BidPrediction")
    print("predictor = BidPrediction()")
    print("prediction = predictor.predict(new_data)")
    print("```")
else:
    print("\n⚠️ Model needs attention before production use")