In [72]:
%%writefile ../src/salary_model_training/data_loader_preprocessor.py

import os
import joblib
import pandas as pd
import numpy as np
import logging
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Configure logging for this module (DEBUG level, timestamped records)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ordinal codes for the Injury_Risk labels, plus the inverse lookup derived from them
INJURY_RISK_MAP = dict(zip(('Low Risk', 'Moderate Risk', 'High Risk'), (1, 2, 3)))
REVERSE_INJURY_RISK_MAP = {code: label for label, code in INJURY_RISK_MAP.items()}

# Columns scaled by the numeric branch of the preprocessing pipeline
NUMERIC_FEATURES = [
    'Age', 'Years of Service', 'PER', 'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%',
    'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48',
    'PPG', 'APG', 'SPG', 'TPG', 'BPG', 'Availability',
    'Efficiency', 'Days_Injured_Percentage', 'ValueOverReplacement', 'ExperienceSquared',
]

# Columns one-hot encoded by the categorical branch
ONE_HOT_ENCODE_CATEGORICAL_FEATURES = ['Position', 'Team']

# Columns the pipeline passes through untouched
PIPELINE_LEAVE_ALONE_FEATURES = ['Season', 'Injury_Risk']
# Same columns plus the SalaryPct target: engineer_features keeps the target in
# the pipeline frame so it can be split off as y before the pipeline is applied
LEAVE_ALONE_FEATURES = [*PIPELINE_LEAVE_ALONE_FEATURES, 'SalaryPct']

# Context columns returned separately by engineer_features
columns_to_add_back_later = ['Season', 'Salary_Cap_Inflated', 'Player', 'SalaryPct']

# Format Season Column
def format_season(data):
    """Replace 'YYYY-YY' labels in ``data['Season']`` with the integer start year.

    The frame is modified in place and also returned. Any failure is logged
    and re-raised unchanged.
    """
    def _start_year(label):
        # 'YYYY-YY' -> YYYY
        return int(label.split('-')[0])

    try:
        data['Season'] = data['Season'].map(_start_year)
        logger.info(f"Seasons in data: {data['Season'].unique()}")
        logger.info(f"Shape after season formatting: {data.shape}")
        logger.info(f"Null values after season formatting:\n{data.isnull().sum()}")
    except Exception as e:
        logger.error(f"Failed to format season data: {e}")
        raise
    return data
    
def filter_seasons(data, predict_season):
    """Partition ``data`` by season: rows before ``predict_season`` and rows equal to it.

    Returns a ``(prior_seasons_data, target_season_data)`` tuple. Note that a
    second function with this name is defined further down in this module and
    replaces this one at import time.
    """
    season = data['Season']
    prior_seasons_data = data.loc[season < predict_season]
    target_season_data = data.loc[season == predict_season]

    logger.debug(f"Data filtered. Prior seasons shape: {prior_seasons_data.shape}, Target season shape: {target_season_data.shape}")
    logger.debug(f"Feature columns used for training: {prior_seasons_data.columns.tolist()}")

    return prior_seasons_data, target_season_data

# Get Feature Names from Pipeline
def get_feature_names(pipeline):
    """Return the output column names of a fitted preprocessing transformer.

    The order is: numeric features, one-hot encoded names reported by the
    fitted 'onehot' step of the 'cat' transformer, then the passthrough
    columns (which exclude 'SalaryPct').
    """
    encoder = pipeline.named_transformers_['cat']['onehot']
    encoded_names = encoder.get_feature_names_out(ONE_HOT_ENCODE_CATEGORICAL_FEATURES)

    return [*NUMERIC_FEATURES, *encoded_names, *PIPELINE_LEAVE_ALONE_FEATURES]


# Label Encoding Injury Risk
def label_encode_injury_risk(data):
    """Map the 'Injury_Risk' labels to their codes from INJURY_RISK_MAP.

    The column is overwritten on ``data`` itself, which is also returned.
    """
    logger.debug("Label encoding Injury_Risk...")

    raw_labels = data['Injury_Risk']
    logger.debug(f"First few Injury_Risk values before encoding:\n{raw_labels.head()}")

    data['Injury_Risk'] = raw_labels.map(INJURY_RISK_MAP)
    logger.debug(f"First few Injury_Risk values after encoding:\n{data['Injury_Risk'].head()}")

    return data

def inverse_transform_injury_risk(data):
    """Map the 'Injury_Risk' codes back to labels via REVERSE_INJURY_RISK_MAP.

    The column is overwritten on ``data`` itself, which is also returned.
    """
    logger.debug("Inverse transforming Injury_Risk...")

    encoded = data['Injury_Risk']
    logger.debug(f"First few Injury_Risk values before inverse transformation:\n{encoded.head()}")

    data['Injury_Risk'] = encoded.map(REVERSE_INJURY_RISK_MAP)
    logger.debug(f"First few Injury_Risk values after inverse transformation:\n{data['Injury_Risk'].head()}")

    return data



# Step 1: load and clean the data
def clean_data(file_path):
    """Load the player CSV, drop unused columns and remove incomplete rows.

    Args:
        file_path: Path handed to ``pandas.read_csv``.

    Returns:
        A DataFrame without the '3P%', 'FT%' and '2P%' columns and without
        the columns listed in ``columns_to_remove``, restricted to rows in
        which every advanced-stat column is non-null.

    Raises:
        Exception: Any load or processing error (for example a ``KeyError``
            for a missing column) is logged and re-raised unchanged.
    """
    try:
        data = pd.read_csv(file_path)
        logger.info(f"Data loaded. Initial shape: {data.shape}")

        # The shooting-percentage columns are not used downstream. They were
        # previously recomputed and then dropped on the next statement, so the
        # recomputation is omitted. errors='ignore' keeps the drop working
        # when the CSV does not carry these columns at all.
        data = data.drop(columns=['3P%', 'FT%', '2P%'], errors='ignore')

        columns_to_remove = ['Salary Cap', 'Luxury Tax', '1st Apron', 'BAE', 'Standard /Non-Taxpayer',
                             'Taxpayer', 'Team Room /Under Cap', 'Wins', 'Losses', '2nd Apron', 'Injury_Periods']
        data = data.drop(columns=columns_to_remove)

        # Keep only rows that have every advanced stat available
        advanced_stats_columns = ['PER', 'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
                                  'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
        data = data.dropna(subset=advanced_stats_columns)

        logger.info(f"Final shape after processing: {data.shape}")
        return data

    except Exception as e:
        logger.error(f"Data cleaning failed: {e}")
        raise

# Feature Engineering
def engineer_features(data):
    """Derive model features from the cleaned player data.

    New columns are added to ``data`` in place (callers holding a reference
    to the input frame see them too).

    Args:
        data: Cleaned frame containing at least the raw counting stats
            ('PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'GP', 'FGA', 'FTA'),
            'Salary', 'Salary_Cap_Inflated', 'VORP', 'Years of Service',
            'Total_Days_Injured' and every column named in the feature lists.

    Returns:
        A tuple ``(engineered_data, pipeline_data, columns_to_re_add)``:
        a full copy of the frame with all engineered columns, an independent
        copy restricted to the pipeline columns (including 'SalaryPct'), and
        an independent copy of the ``columns_to_add_back_later`` columns.
    """
    # Explicit output names. Building the name from the first letter of the
    # source column made 'TRB' and 'TOV' both write 'TPG', so rebounds per
    # game was silently overwritten. 'TPG' keeps its effective meaning
    # (turnovers per game); rebounds per game is now kept as 'RPG'.
    per_game_names = {
        'PTS': 'PPG',
        'AST': 'APG',
        'TRB': 'RPG',
        'STL': 'SPG',
        'BLK': 'BPG',
        'TOV': 'TPG',
    }
    for source_col, per_game_col in per_game_names.items():
        data[per_game_col] = data[source_col] / data['GP']

    data['Availability'] = data['GP'] / 82
    data['SalaryPct'] = data['Salary'] / data['Salary_Cap_Inflated']
    # +1 in the denominator avoids division by zero for players with no attempts/turnovers
    data['Efficiency'] = (data['PTS'] + data['TRB'] + data['AST'] + data['STL'] + data['BLK']) / (data['FGA'] + data['FTA'] + data['TOV'] + 1)
    data['ValueOverReplacement'] = data['VORP'] / data['GP']
    data['ExperienceSquared'] = data['Years of Service'] ** 2
    data['Days_Injured_Percentage'] = data['Total_Days_Injured'] / data['GP']

    engineered_data = data.copy()

    columns_to_keep_for_pipeline = NUMERIC_FEATURES + ONE_HOT_ENCODE_CATEGORICAL_FEATURES + LEAVE_ALONE_FEATURES
    # Copies, not slices: callers assign to columns of pipeline_data (label
    # encoding), which must not write into a slice of another frame.
    pipeline_data = data[columns_to_keep_for_pipeline].copy()
    columns_to_re_add = data[columns_to_add_back_later].copy()

    return engineered_data, pipeline_data, columns_to_re_add

# Run the full preprocessing chain: clean -> format season -> engineer -> label encode
def preprocessed_datasets(file_path):
    """Load the CSV at ``file_path`` and produce every preprocessed view of it.

    Args:
        file_path: Path of the source CSV, read once by ``clean_data``.

    Returns:
        A tuple ``(cleaned_data, engineered_data, pipeline_data,
        columns_to_re_add)``. ``cleaned_data`` is the frame that
        ``engineer_features`` worked on, ``pipeline_data`` has 'Injury_Risk'
        label encoded and still contains the 'SalaryPct' target, and
        ``columns_to_re_add`` holds the context columns returned by
        ``engineer_features``.
    """
    # The file is read inside clean_data; a second, unused read was removed.
    cleaned_data = clean_data(file_path)
    cleaned_data = format_season(cleaned_data)

    # Get the engineered frame, the pipeline frame and the context columns
    engineered_data, pipeline_data, columns_to_re_add = engineer_features(cleaned_data)

    # Label encode Injury_Risk on the pipeline frame only
    pipeline_data = label_encode_injury_risk(pipeline_data)

    return cleaned_data, engineered_data, pipeline_data, columns_to_re_add

# Split the dataset into train and test sets based on the season
def filter_seasons(data, predict_season):
    """Return ``(rows with Season < predict_season, rows with Season == predict_season)``.

    This later definition is the one in effect after the module is imported;
    it replaces the earlier function of the same name.
    """
    season = data['Season']
    is_prior = season.lt(predict_season)
    is_target = season.eq(predict_season)

    return data.loc[is_prior], data.loc[is_target]

# Build the Pipeline
def build_pipeline():
    """Build the (unfitted) ColumnTransformer used for preprocessing.

    Numeric features are standard-scaled, the categorical features are
    one-hot encoded (unknown categories ignored at transform time), Season
    and Injury_Risk pass through unchanged, and every other column is dropped.
    """
    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    column_steps = [
        ('num', scale_numeric, NUMERIC_FEATURES),
        ('cat', encode_categorical, ONE_HOT_ENCODE_CATEGORICAL_FEATURES),
        ('passthrough', 'passthrough', PIPELINE_LEAVE_ALONE_FEATURES),
    ]

    return ColumnTransformer(transformers=column_steps, remainder='drop')


# Main execution
if __name__ == "__main__":
    # Script entry point: preprocess the data, fit the preprocessing
    # transformer on the seasons before `season`, and save the fitted
    # transformer plus the target-season context columns.
    try:
        file_path = '../data/processed/nba_player_data_final_inflated.csv'
        season = 2022
        output_dir = f'../data/models/season_{season}'

        # Step 1: Preprocess the dataset (this reads the CSV; no separate read is needed)
        cleaned_data, engineered_data, pipeline_data, columns_to_re_add = preprocessed_datasets(file_path)

        # Step 2: Split data into train and test sets based on season
        train_data, test_data = filter_seasons(pipeline_data, season)
        print("days injured unique values = ", train_data['Days_Injured_Percentage'].unique())
        print("days injured unique values = ", test_data['Days_Injured_Percentage'].unique())
        # Step 3: Separate features (X) and target (y)
        X_train = train_data.drop('SalaryPct', axis=1)
        y_train = train_data['SalaryPct']
        X_test = test_data.drop('SalaryPct', axis=1)
        y_test = test_data['SalaryPct']

        # Step 4: Build and apply the pipeline
        pipeline = build_pipeline()
        # Before and after pipeline debug
        logger.debug(f"Before pipeline transformation: {X_train.columns.tolist()}")
        X_train_transformed = pipeline.fit_transform(X_train)
        logger.debug(f"After pipeline transformation: {X_train_transformed.shape}")
        logger.debug(f"Transformed feature names: {pipeline.get_feature_names_out()}")
        print("Sample of transformed data:", X_train_transformed[:5])

        # joblib.dump does not create missing parent directories, so make
        # sure the per-season output folder exists before saving anything.
        os.makedirs(output_dir, exist_ok=True)

        # Save the fitted pipeline
        joblib.dump(pipeline, f'{output_dir}/preprocessing_pipeline.pkl')

        # Only the target-season context rows are saved, without 'Season'
        _, columns_to_re_add_test_data = filter_seasons(columns_to_re_add, season)
        columns_to_re_add = columns_to_re_add_test_data.drop('Season', axis=1)
        print("columns_to_re_add =", columns_to_re_add)
        # Save columns to re-add later
        joblib.dump(columns_to_re_add, f'{output_dir}/columns_to_re_add.pkl')

        # all_col_names = get_feature_names(pipeline)
        # print("all column names = ", all_col_names)
        # joblib.dump(all_col_names, f'../data/models/season_{season}/feature_names.pkl')


    except Exception as e:
        logger.critical(f"Critical error in data processing pipeline: {e}")
        raise


Overwriting ../src/salary_model_training/data_loader_preprocessor.py


In [73]:
%%writefile ../src/salary_model_training/model_trainer.py
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import os
import logging
from .data_loader_preprocessor import preprocessed_datasets, build_pipeline, filter_seasons, get_feature_names

# Configure logging at import time: DEBUG level with timestamped records.
# `logger` is the module-level logger used by every function below.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def perform_grid_search(model, param_grid, X_train, y_train):
    """Tune ``model`` over ``param_grid`` and return the refitted best estimator.

    Uses 5-fold cross-validation scored by negative mean squared error and
    logs the winning parameters together with the corresponding MSE.
    """
    model_name = model.__class__.__name__
    logger.debug(f"Starting grid search for {model_name}. Parameters: {param_grid}")
    logger.debug(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=1,
    )
    search.fit(X_train, y_train)

    logger.info(f"Best parameters for {model_name}: {search.best_params_}")
    # best_score_ is a negated MSE; flip the sign for reporting
    logger.info(f"Best score for {model_name}: {-search.best_score_}")
    return search.best_estimator_

def train_and_save_models(X_train, y_train, model_save_path):
    """Grid-search a random forest and an XGBoost regressor and save both.

    Args:
        X_train: Preprocessed training features.
        y_train: Training target values.
        model_save_path: Directory receiving 'best_rf_model.pkl' and
            'best_xgb_model.pkl'; created if it does not exist.

    Returns:
        None. The tuned estimators are written to disk with joblib.
    """
    logger.debug(f"Starting model training. Model save path: {model_save_path}")
    logger.debug(f"Training data shape: X_train: {X_train.shape}, y_train: {y_train.shape}")

    # exist_ok=True avoids the check-then-create race of a separate exists() test
    os.makedirs(model_save_path, exist_ok=True)

    rf_model = RandomForestRegressor(random_state=42)
    xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=False)

    rf_param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 10], 'min_samples_split': [2, 5]}
    xgb_param_grid = {'n_estimators': [50, 100], 'max_depth': [3], 'learning_rate': [0.01]}

    logger.debug("Performing grid search for RandomForestRegressor.")
    best_rf_model = perform_grid_search(rf_model, rf_param_grid, X_train, y_train)

    logger.debug("Performing grid search for XGBoostRegressor.")
    best_xgb_model = perform_grid_search(xgb_model, xgb_param_grid, X_train, y_train)

    # Both searches finish before anything is written
    joblib.dump(best_rf_model, os.path.join(model_save_path, 'best_rf_model.pkl'))
    joblib.dump(best_xgb_model, os.path.join(model_save_path, 'best_xgb_model.pkl'))
    logger.info(f"Models saved in {model_save_path}.")

def evaluate_models(X_test, y_test, model_save_path):
    """Score the saved RF and XGBoost models on the test set and persist the results.

    Loads both models from ``model_save_path``, predicts on ``X_test``,
    computes RMSE, MAE, R² and MSE against ``y_test``, logs them, writes the
    result dictionary to 'evaluation_results.pkl' in the same directory and
    returns it.
    """
    rf_model = joblib.load(f"{model_save_path}/best_rf_model.pkl")
    xgb_model = joblib.load(f"{model_save_path}/best_xgb_model.pkl")

    logger.debug("Loaded models for evaluation.")
    logger.debug(f"Evaluating on test data. X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    model_labels = (('rf', 'Random Forest'), ('xgb', 'XGBoost'))
    predictions = {
        'rf': rf_model.predict(X_test),
        'xgb': xgb_model.predict(X_test),
    }

    # Predictions first, then the metrics of each model, in a fixed key order
    eval_results = {f'{key}_predictions': predictions[key] for key, _ in model_labels}
    for key, _ in model_labels:
        predicted = predictions[key]
        mse = mean_squared_error(y_test, predicted)
        eval_results[f'{key}_rmse'] = np.sqrt(mse)
        eval_results[f'{key}_mae'] = mean_absolute_error(y_test, predicted)
        eval_results[f'{key}_r2'] = r2_score(y_test, predicted)
        eval_results[f'{key}_mse'] = mse

    # Log only after every metric has been computed
    for key, label in model_labels:
        rmse = eval_results[f'{key}_rmse']
        mae = eval_results[f'{key}_mae']
        r2 = eval_results[f'{key}_r2']
        mse = eval_results[f'{key}_mse']
        logger.info(f"\n{label} RMSE: {rmse}")
        logger.info(f"{label} MAE: {mae}")
        logger.info(f"{label} R²: {r2}")
        logger.info(f"{label} MSE: {mse}")

    eval_save_path = f"{model_save_path}/evaluation_results.pkl"
    joblib.dump(eval_results, eval_save_path)
    logger.info(f"Evaluation results saved at {eval_save_path}")

    return eval_results

def load_and_preprocess_data(file_path, predict_season, model_save_path):
    """Preprocess the data, split it by season and fit the preprocessing pipeline.

    Args:
        file_path: Path of the source CSV.
        predict_season: Season used as the test set; earlier seasons form
            the training set.
        model_save_path: Directory receiving 'preprocessing_pipeline.pkl',
            'columns_to_re_add.pkl' and 'feature_names.pkl'; created if it
            does not exist.

    Returns:
        A tuple ``(X_train_transformed, X_test_transformed, y_train, y_test)``
        where the feature matrices are the pipeline outputs and the targets
        are the 'SalaryPct' columns of the respective splits.
    """
    logger.debug(f"Loading data and preprocessing for season {predict_season}")

    # Step 1: Preprocess the dataset
    cleaned_data, engineered_data, pipeline_data, columns_to_re_add = preprocessed_datasets(file_path)

    # Step 2: Split data into train and test sets based on season
    train_data, test_data = filter_seasons(pipeline_data, predict_season)

    # Step 3: Separate features (X) and target (y)
    X_train = train_data.drop('SalaryPct', axis=1)
    y_train = train_data['SalaryPct']
    X_test = test_data.drop('SalaryPct', axis=1)
    y_test = test_data['SalaryPct']

    # Step 4: Build and apply the pipeline (fit on the training seasons only)
    pipeline = build_pipeline()

    X_train_transformed = pipeline.fit_transform(X_train)
    X_test_transformed = pipeline.transform(X_test)

    # This function runs before train_and_save_models, which is the only other
    # place that creates the directory, and joblib.dump does not create
    # missing parent directories.
    os.makedirs(model_save_path, exist_ok=True)

    # Save the fitted pipeline
    joblib.dump(pipeline, os.path.join(model_save_path, 'preprocessing_pipeline.pkl'))

    # Save the columns to re-add later (all seasons, original row index)
    joblib.dump(columns_to_re_add, os.path.join(model_save_path, 'columns_to_re_add.pkl'))

    # Save the transformed feature names
    all_col_names = get_feature_names(pipeline)
    print("all column names = ", all_col_names)
    joblib.dump(all_col_names, os.path.join(model_save_path, 'feature_names.pkl'))

    return X_train_transformed, X_test_transformed, y_train, y_test

# Model Training Pipeline
if __name__ == "__main__":
    # Script entry point: preprocess, train, then evaluate for one held-out season.
    predict_season = 2022
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    model_save_path = f'../data/models/season_{predict_season}'

    logger.debug(f"Starting the pipeline for season {predict_season} with file: {file_path}")

    # Preprocess and split; this also saves the fitted preprocessing artifacts
    X_train, X_test, y_train, y_test = load_and_preprocess_data(
        file_path, predict_season, model_save_path
    )

    # Tune and persist both regressors
    train_and_save_models(X_train, y_train, model_save_path)

    # Score the persisted models on the held-out season
    evaluated_models = evaluate_models(X_test, y_test, model_save_path)
    print("metrics = ", evaluated_models)


Overwriting ../src/salary_model_training/model_trainer.py


In [74]:
%%writefile ../src/salary_model_training/model_predictor.py


import pandas as pd
import joblib
import numpy as np  # Add this import for numpy
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
import logging

# Add relative imports for functions and constants
from .data_loader_preprocessor import preprocessed_datasets, filter_seasons, inverse_transform_injury_risk
from .model_trainer import load_and_preprocess_data, train_and_save_models, evaluate_models  # Add this line


# Configure logging for this module (DEBUG level, timestamped records)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ordinal codes for the Injury_Risk labels, plus the inverse lookup derived from them
INJURY_RISK_MAP = dict(zip(('Low Risk', 'Moderate Risk', 'High Risk'), (1, 2, 3)))
REVERSE_INJURY_RISK_MAP = {code: label for label, code in INJURY_RISK_MAP.items()}

# Feature groups, in the column order used when slicing the transformed matrix
NUMERIC_FEATURES = [
    'Age', 'Years of Service', 'PER', 'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%',
    'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48',
    'PPG', 'APG', 'SPG', 'TPG', 'BPG', 'Availability',
    'Efficiency', 'Days_Injured_Percentage', 'ValueOverReplacement', 'ExperienceSquared',
]

ONE_HOT_ENCODE_CATEGORICAL_FEATURES = ['Position', 'Team']
PIPELINE_LEAVE_ALONE_FEATURES = ['Season', 'Injury_Risk']
LEAVE_ALONE_FEATURES = [*PIPELINE_LEAVE_ALONE_FEATURES, 'SalaryPct']
columns_to_add_back_later = ['Season', 'Salary_Cap_Inflated', 'Player', 'SalaryPct']

# Aliases with the same contents as the two lists above (separate list objects)
CATEGORICAL_FEATURES = list(ONE_HOT_ENCODE_CATEGORICAL_FEATURES)
PASSTHROUGH_FEATURES = list(PIPELINE_LEAVE_ALONE_FEATURES)

def load_models_and_pipeline(model_save_path, predict_season):
    """Load every saved artifact for a season from ``model_save_path``.

    ``predict_season`` is only used in the log message. Returns the tuple
    ``(rf_model, xgb_model, pipeline, columns_to_re_add, feature_names)``.
    """
    logger.debug(f"Loading models and pipeline for season {predict_season} from {model_save_path}")

    artifact_files = (
        "best_rf_model.pkl",
        "best_xgb_model.pkl",
        "preprocessing_pipeline.pkl",
        "columns_to_re_add.pkl",
        "feature_names.pkl",
    )
    # Loaded in the order listed above, which is also the return order
    rf_model, xgb_model, pipeline, columns_to_re_add, feature_names = [
        joblib.load(f"{model_save_path}/{file_name}") for file_name in artifact_files
    ]

    return rf_model, xgb_model, pipeline, columns_to_re_add, feature_names

def load_and_preprocess_test_data(file_path, predict_season, model_save_path):
    """Preprocess the target season and transform it with the saved pipeline.

    Args:
        file_path: Path of the source CSV.
        predict_season: Season whose rows form the test set.
        model_save_path: Directory holding the saved pipeline and artifacts.

    Returns:
        A tuple ``(X_test, X_test_transformed, y_test, columns_to_re_add,
        feature_names, pipeline)``: the untransformed test features, the
        pipeline output for them, the 'SalaryPct' target, and the context
        columns, feature names and fitted pipeline loaded from disk.
    """
    logger.debug("Loading and preprocessing test data...")
    _, _, pipeline_data, _ = preprocessed_datasets(file_path)

    # Filter data for the prediction season
    _, test_data = filter_seasons(pipeline_data, predict_season)

    X_test = test_data.drop('SalaryPct', axis=1)
    y_test = test_data['SalaryPct']

    # Debugging the columns before transformation
    logger.debug(f"Test data shape before transformation: {X_test.shape}")
    logger.debug(f"Test data columns before transformation: {X_test.columns.tolist()}")

    # Check unique values for Injury_Risk and Days_Injured_Percentage before transformation
    logger.debug(f"Unique values of 'Injury_Risk' before transformation: {X_test['Injury_Risk'].unique()}")
    logger.debug(f"Unique values of 'Days_Injured_Percentage' before transformation: {test_data['Days_Injured_Percentage'].unique()}")

    # The models returned here are not needed in this function
    _, _, pipeline, columns_to_re_add, feature_names = load_models_and_pipeline(model_save_path, predict_season)

    # Separate and log the numerical columns
    X_test_numeric = X_test[NUMERIC_FEATURES]
    logger.debug(f"Numerical data before transformation (shape): {X_test_numeric.shape}")
    logger.debug(f"Numerical data before transformation (columns): {X_test_numeric.columns.tolist()}")
    logger.debug(f"Sample of numerical data: {X_test_numeric.head()}")

    # Transform the data
    X_test_transformed = pipeline.transform(X_test)

    # Re-run only the fitted scaler on the numeric columns, purely for the debug output below
    numeric_transformer = pipeline.named_transformers_['num']['scaler']
    transformed_numeric = numeric_transformer.transform(X_test_numeric)
    logger.debug(f"Transformed numerical data shape: {transformed_numeric.shape}")
    logger.debug(f"Sample of transformed numerical data: {transformed_numeric[:5]}")

    # Check passthrough Injury_Risk after transformation (since it's not transformed)
    logger.debug(f"Unique values of 'Injury_Risk' after transformation (passthrough): {X_test['Injury_Risk'].unique()}")

    # The column logged here is Days_Injured_Percentage; the message used to name it 'Total_Days_Injured'
    days_injured_position = NUMERIC_FEATURES.index('Days_Injured_Percentage')
    logger.debug(f"Transformed 'Days_Injured_Percentage' values (numeric feature): {transformed_numeric[:, days_injured_position][:5]}")

    # Debug the final transformed data shape
    logger.debug(f"Shape of transformed data: {X_test_transformed.shape}")

    return X_test, X_test_transformed, y_test, columns_to_re_add, feature_names, pipeline

def _build_prediction_frame(features_df, context_df, predictions):
    """Join features, context and one model's predictions; add salaries in millions."""
    predictions_df = pd.DataFrame(predictions, columns=['Predicted_SalaryPct'], index=features_df.index)
    frame = pd.concat([features_df, context_df, predictions_df], axis=1)

    cap = frame['Salary_Cap_Inflated']
    # Predicted salary (in millions) = predicted cap share * inflated cap
    frame['Predicted_Salary'] = (frame['Predicted_SalaryPct'] * cap / 1_000_000).round(2)
    # Actual salary (in millions) = actual cap share * inflated cap
    frame['Salary'] = (frame['SalaryPct'] * cap / 1_000_000).round(2)
    return frame


def inverse_transform_and_add_context(rf_predictions, xgb_predictions, X_test, X_test_transformed, columns_to_re_add, feature_names, pipeline):
    """Turn transformed test features and predictions into readable result frames.

    Args:
        rf_predictions: Random forest predictions of 'SalaryPct', one per test row.
        xgb_predictions: XGBoost predictions of 'SalaryPct', one per test row.
        X_test: Untransformed test features; supplies the row index and the
            passthrough columns.
        X_test_transformed: Pipeline output for ``X_test`` with columns laid
            out as numeric, one-hot, passthrough.
        columns_to_re_add: Context columns; aligned to ``X_test.index``. Must
            provide 'Salary_Cap_Inflated' and 'SalaryPct'.
        feature_names: Unused; kept for interface compatibility.
        pipeline: Fitted ColumnTransformer with 'num' and 'cat' transformers.

    Returns:
        A tuple ``(final_rf_df, final_xgb_df)``. Each frame holds the
        inverse-transformed features, the context columns, and the
        'Predicted_SalaryPct', 'Predicted_Salary' and 'Salary' columns
        (salaries in millions). If ``columns_to_re_add`` contains 'Season',
        the frames carry a duplicate 'Season' column.
    """
    logger.debug(f"Shape of rf_predictions: {rf_predictions.shape}")

    # A ColumnTransformer may return a scipy sparse matrix when the output is
    # sparse enough; densify so slicing and the scaler's inverse work either way.
    if hasattr(X_test_transformed, 'toarray'):
        X_test_transformed = X_test_transformed.toarray()

    n_numeric = len(NUMERIC_FEATURES)

    # Undo the scaling of the numeric block (first n_numeric columns)
    scaler = pipeline.named_transformers_['num']['scaler']
    numeric_df = pd.DataFrame(
        scaler.inverse_transform(X_test_transformed[:, :n_numeric]),
        columns=NUMERIC_FEATURES,
        index=X_test.index
    )

    # Undo the one-hot encoding: everything between the numeric block and the
    # trailing passthrough columns
    encoder = pipeline.named_transformers_['cat']['onehot']
    one_hot_block = X_test_transformed[:, n_numeric:-len(PASSTHROUGH_FEATURES)]
    categorical_df = pd.DataFrame(
        encoder.inverse_transform(one_hot_block),
        columns=ONE_HOT_ENCODE_CATEGORICAL_FEATURES,
        index=X_test.index
    )

    # Passthrough features (Season, Injury_Risk) are taken from X_test directly
    features_df = pd.concat([numeric_df, categorical_df, X_test[PASSTHROUGH_FEATURES]], axis=1)

    # Map Injury_Risk codes back to their labels
    features_df = inverse_transform_injury_risk(features_df)

    # Align the context columns to the test rows
    context_columns_df = pd.DataFrame(columns_to_re_add, index=X_test.index)

    final_rf_df = _build_prediction_frame(features_df, context_columns_df, rf_predictions)
    final_xgb_df = _build_prediction_frame(features_df, context_columns_df, xgb_predictions)

    return final_rf_df, final_xgb_df


def save_predictions(final_rf_df, final_xgb_df, model_save_path):
    """Write both prediction frames as CSV files (without the index) under ``model_save_path``."""
    rf_save_path = f"{model_save_path}/rf_predictions.csv"
    xgb_save_path = f"{model_save_path}/xgb_predictions.csv"

    outputs = ((final_rf_df, rf_save_path), (final_xgb_df, xgb_save_path))
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    logger.info(f"Predictions saved to {rf_save_path} and {xgb_save_path}")


# Main function to run the prediction pipeline
def make_predictions(file_path, predict_season, model_save_path):
    """Run the prediction pipeline for one season and save the results.

    Preprocesses the target season, predicts with the saved RF and XGBoost
    models, rebuilds readable frames with context columns, removes
    duplicated columns, writes both frames to CSV and returns them as
    ``(final_rf_df, final_xgb_df)``.
    """
    def _without_duplicate_columns(frame):
        # Deduplicate only when a 'Season' column is present, keeping the first occurrence
        if 'Season' not in frame.columns:
            return frame
        return frame.loc[:, ~frame.columns.duplicated()]

    logger.debug("Starting prediction pipeline...")

    # Step 1: Load and preprocess test data
    (X_test, X_test_transformed, y_test,
     columns_to_re_add, feature_names, pipeline) = load_and_preprocess_test_data(
        file_path, predict_season, model_save_path
    )

    logger.debug(f"Shape of X_test_transformed before predictions: {X_test_transformed.shape}")

    # Step 2: Load models
    rf_model, xgb_model, _, _, _ = load_models_and_pipeline(model_save_path, predict_season)

    # Step 3: Make predictions
    rf_predictions = rf_model.predict(X_test_transformed)
    xgb_predictions = xgb_model.predict(X_test_transformed)

    logger.debug(f"RF Predictions: {rf_predictions[:5]}")
    logger.debug(f"XGB Predictions: {xgb_predictions[:5]}")

    # Step 4: Inverse transform and add context
    final_rf_df, final_xgb_df = inverse_transform_and_add_context(
        rf_predictions, xgb_predictions, X_test, X_test_transformed,
        columns_to_re_add, feature_names, pipeline
    )

    # Drop one of the duplicate 'Season' columns
    final_rf_df = _without_duplicate_columns(final_rf_df)
    final_xgb_df = _without_duplicate_columns(final_xgb_df)

    # Step 5: Save predictions
    save_predictions(final_rf_df, final_xgb_df, model_save_path)

    return final_rf_df, final_xgb_df

if __name__ == "__main__":
    # Script entry point: predict one season with previously saved models
    predict_season = 2022
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    model_save_path = f'../data/models/season_{predict_season}'

    rf_final_df, xgb_final_df = make_predictions(
        file_path, predict_season, model_save_path
    )

    # Show the random forest result frame and its columns
    print(rf_final_df)
    print(rf_final_df.columns)


Overwriting ../src/salary_model_training/model_predictor.py


In [75]:
%%writefile ../src/salary_model_training/util_functions.py

import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from .data_loader_preprocessor import preprocessed_datasets
from .model_trainer import load_and_preprocess_data, train_and_save_models, evaluate_models  # Add this line
from .model_predictor import make_predictions
import logging

# Configure logging at import time: DEBUG level with timestamped records.
# `logger` is the module-level logger used by the functions below.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Feature-name prefixes in the form filter_categorical_features expects
# (it matches them with str.startswith).
# NOTE(review): presumably these are the one-hot column prefixes for
# Position and Team -- confirm against the callers that pass this constant.
CATEGORICAL_FEATURES = ['Position_', 'Team_']

def load_evaluation_metrics(model_save_path):
    """Return the saved evaluation results, or None when the file is missing.

    Looks for 'evaluation_results.pkl' under ``model_save_path``; prints a
    notice when it does not exist.
    """
    eval_save_path = f"{model_save_path}/evaluation_results.pkl"

    if not os.path.exists(eval_save_path):
        print("Evaluation results not found.")
        return None

    return joblib.load(eval_save_path)

def check_or_train_model(file_path, model_save_path, season_year):
    """Return predictions for a season, training the models first if needed.

    Args:
        file_path: Path of the source CSV.
        model_save_path: Directory holding (or receiving) the models,
            preprocessing artifacts and prediction CSVs.
        season_year: Season to predict; a string is converted with ``int``.

    Returns:
        A DataFrame of predictions. When 'rf_predictions.csv' already exists
        it is the content of that file; otherwise it is the freshly built RF
        and XGBoost frames concatenated column-wise, so column names repeat
        in that case.
    """
    logger.debug(f"Received season_year: {season_year}, Type: {type(season_year)}")

    # Convert season_year to an integer if it's a string
    if isinstance(season_year, str):
        season_year = int(season_year)
        logger.debug(f"Converted season_year to int: {season_year}, Type: {type(season_year)}")

    predictions_file_path = f'{model_save_path}/rf_predictions.csv'
    if os.path.exists(predictions_file_path):
        logger.debug(f"Predictions file found for {season_year}.")
        predictions_df = pd.read_csv(predictions_file_path)
    else:
        logger.debug(f"Predictions not available for {season_year}. Training the model now...")

        # load_and_preprocess_data writes its artifacts into model_save_path
        # before train_and_save_models gets a chance to create the directory.
        os.makedirs(model_save_path, exist_ok=True)

        # Train and evaluate
        X_train, X_test, y_train, y_test = load_and_preprocess_data(file_path, season_year, model_save_path)
        train_and_save_models(X_train, y_train, model_save_path)
        evaluate_models(X_test, y_test, model_save_path)

        # Generate predictions (this also writes the prediction CSVs)
        rf_final_df, xgb_final_df = make_predictions(file_path, season_year, model_save_path)
        predictions_df = pd.concat([rf_final_df, xgb_final_df], axis=1)

    return predictions_df



def display_model_metrics(model_save_path):
    """Print the saved RMSE, MAE, R² and MSE of the Random Forest and XGBoost models.

    Prints a fallback message when no (or empty) evaluation results are found.
    """
    eval_results = load_evaluation_metrics(model_save_path)

    if not eval_results:
        print("No evaluation metrics found.")
        return

    print("\nModel Performance Metrics:")

    metric_labels = (('rmse', 'RMSE'), ('mae', 'MAE'), ('r2', 'R²'), ('mse', 'MSE'))
    # The XGBoost section is separated from the previous one by a blank line
    sections = (('rf', 'Random Forest', ''), ('xgb', 'XGBoost', '\n'))

    for key_prefix, model_label, section_lead in sections:
        for position, (key_suffix, metric_label) in enumerate(metric_labels):
            lead = section_lead if position == 0 else ''
            value = eval_results[f'{key_prefix}_{key_suffix}']
            print(f"{lead}{model_label} {metric_label}: {value:.4f}")

def filter_categorical_features(importance_df, categorical_features):
    """Return the rows whose 'Feature' name starts with none of the given prefixes."""
    prefixes = tuple(categorical_features)
    is_categorical = importance_df['Feature'].str.startswith(prefixes)
    return importance_df.loc[~is_categorical]

def display_feature_importance(model, feature_names, categorical_features):
    """Return the model's feature importances, sorted descending, minus categorical features.

    Returns None (after printing a notice) when the model has no
    ``feature_importances_`` attribute.
    """
    if not hasattr(model, "feature_importances_"):
        print("This model does not support feature importance visualization.")
        return None

    importances = model.feature_importances_
    n_features = len(importances)
    print(f"Number of features in model: {n_features}")

    # Truncate the names to the number of importances in case more names were supplied
    ranked = pd.DataFrame({
        'Feature': feature_names[:n_features],
        'Importance': importances,
    }).sort_values(by="Importance", ascending=False)

    return filter_categorical_features(ranked, categorical_features)

def plot_feature_importance(feature_importances_df, model_name):
    """Draw a horizontal bar chart of feature importances on a new 10x6 figure.

    ``feature_importances_df`` must provide 'Feature' and 'Importance' columns.
    The ``matplotlib.pyplot`` module is returned so the caller can invoke
    ``.show()`` (or similar) on it.
    """
    plt.figure(figsize=(10, 6))
    axes = plt.gca()
    axes.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
    axes.set_xlabel('Importance')
    axes.set_ylabel('Features')
    axes.set_title(f'{model_name} Feature Importance')
    # Flip the y-axis so the first row of the frame is drawn at the top.
    axes.invert_yaxis()
    return plt

def identify_overpaid_underpaid(predictions_df, top_n=10):
    """Rank players by the gap between actual and predicted salary.

    Args:
        predictions_df: DataFrame that provides at least the 'Player',
            'Salary' and 'Predicted_Salary' columns. Duplicate column labels
            are tolerated; only the first occurrence of each is kept. The
            caller's frame is not modified.
        top_n: Maximum number of rows returned in each group.

    Returns:
        A tuple ``(overpaid, underpaid)`` of DataFrames, each carrying an added
        'Salary_Difference' column (Salary - Predicted_Salary). ``overpaid``
        holds rows with a positive difference, largest first; ``underpaid``
        holds rows with a negative difference, most negative first. Both are
        empty when no rows qualify.
    """
    logger.debug("Checking Salary and Predicted_Salary columns")

    # Keep only the first occurrence of each column label. The explicit copy
    # yields an independent frame, so adding 'Salary_Difference' below is never
    # flagged by pandas as an assignment on a slice of the caller's data.
    predictions_df = predictions_df.loc[:, ~predictions_df.columns.duplicated()].copy()

    # .iloc[0] raises IndexError on a frame without rows; diagnostics must not
    # be able to crash the function, so the element types are logged only when
    # there is a first row to inspect.
    if len(predictions_df) > 0:
        logger.debug(f"Salary type: {type(predictions_df['Salary'].iloc[0])}, Predicted_Salary type: {type(predictions_df['Predicted_Salary'].iloc[0])}")
    logger.debug(f"First few Salary values: {predictions_df['Salary'].head()}")
    logger.debug(f"First few Predicted_Salary values: {predictions_df['Predicted_Salary'].head()}")

    # Positive difference: paid more than predicted; negative: paid less.
    predictions_df['Salary_Difference'] = predictions_df['Salary'] - predictions_df['Predicted_Salary']

    difference = predictions_df['Salary_Difference']
    overpaid = predictions_df[difference > 0].sort_values('Salary_Difference', ascending=False).head(top_n)
    underpaid = predictions_df[difference < 0].sort_values('Salary_Difference').head(top_n)

    logger.debug(f"Top overpaid: {overpaid[['Player', 'Salary', 'Predicted_Salary', 'Salary_Difference']].head()}")
    logger.debug(f"Top underpaid: {underpaid[['Player', 'Salary', 'Predicted_Salary', 'Salary_Difference']].head()}")

    return overpaid, underpaid



def display_overpaid_underpaid(predictions_df, top_n=10):
    """Print the ``top_n`` most overpaid and most underpaid players.

    The two groups come from ``identify_overpaid_underpaid``; for each one a
    heading is printed, followed by the 'Player', 'Team', 'Salary',
    'Predicted_Salary' and 'Salary_Difference' columns.
    """
    overpaid, underpaid = identify_overpaid_underpaid(predictions_df, top_n)
    shown_columns = ['Player', 'Team', 'Salary', 'Predicted_Salary', 'Salary_Difference']

    for group_label, players in (("Overpaid", overpaid), ("Underpaid", underpaid)):
        print(f"\nTop {top_n} {group_label} Players:")
        print(players[shown_columns])




import seaborn as sns
import matplotlib.pyplot as plt

def plot_feature_distribution(data, feature):
    """Return a new figure holding a 20-bin histogram of ``data[feature]``.

    The axes are titled "Distribution of <feature>", with the feature name on
    the x-axis and "Frequency" on the y-axis.
    """
    logger.debug(f"Plotting distribution for feature: {feature}")
    figure, axes = plt.subplots()
    column = data[feature]
    column.hist(bins=20, ax=axes)
    axes.set_title(f"Distribution of {feature}")
    axes.set_xlabel(feature)
    axes.set_ylabel("Frequency")
    return figure


def plot_correlation_heatmap(data):
    """Return a 12x10 figure showing the correlation matrix of numeric columns.

    Only columns selected by ``select_dtypes(include=[np.number])`` take part;
    the matrix is rendered with seaborn's heatmap using the 'coolwarm' palette
    and no cell annotations.
    """
    logger.debug("Plotting correlation heatmap for numeric features.")
    correlations = data.select_dtypes(include=[np.number]).corr()
    figure, axes = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlations, annot=False, cmap='coolwarm', ax=axes)
    axes.set_title('Correlation Heatmap')
    return figure

def test_data_analysis_functions():
    """Manually exercise the two plotting helpers on the processed dataset.

    Loads the CSV through ``preprocessed_datasets``, then shows a distribution
    plot for 'SalaryPct' and a correlation heatmap, both built from the third
    element of the returned tuple.
    """
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    # preprocessed_datasets returns four values; only the third is used here.
    _, _, pipeline_data, _ = preprocessed_datasets(file_path)

    feature = 'SalaryPct'
    logger.debug(f"Testing feature distribution for: {feature}")
    # Showing the figure is the check: a failure surfaces as an exception.
    plot_feature_distribution(pipeline_data, feature).show()

    logger.debug("Testing correlation heatmap plot.")
    plot_correlation_heatmap(pipeline_data).show()



def main_test_function():
    """Run every utility in this module end to end as a manual smoke test.

    Steps, in order: train or load the models for the 2021 season via
    ``check_or_train_model``, print the stored metrics, load the saved Random
    Forest model and feature names with joblib, build and plot the feature
    importance table, print the over/underpaid players, and finally run the
    data-analysis plot checks. Paths are relative to the working directory.
    """
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    season_year = '2021'  # initially a string
    logger.debug(f"Original season_year: {season_year}, Type: {type(season_year)}")

    # Convert to integer if necessary
    if isinstance(season_year, str):
        season_year = int(season_year)
        # Report the type after conversion (not the value a second time).
        logger.debug(f"Converted season_year to int: {season_year}, Type: {type(season_year)}")

    model_save_path = f'../data/models/season_{season_year}'

    # Test check_or_train_model
    predictions_df = check_or_train_model(file_path, model_save_path, season_year)
    logger.debug(f"Predictions DataFrame:\n{predictions_df.head()}")

    # Test display_model_metrics
    display_model_metrics(model_save_path)

    # Load a model for testing feature importance
    rf_model_path = f'{model_save_path}/best_rf_model.pkl'
    rf_model = joblib.load(rf_model_path)
    feature_names_path = f'{model_save_path}/feature_names.pkl'
    feature_names = joblib.load(feature_names_path)

    # Test display_feature_importance with filtering categorical features
    feature_importances_df = display_feature_importance(rf_model, feature_names, CATEGORICAL_FEATURES)

    # Test plot_feature_importance. display_feature_importance returns None for
    # a model without feature_importances_, and plotting None would fail, so
    # the plot is skipped in that case.
    if feature_importances_df is not None:
        plot = plot_feature_importance(feature_importances_df, "Random Forest")
        plot.show()

    # Test display_overpaid_underpaid
    display_overpaid_underpaid(predictions_df)

    # Test the data analysis functions
    test_data_analysis_functions()

# Run the manual end-to-end check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_test_function()




Overwriting ../src/salary_model_training/util_functions.py
