In [2]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import os

# Define utility functions
def get_project_root():
    """Return the directory two levels above the current working directory.

    The result is derived from ``os.getcwd()``, so it depends on where the
    process was started rather than on where this file is stored.
    """
    parent_of_cwd = os.path.dirname(os.getcwd())
    return os.path.dirname(parent_of_cwd)

def load_data(inflated=False):
    """Read the processed salary CSV from ``<root>/data/processed``.

    Args:
        inflated: When true, read the inflated-cap file and overwrite the
            'Salary Cap' column with the values of 'Salary_Cap_Inflated'.

    Returns:
        The loaded DataFrame, with 'Season' replaced by the integer made of
        its first four characters.

    Raises:
        FileNotFoundError: The expected CSV file is not on disk.
        KeyError: The CSV has no 'Salary' column.
    """
    if inflated:
        file_name = 'final_salary_data_with_yos_and_inflated_cap.csv'
    else:
        file_name = 'final_salary_data_with_yos_and_cap.csv'
    file_path = os.path.join(get_project_root(), 'data', 'processed', file_name)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist. Please check the file path.")

    df = pd.read_csv(file_path)
    if 'Salary' not in df.columns:
        raise KeyError("The 'Salary' column is missing in the dataset.")

    # Keep only the leading four characters of the season label as an int.
    season_prefix = df['Season'].str[:4]
    df['Season'] = season_prefix.astype(int)

    # Downstream code reads 'Salary Cap'; point it at the inflated figures.
    if inflated:
        df['Salary Cap'] = df['Salary_Cap_Inflated']

    print(f"Loaded data (inflated={inflated}):")
    print(df.head())
    print(df.describe())
    return df

# Feature engineering and preprocessing functions
def feature_engineering(df, use_inflated_data=False):
    """Add per-game, team-record and salary-share columns to ``df`` in place.

    Args:
        df: Frame providing 'PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'GP',
            'Wins', 'Losses', 'Salary', 'Salary Cap' and 'Player'.
        use_inflated_data: Not used by this function.

    Returns:
        The same ``df`` object, now carrying the derived columns.
    """
    # New column -> season-total column it is derived from (divided by 'GP').
    per_game_sources = {
        'PPG': 'PTS',
        'APG': 'AST',
        'RPG': 'TRB',
        'SPG': 'STL',
        'BPG': 'BLK',
        'TOPG': 'TOV',
    }
    for per_game_col, total_col in per_game_sources.items():
        df[per_game_col] = df[total_col] / df['GP']

    games_decided = df['Wins'] + df['Losses']
    df['WinPct'] = df['Wins'] / games_decided
    df['Availability'] = df['GP'] / 82

    df['SalaryPct'] = df['Salary'] / df['Salary Cap']
    # Row-to-row relative change within each player; a player's first row
    # has no predecessor and is filled with 0.
    share_by_player = df.groupby('Player')['SalaryPct']
    growth = share_by_player.pct_change()
    df['SalaryGrowth'] = growth.fillna(0)

    preview_columns = ['Player', 'PPG', 'APG', 'RPG', 'SalaryPct', 'SalaryGrowth']
    print("Feature engineered data:")
    print(df[preview_columns].head())
    return df

# Loading model and scaler
def load_model_and_scaler(model_name, inflated=False):
    """Load the saved estimator, scaler and feature list from ``<root>/data/models``.

    Args:
        model_name: Prefix of the model file, i.e.
            ``{model_name}_salary_prediction_model{suffix}.joblib``.
        inflated: When true, every file name carries the ``_inflated`` suffix.

    Returns:
        A ``(model, scaler, selected_features)`` tuple as stored by joblib.
    """
    suffix = '_inflated' if inflated else ''
    models_dir = os.path.join(get_project_root(), 'data', 'models')

    model = joblib.load(
        os.path.join(models_dir, f'{model_name}_salary_prediction_model{suffix}.joblib')
    )
    scaler = joblib.load(os.path.join(models_dir, f'scaler{suffix}.joblib'))
    selected_features = joblib.load(
        os.path.join(models_dir, f'selected_features{suffix}.joblib')
    )

    print(f"Loaded model and scaler (inflated={inflated}):")
    print(f"Selected features: {selected_features}")

    # Report whichever fitted attributes this estimator happens to expose.
    if hasattr(model, 'coef_'):
        print(f"Model coefficients: {model.coef_}")
    if hasattr(model, 'intercept_'):
        print(f"Model intercept: {model.intercept_}")
    if hasattr(model, 'feature_importances_'):
        print(f"Feature importances: {model.feature_importances_}")

    return model, scaler, selected_features

# Making predictions
def make_predictions(df, model, scaler, selected_features, use_inflated_data):
    """Predict salary share and salary for every row of ``df``.

    The engineered features are computed, 'Age' and 'Season' are advanced by
    one, the selected features are mean-imputed and scaled, and the model's
    output is multiplied by the salary cap column.

    Args:
        df: Frame as returned by ``load_data``. ``feature_engineering`` adds
            its derived columns to this object; nothing else on it is changed.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        selected_features: Column names fed to the model, in model order.
        use_inflated_data: Selects 'Salary_Cap_Inflated' instead of
            'Salary Cap' when converting the predicted share to a salary.

    Returns:
        A new DataFrame with the shifted 'Age'/'Season' and the added
        'Predicted_Salary_Pct', 'Predicted_Salary' and 'Salary_Change' columns.

    Raises:
        ValueError: A selected feature or the salary cap column is missing.
    """
    # feature_engineering writes its columns onto the caller's frame. main()
    # later hands that same frame to train_model, which reads those columns,
    # so this side effect is kept on purpose.
    df = feature_engineering(df, use_inflated_data)

    # Everything below works on a private copy. Without it each call would
    # bump the caller's 'Age' and 'Season' again and leave prediction columns
    # behind on the input.
    df = df.copy()
    df['Age'] += 1
    df['Season'] += 1

    missing_features = [f for f in selected_features if f not in df.columns]
    if missing_features:
        raise ValueError(f"Missing features in dataframe: {missing_features}")

    X = df[selected_features]

    # The imputer is fit on the rows being predicted; no saved imputer is
    # loaded alongside the model.
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)
    print("Imputed data:")
    print(pd.DataFrame(X_imputed, columns=selected_features).head())

    X_scaled = scaler.transform(X_imputed)
    print("Scaled data:")
    print(pd.DataFrame(X_scaled, columns=selected_features).head())

    df['Predicted_Salary_Pct'] = model.predict(X_scaled)

    salary_cap_column = 'Salary_Cap_Inflated' if use_inflated_data else 'Salary Cap'

    if salary_cap_column not in df.columns:
        raise ValueError(f"Salary cap column '{salary_cap_column}' not found in dataframe")

    df['Predicted_Salary'] = df['Predicted_Salary_Pct'] * df[salary_cap_column]
    df['Salary_Change'] = df['Predicted_Salary'] - df['Salary']

    print(f"SalaryPct values used for predictions (inflated={use_inflated_data}):")
    print(df['SalaryPct'].head())
    print(f"Salary Cap column values (inflated={use_inflated_data}):")
    print(df[salary_cap_column].head())
    print("Predictions:")
    print(df[['Player', 'Position', 'Age', 'Salary', 'Predicted_Salary', 'Salary_Change']].head())
    return df

# Training and cross-validation
def train_model(df, model_name, inflated=False):
    """Cross-validate the saved model and grid-search its hyperparameters.

    The model, scaler and feature list are loaded from disk, the features are
    mean-imputed and re-scaled, and 5-fold scores are printed. A grid search
    over the tree-ensemble hyperparameters follows, restricted to the ones
    the loaded estimator actually accepts.

    Args:
        df: Salary frame. If the selected features or 'SalaryPct' are missing,
            they are computed on a copy via ``feature_engineering``.
        model_name: Passed to ``load_model_and_scaler``.
        inflated: Selects the inflated model artefacts.

    Returns:
        None. Results are printed.
    """
    model, scaler, selected_features = load_model_and_scaler(model_name, inflated)

    # The derived columns exist only if feature_engineering already ran on
    # this frame. Build them on a copy when they are absent so the function
    # can be called directly on load_data output.
    required_columns = [*selected_features, 'SalaryPct']
    if any(column not in df.columns for column in required_columns):
        df = feature_engineering(df.copy(), inflated)

    X = df[selected_features]
    y = df['SalaryPct']

    # NOTE(review): the imputer and scaler are fit on all rows before the CV
    # split, so each fold's preprocessing has seen its held-out rows.
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)
    X_scaled = scaler.fit_transform(X_imputed)

    # Cross-validation
    cv_scores = cross_val_score(model, X_scaled, y, cv=5)
    print(f"Cross-validation scores (inflated={inflated}): {cv_scores}")
    print(f"Mean CV score (inflated={inflated}): {cv_scores.mean()}")

    # Hyperparameter tuning using GridSearchCV
    candidate_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }
    # GridSearchCV rejects names the estimator does not define, so keep only
    # the hyperparameters that the loaded model exposes.
    supported_params = model.get_params()
    param_grid = {
        name: values
        for name, values in candidate_grid.items()
        if name in supported_params
    }
    if not param_grid:
        print(
            f"Skipping grid search (inflated={inflated}): "
            f"{type(model).__name__} has none of the tuned hyperparameters."
        )
        return

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_search.fit(X_scaled, y)

    print(f"Best parameters (inflated={inflated}): {grid_search.best_params_}")
    print(f"Best score (inflated={inflated}): {grid_search.best_score_}")

# Main testing function
def main():
    """Run prediction and cross-validation on the inflated, then the non-inflated data."""
    model_name = 'random_forest'
    report_columns = ['Player', 'Position', 'Age', 'Salary', 'Predicted_Salary', 'Salary_Change']

    # Both datasets are read before any model work, inflated first.
    inflated_frame = load_data(inflated=True)
    plain_frame = load_data(inflated=False)

    for use_inflated, frame in ((True, inflated_frame), (False, plain_frame)):
        # Load the matching artefacts and predict.
        model, scaler, selected_features = load_model_and_scaler(model_name, inflated=use_inflated)
        predictions = make_predictions(frame, model, scaler, selected_features, use_inflated_data=use_inflated)
        print(predictions[report_columns].head(10))

        # Cross-validation and grid search on the same frame.
        train_model(frame, model_name, inflated=use_inflated)


if __name__ == "__main__":
    main()


Loaded data (inflated=True):
          Player    Salary  Season       Position   Age Team        TeamID  \
0  Stephen Curry  51915615    2023          Guard  36.0  GSW  1.610613e+09   
1   Jrue Holiday  37620347    2023          Guard  34.0  BOS  1.610613e+09   
2   James Harden  35680595    2023          Guard  34.0  LAC  1.610613e+09   
3  DeMar DeRozan  28600000    2023  Guard-Forward  34.0  CHI  1.610613e+09   
4     Taj Gibson   4776302    2023        Forward  39.0  NYK  1.610613e+09   

   Years of Service    GP    GS  ...  DBPM  BPM  VORP  Wins  Losses  \
0              14.0  74.0  74.0  ...  -1.1  5.2   4.4  46.0    36.0   
1              14.0  69.0  69.0  ...   1.1  2.1   2.4  64.0    18.0   
2              14.0  72.0  72.0  ...   0.3  4.1   3.8  51.0    31.0   
3              14.0  79.0  79.0  ...  -0.3  1.8   2.8  39.0    43.0   
4              14.0  16.0   1.0  ...   0.1 -5.3  -0.2  14.0    68.0   

   Min Salary  Max Salary    Salary Cap  2022 Dollars  Salary_Cap_Inflated 



Cross-validation scores (inflated=True): [0.59054571 0.61373302 0.63228723 0.41114382 0.34030817]
Mean CV score (inflated=True): 0.5176035907877679
Best parameters (inflated=True): {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score (inflated=True): 0.5185662582370855
Loaded model and scaler (inflated=False):
Selected features: ['Age', 'Years of Service', 'PPG', 'APG', 'RPG', 'SPG', 'TOPG', 'FT%', 'PER', 'Availability']
Feature importances: [0.02379492 0.29380221 0.5245575  0.0210303  0.01729976 0.03036544
 0.04282497 0.01068902 0.01897714 0.01665875]
Feature engineered data:
          Player        PPG       APG       RPG  SalaryPct  SalaryGrowth
0  Stephen Curry  26.432432  5.121622  4.459459   0.381674           0.0
1   Jrue Holiday  12.463768  4.826087  5.405797   0.276577           0.0
2   James Harden  16.555556  8.527778  5.125000   0.262317           0.0
3  DeMar DeRozan  24.012658  5.329114  4.303797   0.210262           0.0
4     T



Loaded model and scaler (inflated=False):
Selected features: ['Age', 'Years of Service', 'PPG', 'APG', 'RPG', 'SPG', 'TOPG', 'FT%', 'PER', 'Availability']
Feature importances: [0.02379492 0.29380221 0.5245575  0.0210303  0.01729976 0.03036544
 0.04282497 0.01068902 0.01897714 0.01665875]
Cross-validation scores (inflated=False): [0.82592686 0.79793971 0.8444336  0.76577415 0.7062056 ]
Mean CV score (inflated=False): 0.7880559857162807
Best parameters (inflated=False): {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best score (inflated=False): 0.7901542550463393
