# In [58]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.stats import norm
import math


#Load player data and format into usable features and variables
#Link to kaggle page : https://www.kaggle.com/datasets/eoinamoore/historical-nba-data-and-player-box-scores?select=PlayerStatistics.csv

def load_and_preprocess_player_data(ppg_file, player_name=None):
    """Load per-game player box scores and derive the model's feature columns.

    Reads the CSV, optionally keeps a single player's rows, then adds
    calendar, rest-day, rolling-average and interaction columns.

    Args:
        ppg_file: Path or file-like object accepted by ``pandas.read_csv``.
            The data must contain the columns ``firstName``, ``lastName``,
            ``gameDate``, ``points``, ``numMinutes`` and ``reboundsTotal``.
        player_name: Optional ``"First Last"`` name. When truthy, only rows
            whose ``firstName + ' ' + lastName`` equals it are kept.

    Returns:
        A ``(ppg_data, features)`` tuple: the processed DataFrame, sorted by
        player name and game date, and a fixed list of feature column names.
        The list is not checked against the columns actually present.
    """
    ppg_data = pd.read_csv(ppg_file)

    ppg_data['first_last'] = ppg_data['firstName'] + ' ' + ppg_data['lastName']

    if player_name:
        # Explicit copy: the column assignments below must operate on an
        # independent frame, not on a slice of the full table (otherwise
        # pandas emits SettingWithCopyWarning).
        ppg_data = ppg_data[ppg_data['first_last'] == player_name].copy()

    ppg_data['gameDate'] = pd.to_datetime(ppg_data['gameDate'])
    ppg_data = ppg_data.sort_values(by=['first_last', 'gameDate'])

    ppg_data['game_month'] = ppg_data['gameDate'].dt.month
    ppg_data['game_dayofweek'] = ppg_data['gameDate'].dt.dayofweek

    # A player's first game has no previous date; treat it as a 7-day gap.
    ppg_data['days_since_last_game'] = (
        ppg_data.groupby('first_last')['gameDate'].diff().dt.days.fillna(7)
    )

    # Per-player rolling means of points. NOTE: each window ends at (and
    # includes) the current game's own points.
    for window in (3, 5):
        ppg_data[f'rolling_avg_points_{window}'] = (
            ppg_data.groupby('first_last')['points']
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)  # drop the group key to realign on the row index
        )

    # Interaction terms: minutes played times points / total rebounds.
    ppg_data['mp_pts'] = ppg_data['numMinutes'] * ppg_data['points']
    ppg_data['mp_trb'] = ppg_data['numMinutes'] * ppg_data['reboundsTotal']

    # Replace remaining missing numeric values with the column mean.
    ppg_data.fillna(ppg_data.mean(numeric_only=True), inplace=True)

    # Downcast numerics
    for col in ppg_data.select_dtypes(include='number').columns:
        ppg_data[col] = pd.to_numeric(ppg_data[col], downcast='float')

    # Define the features for the model
    features = [
        'numMinutes', 'blocks', 'steals', 'foulsPersonal', 'turnovers', 'days_since_last_game',
        'assists', 'fieldGoalsAttempted', 'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
        'freeThrowsAttempted', 'freeThrowsMade', 'rolling_avg_points_3', 'rolling_avg_points_5',
        'plusMinusPoints', 'reboundsDefensive', 'reboundsOffensive', 'reboundsTotal',
        'mp_pts', 'mp_trb',
        'game_month', 'game_dayofweek', 'playerteamName', 'opponentteamName', 'home'
    ]

    return ppg_data, features



#Imports the excel file and gets the column of residuals to get the standard error of the estimate
def read_xls(residual_file_path, sheet_name=0, **kwargs):
    """Read an Excel sheet with the ``xlrd`` engine, returning ``None`` on failure.

    Args:
        residual_file_path: Path of the workbook to read.
        sheet_name: Sheet selector forwarded to ``pandas.read_excel``
            (defaults to the first sheet).
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_excel``.

    Returns:
        Whatever ``pandas.read_excel`` returns for the given arguments, or
        ``None`` if the file is missing or any other error occurs. Errors are
        reported on stdout rather than raised.
    """
    try:
        return pd.read_excel(
            residual_file_path, sheet_name=sheet_name, engine='xlrd', **kwargs
        )
    except FileNotFoundError:
        # Report the path that was actually requested (the previous message
        # referenced an undefined name and raised NameError here).
        print(f"Error: File not found at '{residual_file_path}'")
        return None
    except Exception as e:
        # Deliberate best-effort: any other read failure is reported, not raised.
        print(f"An error occurred: {e}")
        return None


# Load and preprocess opponent stats data
 def load_opponent_stats(opponent_file):
     """Load opponent stats and compute a per-team defensive allowance rating.

     The rating is ``10 / (0.5 * opp_trb_per_game + 0.3 * opp_stl_per_game
     + 0.2 * opp_blk_per_game)``, so larger opponent per-game totals give a
     lower rating.

     Args:
         opponent_file: Path or file-like object accepted by
             ``pandas.read_csv``. The data must contain the columns ``team``,
             ``opp_trb_per_game``, ``opp_stl_per_game`` and
             ``opp_blk_per_game``.

     Returns:
         A new DataFrame with the columns ``team`` (surrounding whitespace
         stripped) and ``defensive_allowance_rating``.
     """
     opp_stats = pd.read_csv(opponent_file)

     # Weighted sum of what opponents record per game against each team.
     weighted_allowance = (
         0.5 * opp_stats['opp_trb_per_game']
         + 0.3 * opp_stats['opp_stl_per_game']
         + 0.2 * opp_stats['opp_blk_per_game']
     )
     opp_stats['defensive_allowance_rating'] = 1 / weighted_allowance * 10

     # Copy the two columns so the cleanup below edits an independent frame
     # rather than a slice of opp_stats (avoids SettingWithCopyWarning).
     team_defensive_allowance = opp_stats[['team', 'defensive_allowance_rating']].copy()

     # Clean column values before merging
     team_defensive_allowance['team'] = team_defensive_allowance['team'].str.strip()

     return team_defensive_allowance