# In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.stats import norm
import math


# Normalization helper
def normalize_features(X):
    """Standardize the columns of ``X`` with a freshly fitted StandardScaler.

    Args:
        X: Feature matrix accepted by ``StandardScaler`` (array-like or
            DataFrame).

    Returns:
        A ``(X_scaled, scaler)`` tuple: the transformed features and the
        fitted scaler, so the same transformation can be reapplied later.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler

# Train/Test Split and Model Training with Random Forest and Linear Regression
def train_models_with_comparison(data, features, target):
    """Fit a linear regression and a random forest and report hold-out RMSE.

    The data is split 80/20 (``random_state=42``) *before* any statistics are
    computed, so the imputation means and the scaler are learned from the
    training rows only and the reported test error is not contaminated by
    test-set information.

    Args:
        data: DataFrame holding the feature and target columns.
        features: List of feature column names.
        target: Name of the target column.

    Returns:
        ``(lr_model, rf_model, scaler, X_train, X_test)`` where ``X_train`` and
        ``X_test`` are the scaled feature matrices. ``lr_model.sigma_`` is set
        to the residual standard error of the linear fit on the training data.
    """
    X = data[features]
    y = data[target]

    # Split first; everything learned below must come from training rows only.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Impute with training means. A column that is entirely missing in the
    # training split falls back to the overall mean so it can still be filled.
    fill_values = X_train_raw.mean().fillna(X.mean())
    X_train_raw = X_train_raw.fillna(fill_values)
    X_test_raw = X_test_raw.fillna(fill_values)

    # Fit the scaler on the training split and reuse it for the test split.
    X_train, scaler = normalize_features(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_residuals = np.asarray(y_train) - lr_model.predict(X_train)

    # Residual standard error: divide by n - p - 1 (slopes + intercept) rather
    # than n, since sigma_ is later used to build a prediction interval.
    dof = len(lr_residuals) - X_train.shape[1] - 1
    if dof > 0:
        lr_model.sigma_ = float(np.sqrt(np.sum(lr_residuals ** 2) / dof))
    else:
        # Too few rows for the unbiased estimate; fall back to the plain std.
        lr_model.sigma_ = float(np.std(lr_residuals))

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Evaluate both models on the held-out split
    lr_mse = mean_squared_error(y_test, lr_model.predict(X_test))
    rf_mse = mean_squared_error(y_test, rf_model.predict(X_test))

    print(f"Linear Regression RMSE: {math.sqrt(lr_mse):.4f}")
    print(f"Random Forest RMSE: {math.sqrt(rf_mse):.4f}")

    return lr_model, rf_model, scaler, X_train, X_test



# Prediction Interval for Linear Regression
def predict_with_prediction_interval(model, scaler, X_train_scaled, sample_features, alpha=0.05):
    """Predict one sample and return a normal-approximation prediction interval.

    The interval uses the ordinary-least-squares result for a new observation:
    ``Var = sigma^2 * (1 + x0' (X'X)^-1 x0)``, where ``X`` is the training
    design matrix. When the model was fitted with an intercept, a column of
    ones is added to ``X`` and a leading 1 to ``x0`` so the leverage term
    matches the model that was actually estimated.

    Args:
        model: Fitted regressor exposing ``predict`` and a ``sigma_`` attribute
            (residual standard deviation). ``fit_intercept`` is read if
            present and assumed True otherwise.
        scaler: Fitted transformer exposing ``transform``.
        X_train_scaled: Scaled training feature matrix, shape (n, p).
        sample_features: Unscaled features for the sample; only the first row
            is used.
        alpha: Significance level; the interval covers ``1 - alpha``.

    Returns:
        ``(predicted_value, lower_bound, upper_bound)``.
    """
    sample_scaled = np.asarray(scaler.transform(sample_features), dtype=float)
    predicted_value = model.predict(sample_scaled)[0]
    z_score = norm.ppf(1 - alpha / 2)

    design = np.asarray(X_train_scaled, dtype=float)
    x_row = sample_scaled[:1]
    if getattr(model, "fit_intercept", True):
        # Account for the intercept the model estimated alongside the slopes.
        design = np.column_stack([np.ones(design.shape[0]), design])
        x_row = np.column_stack([np.ones(1), x_row])
    x_0 = x_row.T

    # pinv keeps this usable when features are collinear and X'X is singular.
    XTX_inv = np.linalg.pinv(design.T @ design)
    leverage = (x_0.T @ XTX_inv @ x_0)[0, 0]

    # Noise variance plus the uncertainty of the fitted mean, both scaled by sigma^2.
    variance = model.sigma_ ** 2 * (1.0 + leverage)

    margin_error = z_score * np.sqrt(variance)
    lower_bound = predicted_value - margin_error
    upper_bound = predicted_value + margin_error

    return predicted_value, lower_bound, upper_bound



# Simple prediction from Random Forest
def predict_rf(rf_model, scaler, sample_features):
    """Scale ``sample_features`` and return the model's first prediction.

    Args:
        rf_model: Fitted model exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        sample_features: Unscaled feature rows; only the prediction for the
            first row is returned.
    """
    scaled_sample = scaler.transform(sample_features)
    predictions = rf_model.predict(scaled_sample)
    return predictions[0]


def calculate_expected_value_from_prediction(
    predicted_points, sigma, sportsbook_line, odds, over=True
):
    """Expected value of a $100 over/under bet under a normal outcome model.

    The outcome is modelled as Normal(predicted_points, sigma).

    Args:
        predicted_points: Mean of the modelled outcome.
        sigma: Standard deviation of the modelled outcome.
        sportsbook_line: The over/under line.
        odds: American odds (e.g. +120 or -110).
        over: True to price the over, False to price the under.

    Returns:
        ``(ev, prob_win)``: expected profit in dollars on a $100 stake and the
        modelled probability of winning the bet.
    """
    stake = 100

    # Probability mass at or below the line; the over wins on the complement.
    prob_at_or_below = norm.cdf(sportsbook_line, loc=predicted_points, scale=sigma)
    prob_win = 1 - prob_at_or_below if over else prob_at_or_below
    prob_loss = 1 - prob_win

    # Profit per unit staked implied by the American odds.
    payout_ratio = odds / 100 if odds > 0 else 100 / abs(odds)

    ev = (prob_win * payout_ratio * stake) - (prob_loss * stake)
    return ev, prob_win



def calculate_betting_edge(blended_mean, std_dev, sportsbook_line, odds, bet_type='over'):
    """
    Price a bet against the book under a normal outcome model.

    The outcome is modelled as Normal(blended_mean, std_dev). Any bet_type
    other than 'over' (case-insensitive) is priced as an under.

    Returns a tuple (ev, model_prob, vegas_prob, edge):
      ev          expected profit per unit staked
      model_prob  modelled probability that the bet wins
      vegas_prob  probability implied by the American odds
      edge        model_prob - vegas_prob
    """
    # Probability implied by the American odds
    favourite_price = not odds > 0
    if favourite_price:
        vegas_prob = -odds / (-odds + 100)
    else:
        vegas_prob = 100 / (odds + 100)

    # Model probability: mass above the line for an over, below it otherwise
    prob_below_line = norm.cdf(sportsbook_line, loc=blended_mean, scale=std_dev)
    if bet_type.lower() == 'over':
        model_prob = 1 - prob_below_line
    else:
        model_prob = prob_below_line

    # Profit per unit staked on a win
    if odds > 0:
        payout_multiplier = odds / 100
    else:
        payout_multiplier = 100 / abs(odds)

    ev = (model_prob * payout_multiplier) - (1 - model_prob)
    edge = model_prob - vegas_prob

    return ev, model_prob, vegas_prob, edge




#Functions used to calculate the mean without accounting for fluke / blowout games
def weighted_mean(df, stat_col='points', weight_col='numMinutes'):
    """Mean of ``stat_col`` weighted by ``weight_col``.

    Rows with a non-positive weight or a missing stat are ignored. Returns
    NaN when the remaining weights do not sum to a positive number.
    """
    usable_rows = df[weight_col].gt(0) & df[stat_col].notna()
    stats = df.loc[usable_rows, stat_col]
    weights = df.loc[usable_rows, weight_col]

    total_weight = weights.sum()
    if total_weight > 0:
        return (stats * weights).sum() / total_weight
    return np.nan

def ewma_stat(df, stat_col='points', span=5):
    """Latest exponentially weighted moving average of ``stat_col``.

    Rows are ordered by ``gameDate`` and the recursive (``adjust=False``)
    EWMA with the given ``span`` is evaluated at the most recent row.

    Returns:
        The EWMA value at the last row, or NaN when ``df`` has no rows, so
        callers can handle the missing case with ``pd.isna`` instead of
        catching an IndexError.
    """
    if len(df) == 0:
        return np.nan

    ordered = df.sort_values('gameDate')
    return ordered[stat_col].ewm(span=span, adjust=False).mean().iloc[-1]

def blended_player_stat(player_data, stat_col='points', span=5, wm_weight=0.6):
    """Blend the minutes-weighted mean and the EWMA of ``stat_col``.

    The result is ``wm_weight * weighted_mean + (1 - wm_weight) * ewma``.
    If one of the two components is missing the other is returned on its own;
    if both are missing the result is NaN.
    """
    wm = weighted_mean(player_data, stat_col=stat_col, weight_col='numMinutes')
    ewma = ewma_stat(player_data, stat_col=stat_col, span=span)

    wm_missing = pd.isna(wm)
    ewma_missing = pd.isna(ewma)

    if wm_missing:
        # Fall back to the EWMA alone, or NaN when that is missing too.
        return np.nan if ewma_missing else ewma
    if ewma_missing:
        return wm
    return wm_weight * wm + (1 - wm_weight) * ewma
    
    

def _estimate_possessions(team_row):
    """Estimate possessions from one team-stats row: FGA + 0.44*FTA - ORB + TOV."""
    return (
        team_row['fieldGoalsAttempted'] +
        0.44 * team_row['freeThrowsAttempted'] -
        team_row['reboundsOffensive'] +
        team_row['turnovers']
    )


def adjust_prediction_with_matchup(player_data, opponent_team, player_team, blended_mean, home_game_flag, team_stats, target, print_log):
    """Adjust a player's blended stat mean for opponent history, pace and venue.

    Steps, applied in order to the running mean:
      0. Recompute the base mean with ``blended_player_stat(player_data, target)``.
      1. Opponent: add half the gap between the mean of the last 5 rows against
         ``opponent_team`` and the player's overall mean.
      2. Pace: scale by the relative gap between the estimated game pace and a
         fixed league pace of 100, weighted per stat. Any failure in this step
         is recorded in the log and the step is skipped.
      3. Venue: +2% when ``home_game_flag == 1``, otherwise -2%.

    Args:
        player_data: Player game log with ``target``, ``opponentteamName`` and
            the columns needed by ``blended_player_stat``.
        opponent_team: Opponent team name.
        player_team: The player's team name.
        blended_mean: Currently ignored; it is overwritten by the value
            recomputed from ``player_data`` in step 0.
            NOTE(review): confirm whether a caller-supplied mean should be
            honoured instead.
        home_game_flag: 1 for a home game; any other value is treated as away.
        team_stats: Team game log with ``teamName``, ``opponentTeamName``,
            ``numMinutes`` and the box-score columns used for possessions.
        target: Name of the stat column being predicted.
        print_log: When true, print the adjustment breakdown.

    Returns:
        ``(adjusted_mean, adjustment_log)``. When the base mean cannot be
        computed the result is ``(np.nan, adjustment_log)`` so callers can
        always unpack two values.
    """
    adjustment_log = []  # collect adjustment notes

    # --- Step 0: Blended Mean ---
    blended_mean = blended_player_stat(player_data, stat_col=target)
    if pd.isna(blended_mean):
        adjustment_log.append("Initial blended mean: Failed to calculate (NaN)")
        # The failure is always printed, regardless of print_log.
        print("Adjustment Breakdown:")
        for adj in adjustment_log:
            print("  •", adj)
        # Same (value, log) shape as the success path.
        return np.nan, adjustment_log

    # --- Opponent adjustment: last 5 rows vs this opponent ---
    # presumably player_data is in chronological order so tail(5) is the most
    # recent meetings; verify against the loader
    recent_vs_opponent = player_data[player_data['opponentteamName'] == opponent_team].tail(5)

    if not recent_vs_opponent.empty:
        opponent_mean = recent_vs_opponent[target].mean()
        overall_mean = player_data[target].mean()
        opponent_adjustment = (opponent_mean - overall_mean) * 0.5  # weight it moderately
        blended_mean += opponent_adjustment
        adjustment_log.append(f"Opponent adjustment: {opponent_adjustment:+.2f} {target}")
    else:
        adjustment_log.append("Opponent adjustment: None (no recent data)")

    # --- Pace adjustment ---
    # Share of the relative pace difference applied to each stat.
    stat_pace_weights = {
        'points': 0.5,
        'reboundsTotal': 0.3,
        'assists': 0.2,
        'steals': 0.1,
        'blocks': 0.1,
    }

    # Best-effort step: missing teams or columns are logged, not raised.
    try:
        team_row = team_stats[team_stats['teamName'] == player_team].iloc[-1]
        # NOTE(review): this picks the latest row whose *opponent* column is
        # opponent_team, i.e. stats recorded for whoever faced them, not a row
        # keyed on teamName == opponent_team. Confirm this is intended.
        opponent_row = team_stats[team_stats['opponentTeamName'] == opponent_team].iloc[-1]

        team_total_possessions = _estimate_possessions(team_row)
        opponent_total_possessions = _estimate_possessions(opponent_row)

        team_total_minutes = team_row['numMinutes']
        opponent_total_minutes = opponent_row['numMinutes']

        inputs_present = all(
            pd.notna(value)
            for value in (
                team_total_possessions,
                opponent_total_possessions,
                team_total_minutes,
                opponent_total_minutes,
            )
        )

        if inputs_present and team_total_minutes > 0 and opponent_total_minutes > 0:
            # Scale possessions by 240 / numMinutes for each side.
            team_pace = (240 / team_total_minutes) * (team_total_possessions)
            opponent_pace = (240 / opponent_total_minutes) * (opponent_total_possessions)

            # Average the two paces
            game_pace = (team_pace + opponent_pace) / 2
            league_average_pace = 100  # fixed reference pace

            # Relative pace gap, damped by the per-stat weight.
            pace_diff_percent = (game_pace - league_average_pace) / league_average_pace
            pace_weight = stat_pace_weights.get(target, 0.3)
            pace_adjustment = pace_diff_percent * blended_mean * pace_weight
            blended_mean += pace_adjustment

            adjustment_log.append(f"Pace adjustment: {pace_adjustment:+.2f} {target} (Game Pace {game_pace:.1f}, team_pace = {team_pace}, opponent_pace = {opponent_pace}, League Pace {league_average_pace:.1f})")
        else:
            adjustment_log.append(f"Pace adjustment: Skipped because poss = {team_total_possessions}, team_min = {team_total_minutes}, opp_minutes = {opponent_total_minutes}")
    except Exception as e:
        adjustment_log.append(f"Pace adjustment: Failed ({e})")

    # --- Home/Away adjustment ---
    if home_game_flag == 1:
        home_adjustment = 0.02 * blended_mean  # boost 2% for home
        blended_mean += home_adjustment
        adjustment_log.append(f"Home game adjustment: {home_adjustment:+.2f} {target}")
    else:
        away_adjustment = -0.02 * blended_mean  # penalty 2% for away
        blended_mean += away_adjustment
        adjustment_log.append(f"Away game adjustment: {away_adjustment:+.2f} {target}")

    # --- Print all adjustment steps ---
    if print_log:
        print("\nAdjustment Breakdown:")
        for adj in adjustment_log:
            print("  •", adj)

    return blended_mean, adjustment_log


# monte carlo based prediction 
def monte_carlo_adjustment(player_data, opponent_team, player_team, home_game_flag, team_stats, target, sportsbook_line, bet_type='over', n=1000, noise_std=0.05, odds=-110):
    """
    Runs a Monte Carlo simulation by injecting noise into the player's stat column and re-applying the matchup adjustment.

    Each draw adds Normal(0, noise_std * base blended mean) noise to every row
    of ``target`` (clipped at 0), recomputes the adjusted prediction, and the
    fraction of draws beyond the line is reported as the probability.
    NOTE(review): the simulated quantity is the adjusted *prediction*, not a
    single-game outcome; confirm that is the intended probability.

    Args:
        player_data, opponent_team, player_team, home_game_flag, team_stats,
            target: Forwarded to ``adjust_prediction_with_matchup``.
        sportsbook_line: The over/under line.
        bet_type: 'over' (case-insensitive) or anything else for under.
        n: Number of draws; at least one draw is always run.
        noise_std: Noise level as a fraction of the base blended mean.
        odds: American odds used for the expected value. Defaults to -110;
            pass the actual price of the bet.

    Returns:
        ``(prob, ev)``: the empirical probability of winning the bet and the
        expected profit in dollars on a $100 stake, or ``(None, None)`` when
        the simulation cannot be run.
    """
    # Precompute the base blended mean
    base_blended_mean = blended_player_stat(player_data, stat_col=target)

    if pd.isna(base_blended_mean):
        print("Cannot run Monte Carlo: blended mean is NaN")
        return None, None

    noise_scale = noise_std * base_blended_mean
    simulated_predictions = []

    # The first draw prints its adjustment log; the rest run silently.
    for draw in range(max(n, 1)):
        noisy_data = player_data.copy()
        noise = np.random.normal(loc=0, scale=noise_scale, size=len(noisy_data))
        noisy_data[target] = (noisy_data[target] + noise).clip(lower=0)

        adjusted_prediction, _ = adjust_prediction_with_matchup(
            noisy_data, opponent_team, player_team, base_blended_mean,
            home_game_flag, team_stats, target, print_log=(draw == 0)
        )

        if not pd.isna(adjusted_prediction):
            simulated_predictions.append(adjusted_prediction)

    if not simulated_predictions:
        print("Cannot run Monte Carlo: no valid simulated predictions")
        return None, None

    # Calculate Monte Carlo-based probability
    simulated_predictions = np.array(simulated_predictions)

    if bet_type.lower() == 'over':
        prob = np.mean(simulated_predictions > sportsbook_line)
    else:
        prob = np.mean(simulated_predictions < sportsbook_line)

    # Expected value on a $100 bet, priced from the odds (not the line).
    payout_ratio = odds / 100 if odds > 0 else 100 / abs(odds)
    ev = (prob * payout_ratio * 100) - ((1 - prob) * 100)

    print(f"Monte Carlo {bet_type.upper()} Prob: {prob:.3f} | Expected Value: ${ev:.2f}")

    return prob, ev



def main():
    """Train the models for one player, price one prop bet and print a report.

    Player, teams, target stat, file paths and the sportsbook line/odds are
    set inline below.
    """
    # --- File paths and player/team setup ---
    ppg_file = '/Users/davislaroque/Desktop/NBA Project/game-by-game-stats/filtered_player_data.csv'
    team_stats_file = '/Users/davislaroque/Desktop/NBA Project/game-by-game-stats/filtered_team_stats.csv'
    residual_file_path = '/Users/davislaroque/Downloads/NBA Project Results Summary.xls'

    target = 'reboundsTotal'
    player_name = "Anthony Edwards"
    player_team = "Timberwolves"
    opponent_team = "Thunder"
    home_game_flag = 0  # 1 for Home, 0 for Away

    # --- Load and prepare data ---
    player_data, features = load_and_preprocess_player_data(ppg_file, player_name)
    team_stats = pd.read_csv(team_stats_file)
    # Keep only team games on or after 2024-10-22.
    team_stats = team_stats[pd.to_datetime(team_stats['gameDate']) >= pd.to_datetime("2024-10-22")]

    if player_data.empty:
        print(f"No data found for player: {player_name}")
        return

    # Standard deviation of previously recorded residuals
    df = read_xls(residual_file_path)
    residuals = df.Residuals
    residuals_std = residuals.std()

    # Drop team/categorical columns from model training
    features = [f for f in features if f not in ['playerteamName', 'opponentteamName']]

    # Remove target column from features to prevent leakage
    if target == 'reboundsTotal':
        features = [f for f in features if f not in ['reboundsTotal', 'mp_trb']]
    elif target == 'points':
        features = [f for f in features if f not in ['points', 'mp_pts']]

    # --- Train models ---
    lr_model, rf_model, scaler, X_train_scaled, X_test_scaled = train_models_with_comparison(
        player_data, features, target
    )

    # --- Prepare latest sample for prediction ---
    player_row = player_data.iloc[[-1]]  # last row of the player's log
    numMinutes = player_row["numMinutes"].values[0]
    fieldGoalsAttempted = player_row["fieldGoalsAttempted"].values[0]
    sample_features = player_row[features].values

    alpha = 0.05  # Significance level for the prediction interval

    # --- Linear Regression prediction with interval ---
    lr_pred, lower, upper = predict_with_prediction_interval(
        lr_model, scaler, X_train_scaled, sample_features, alpha=alpha
    )

    # --- Random Forest prediction ---
    # Scale the sample once and reuse it for every tree; the spread of the
    # per-tree predictions serves as the forest's uncertainty estimate.
    sample_df = pd.DataFrame(sample_features, columns=features)
    sample_scaled = scaler.transform(sample_df)
    tree_preds = [tree.predict(sample_scaled)[0] for tree in rf_model.estimators_]
    rf_pred = np.mean(tree_preds)
    rf_std = np.std(tree_preds)

    # --- Blended Mean (weighted average of LR and RF) ---
    blended_mean = (0.6 * rf_pred) + (0.4 * lr_pred)

    # --- Estimate Standard Deviation (blended) ---
    historical_std = player_data[target].tail(20).std()
    blended_std = (0.6 * historical_std) + (0.4 * rf_std)
    blended_std = max(blended_std, 0.1 * rf_pred)  # floor at 10% of the RF prediction

    # --- Adjust blended_mean with matchup, pace, home/away ---
    adjusted_blended_mean, _ = adjust_prediction_with_matchup(
        player_data=player_data,
        opponent_team=opponent_team,
        player_team=player_team,
        blended_mean=blended_mean,
        home_game_flag=home_game_flag,
        team_stats=team_stats,
        target=target,
        print_log=False
    )

    # --- Sportsbook Info (manual input) ---
    sportsbook_line = 8.5
    odds = +100
    bet_type = 'over'

    # --- Monte Carlo estimate (prints its own summary line) ---
    # Kept under separate names so it is not overwritten by the normal-model
    # figures computed below.
    # NOTE(review): noise_std is applied as a fraction of the blended mean,
    # while residuals_std looks like an absolute spread; confirm the units.
    mc_prob, mc_ev = monte_carlo_adjustment(
        player_data=player_data,
        opponent_team=opponent_team,
        player_team=player_team,
        home_game_flag=home_game_flag,
        team_stats=team_stats,
        target=target,
        sportsbook_line=sportsbook_line,
        bet_type=bet_type,
        n=1000,
        noise_std=residuals_std,
        odds=odds
    )

    # --- Calculate EV, model prob, vegas prob, edge ---
    ev, model_prob, vegas_prob, edge = calculate_betting_edge(
        adjusted_blended_mean, blended_std, sportsbook_line, odds, bet_type
    )

    # --- Output ---
    print(f"\n--- {player_name} ---")
    print(f"Averaging {numMinutes:.1f} minutes with {fieldGoalsAttempted:.1f} FGA/game\n")
    print(f"Model predicted mean: {adjusted_blended_mean:.2f} {target} (adjusted)")
    print(f"Model standard deviation: {blended_std:.2f}\n")
    print(f"{(1 - alpha) * 100:.0f}% Prediction Interval for Linear Regression: ({lower:.2f}, {upper:.2f})\n")

    print(f"Vegas {bet_type} line: {sportsbook_line} at odds {odds}")
    print(f"Vegas implied probability: {vegas_prob * 100:.2f}%")
    print(f"Model probability of winning bet: {model_prob * 100:.2f}%")
    print(f"Edge over Vegas: {edge * 100:.2f}%")
    print(f"Expected Value (EV): ${ev:.2f}\n")


# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()