In [None]:
# 3P

In [3]:
import os
import pandas as pd
import numpy as np
import joblib
import requests
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sqlalchemy import create_engine
import logging
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
import time

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

input_csv = "nba_historical_stats.csv"
try:
    df = pd.read_csv(input_csv)
    logging.info(f"Dataset loaded: {input_csv}")
except FileNotFoundError:
    logging.error(f"Dataset not found: {input_csv}")
    raise SystemExit


# DB_CONNECTION = "mysql+mysqlconnector://root:Adex127!Apple@localhost:3306/nba_analysis" # Change creds
# TABLE_NAME = "historical_data_table"

# engine = create_engine(DB_CONNECTION)
# try:
#     with engine.connect() as connection:
#         df = pd.read_sql(f"SELECT * FROM {TABLE_NAME} WHERE Player != 'Team Totals'", con=connection)
#     logging.info(f"Dataset loaded: {TABLE_NAME}")
# except Exception as e:
#     logging.error(f"Dataset not loaded: {e}")
#     raise SystemExit

df_team_totals = df[df["Player"] == "Team Totals"]
df_players = df[df["Player"] != "Team Totals"]
df = df_players.copy()
logging.info("Team totals separated from player data.")

remove_columns = ["Age","Rk","Player","TEAM"]
df.drop(columns=remove_columns, inplace=True)
logging.info(f"Columns removed: {remove_columns}")

numeric_features = df.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy="median")
df[numeric_features] = imputer.fit_transform(df[numeric_features])
logging.info(f"Missing values handled using median strategy.")

#df.drop_duplicates(inplace=True)
#df.loc[:, df.select_dtypes(include=[np.number]).columns] = df.select_dtypes(include=[np.number]).fillna(df.mean())
#df.fillna(df.mean(), inplace=True)

target_column = "3P"
if target_column not in df.columns:
    logging.error(f"Column {target_column} not found in dataset")
    raise ValueError("Target column not found")

x = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
logging.info(f"X_train shape: {X_train.shape}")

# Feature Selection
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Forest Regression
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
selector = RFECV(estimator=rf_model, step=5, cv=5, scoring="r2", n_jobs=-1, min_features_to_select=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature names
selected_features = X_train.columns[selector.support_]
logging.info(f"Selected features: {selected_features}")

X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)

print("\n Selected Important Features:")
print(selected_features)

start_time = time.time()
rf_model.fit(X_train_selected, y_train)
rf_time = time.time() - start_time

rf_y_pred = rf_model.predict(X_test_selected)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

# Uncomment to compare forest regression and gradient boosting (Forest regression was best)
# Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

start_time = time.time()
gb_model.fit(X_train_selected, y_train)
gb_time = time.time() - start_time

gb_y_pred = gb_model.predict(X_test_selected)
gb_mae = mean_absolute_error(y_test, gb_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)

#print(f"Training is complete")
#print(f"MAE: {mae}")
#print(f"R2: {r2}")

print(f"Random Forest - MAE: {rf_mae:.4f}, R²: {rf_r2:.4f}, Training Time: {rf_time:.2f} sec")
print(f"Gradient Boosting - MAE: {gb_mae:.4f}, R²: {gb_r2:.4f}, Training Time: {gb_time:.2f} sec")

# Save the Best Model
if rf_r2 > gb_r2:
    joblib.dump(rf_model, "3P_best_model.pkl")
    logging.info("Best Model (Random Forest) saved as '3P_best_model.pkl'")
else:
    joblib.dump(gb_model, "3P_best_model.pkl")
    logging.info("Best Model (Gradient Boosting) saved as '3P_best_model.pkl'")

# Uncomment these after testing Forest and boosting
#joblib.dump(scaler, "scaler.pkl")
#joblib.dump(model, "model.pkl")

#logging.info(f"Model saved: {model}")
print(f"Training is complete.")

2025-03-26 19:01:15,296 - INFO - Dataset loaded: nba_historical_stats.csv
2025-03-26 19:01:15,303 - INFO - Team totals separated from player data.
2025-03-26 19:01:15,308 - INFO - Columns removed: ['Age', 'Rk', 'Player', 'TEAM']
2025-03-26 19:01:15,364 - INFO - Missing values handled using median strategy.
2025-03-26 19:01:15,376 - INFO - X_train shape: (16531, 20)
2025-03-26 19:03:57,451 - INFO - Selected features: Index(['GS', 'FG%', '3PA', '3P%', 'AST'], dtype='object')



 Selected Important Features:
Index(['GS', 'FG%', '3PA', '3P%', 'AST'], dtype='object')


2025-03-26 19:04:10,166 - INFO - Best Model (Gradient Boosting) saved as '3P_best_model.pkl'


Random Forest - MAE: 0.0111, R²: 0.9983, Training Time: 6.72 sec
Gradient Boosting - MAE: 0.0161, R²: 0.9983, Training Time: 5.84 sec
Training is complete.


In [4]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import logging

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

def predict_3p_for_new_season(new_data_path, historical_data_path, player_names_included=True):
    """
    Predict 3P for NBA players in the 2025-26 season without relying on saved scaler.
    
    Parameters:
    new_data_path (str): Path to the CSV file containing new player stats
    historical_data_path (str): Path to the CSV file containing historical stats used for training
    player_names_included (bool): Whether the input CSV contains player names
    
    Returns:
    DataFrame: DataFrame with player names and predicted 3P
    """
    # load the model that we got from the training program 
    logging.info("Loading trained model")
    best_model = joblib.load("3P_best_model.pkl")
    
    # get the selected features by looking at the model's feature names
    try:
        selected_features = best_model.feature_names_in_
        logging.info(f"Retrieved {len(selected_features)} features from model")
    except AttributeError:
        # ff feature names aren't stored in the model, we need to reconstruct the feature selection process
        logging.warning("Model doesn't contain feature names, using hardcoded selected features")
        # replace with the actual features printed during training
        selected_features = [
            'GS', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'ORB', 'AST', 'PF', 'YR'
        ]
    
    # load historical data to recreate the scaler
    logging.info(f"Loading historical data from {historical_data_path}")
    try:
        historical_df = pd.read_csv(historical_data_path)
    except FileNotFoundError:
        logging.error(f"Historical dataset not found: {historical_data_path}")
        raise SystemExit
    
    # store target column separately
    target_column = "3P"
    
    # filter out team totals as we did in the training_program
    historical_df = historical_df[historical_df["Player"] != "Team Totals"].copy()
    
    # remove columns that we do not need for prediction
    remove_columns = ["Age", "Rk", "Player", "TEAM"]
    historical_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # drop the target column (3P) before preprocessing
    x_historical = historical_df.drop(columns=[target_column])
    
    # recreate the imputer and scaler on historical data
    logging.info("Recreating preprocessing steps from training")
    numeric_features = x_historical.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="median")
    x_historical[numeric_features] = imputer.fit_transform(x_historical[numeric_features])
    
    # recreate the scaler
    scaler = StandardScaler()
    scaler.fit(x_historical)
    logging.info("Scaler recreated with historical data")
    
    # load and prepare new player data
    logging.info(f"Loading new player data from {new_data_path}")
    try:
        new_players_df = pd.read_csv(new_data_path)
    except FileNotFoundError:
        logging.error(f"File not found: {new_data_path}")
        raise SystemExit
    
    # store player names if included in the input
    if player_names_included:
        player_names = new_players_df["Player"].copy()
        team_names = new_players_df["TEAM"].copy() if "TEAM" in new_players_df.columns else None
    
    # remove columns not used in training
    new_players_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # remove the target column from new data if it exists
    if target_column in new_players_df.columns:
        logging.info(f"Removing {target_column} column from new data as it's the prediction target")
        new_players_df.drop(columns=[target_column], inplace=True)
    
    # ensure the new data has exactly the same columns as the training data
    missing_cols = set(x_historical.columns) - set(new_players_df.columns)
    if missing_cols:
        logging.warning(f"Missing columns in new data: {missing_cols}")
        for col in missing_cols:
            new_players_df[col] = 0  # Fill with zeros as placeholder
    
    # ensure columns are in the same order
    new_players_df = new_players_df[x_historical.columns]
    
    # handle missing values
    numeric_features = new_players_df.select_dtypes(include=np.number).columns
    new_players_df[numeric_features] = imputer.transform(new_players_df[numeric_features])
    
    # transform and select features
    logging.info("Scaling features and selecting relevant ones")
    new_players_scaled = pd.DataFrame(
        scaler.transform(new_players_df), 
        columns=new_players_df.columns
    )
    
    # ensure only selected features are used
    if set(selected_features).issubset(set(new_players_scaled.columns)):
        new_players_selected = new_players_scaled[selected_features]
    else:
        logging.error(f"Selected features {selected_features} not found in scaled data columns: {new_players_scaled.columns}")
        raise ValueError("Feature mismatch between model and input data")
    
    # make predictions
    logging.info("Making predictions")
    predicted_3p = best_model.predict(new_players_selected)
    
    # create results dataframe
    if player_names_included:
        results = {
            "Player": player_names,
            "Predicted_3P": predicted_3p
        }
        if team_names is not None:
            results["TEAM"] = team_names
        
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.DataFrame({"Predicted_3P": predicted_3p})
    
    logging.info("Prediction complete")
    return results_df

if __name__ == "__main__":
    historical_data_path = "nba_historical_stats.csv"  # same file used for training
    new_data_path = "nba_player_stats_nba_api_2024-25.csv"  # data for predictions
    
    predictions = predict_3p_for_new_season(new_data_path, historical_data_path)
    
    # save predictions
    predictions.to_csv("predicted_3p_2025-26.csv", index=False)
    
    print("\nTop 10 players by predicted 3P:")
    print(predictions.sort_values("Predicted_3P", ascending=False).head(10))

2025-03-26 19:06:38,125 - INFO - Loading trained model
2025-03-26 19:06:38,164 - INFO - Retrieved 5 features from model
2025-03-26 19:06:38,164 - INFO - Loading historical data from nba_historical_stats.csv
2025-03-26 19:06:38,268 - INFO - Recreating preprocessing steps from training
2025-03-26 19:06:38,344 - INFO - Scaler recreated with historical data
2025-03-26 19:06:38,345 - INFO - Loading new player data from nba_player_stats_nba_api_2024-25.csv
2025-03-26 19:06:38,352 - INFO - Removing 3P column from new data as it's the prediction target
2025-03-26 19:06:38,361 - INFO - Scaling features and selecting relevant ones
2025-03-26 19:06:38,365 - INFO - Making predictions
2025-03-26 19:06:38,373 - INFO - Prediction complete



Top 10 players by predicted 3P:
              Player  Predicted_3P TEAM
489    Stephen Curry      4.386015  GSW
57    Brandon Miller      4.209706  CHA
29   Anthony Edwards      4.031296  MIN
347      LaMelo Ball      4.012969  CHA
370    Malik Beasley      3.799367  DET
258     Jayson Tatum      3.773868  BOS
138    Derrick White      3.483326  BOS
528      Tyler Herro      3.401465  MIA
360      Luka Dončić      3.358819  LAL
111   Damian Lillard      3.346964  MIL


In [None]:
# 3PA

In [5]:
import os
import pandas as pd
import numpy as np
import joblib
import requests
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sqlalchemy import create_engine
import logging
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
import time

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

input_csv = "nba_historical_stats.csv"
try:
    df = pd.read_csv(input_csv)
    logging.info(f"Dataset loaded: {input_csv}")
except FileNotFoundError:
    logging.error(f"Dataset not found: {input_csv}")
    raise SystemExit


# DB_CONNECTION = "mysql+mysqlconnector://root:Adex127!Apple@localhost:3306/nba_analysis" # Change creds
# TABLE_NAME = "historical_data_table"

# engine = create_engine(DB_CONNECTION)
# try:
#     with engine.connect() as connection:
#         df = pd.read_sql(f"SELECT * FROM {TABLE_NAME} WHERE Player != 'Team Totals'", con=connection)
#     logging.info(f"Dataset loaded: {TABLE_NAME}")
# except Exception as e:
#     logging.error(f"Dataset not loaded: {e}")
#     raise SystemExit

df_team_totals = df[df["Player"] == "Team Totals"]
df_players = df[df["Player"] != "Team Totals"]
df = df_players.copy()
logging.info("Team totals separated from player data.")

remove_columns = ["Age","Rk","Player","TEAM"]
df.drop(columns=remove_columns, inplace=True)
logging.info(f"Columns removed: {remove_columns}")

numeric_features = df.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy="median")
df[numeric_features] = imputer.fit_transform(df[numeric_features])
logging.info(f"Missing values handled using median strategy.")

#df.drop_duplicates(inplace=True)
#df.loc[:, df.select_dtypes(include=[np.number]).columns] = df.select_dtypes(include=[np.number]).fillna(df.mean())
#df.fillna(df.mean(), inplace=True)

target_column = "3PA"
if target_column not in df.columns:
    logging.error(f"Column {target_column} not found in dataset")
    raise ValueError("Target column not found")

x = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
logging.info(f"X_train shape: {X_train.shape}")

# Feature Selection
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Forest Regression
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
selector = RFECV(estimator=rf_model, step=5, cv=5, scoring="r2", n_jobs=-1, min_features_to_select=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature names
selected_features = X_train.columns[selector.support_]
logging.info(f"Selected features: {selected_features}")

X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)

print("\n Selected Important Features:")
print(selected_features)

start_time = time.time()
rf_model.fit(X_train_selected, y_train)
rf_time = time.time() - start_time

rf_y_pred = rf_model.predict(X_test_selected)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

# Uncomment to compare forest regression and gradient boosting (Forest regression was best)
# Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

start_time = time.time()
gb_model.fit(X_train_selected, y_train)
gb_time = time.time() - start_time

gb_y_pred = gb_model.predict(X_test_selected)
gb_mae = mean_absolute_error(y_test, gb_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)

#print(f"Training is complete")
#print(f"MAE: {mae}")
#print(f"R2: {r2}")

print(f"Random Forest - MAE: {rf_mae:.4f}, R²: {rf_r2:.4f}, Training Time: {rf_time:.2f} sec")
print(f"Gradient Boosting - MAE: {gb_mae:.4f}, R²: {gb_r2:.4f}, Training Time: {gb_time:.2f} sec")

# Save the Best Model
if rf_r2 > gb_r2:
    joblib.dump(rf_model, "3PA_best_model.pkl")
    logging.info("Best Model (Random Forest) saved as '3PA_best_model.pkl'")
else:
    joblib.dump(gb_model, "3PA_best_model.pkl")
    logging.info("Best Model (Gradient Boosting) saved as '3PA_best_model.pkl'")

# Uncomment these after testing Forest and boosting
#joblib.dump(scaler, "scaler.pkl")
#joblib.dump(model, "model.pkl")

#logging.info(f"Model saved: {model}")
print(f"Training is complete.")

2025-03-26 19:06:45,423 - INFO - Dataset loaded: nba_historical_stats.csv
2025-03-26 19:06:45,433 - INFO - Team totals separated from player data.
2025-03-26 19:06:45,437 - INFO - Columns removed: ['Age', 'Rk', 'Player', 'TEAM']
2025-03-26 19:06:45,521 - INFO - Missing values handled using median strategy.
2025-03-26 19:06:45,536 - INFO - X_train shape: (16531, 20)
2025-03-26 19:09:38,866 - INFO - Selected features: Index(['GS', 'FG', 'FGA', 'FG%', '3P', '3P%', 'ORB', 'AST', 'PF', 'YR'], dtype='object')



 Selected Important Features:
Index(['GS', 'FG', 'FGA', 'FG%', '3P', '3P%', 'ORB', 'AST', 'PF', 'YR'], dtype='object')


2025-03-26 19:10:01,770 - INFO - Best Model (Gradient Boosting) saved as '3PA_best_model.pkl'


Random Forest - MAE: 0.0734, R²: 0.9958, Training Time: 13.38 sec
Gradient Boosting - MAE: 0.0744, R²: 0.9958, Training Time: 9.34 sec
Training is complete.


In [6]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import logging

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

def predict_3pa_for_new_season(new_data_path, historical_data_path, player_names_included=True):
    """
    Predict 3PA for NBA players in the 2025-26 season without relying on saved scaler.
    
    Parameters:
    new_data_path (str): Path to the CSV file containing new player stats
    historical_data_path (str): Path to the CSV file containing historical stats used for training
    player_names_included (bool): Whether the input CSV contains player names
    
    Returns:
    DataFrame: DataFrame with player names and predicted 3PA
    """
    # load the model that we got from the training program 
    logging.info("Loading trained model")
    best_model = joblib.load("3PA_best_model.pkl")
    
    # get the selected features by looking at the model's feature names
    try:
        selected_features = best_model.feature_names_in_
        logging.info(f"Retrieved {len(selected_features)} features from model")
    except AttributeError:
        # ff feature names aren't stored in the model, we need to reconstruct the feature selection process
        logging.warning("Model doesn't contain feature names, using hardcoded selected features")
        # replace with the actual features printed during training
        selected_features = [
            'GS', 'FG', 'FGA', 'FG%', '3P', '3P%', 'ORB', 'AST', 'PF', 'YR'
        ]
    
    # load historical data to recreate the scaler
    logging.info(f"Loading historical data from {historical_data_path}")
    try:
        historical_df = pd.read_csv(historical_data_path)
    except FileNotFoundError:
        logging.error(f"Historical dataset not found: {historical_data_path}")
        raise SystemExit
    
    # store target column separately
    target_column = "3PA"
    
    # filter out team totals as we did in the training_program
    historical_df = historical_df[historical_df["Player"] != "Team Totals"].copy()
    
    # remove columns that we do not need for prediction
    remove_columns = ["Age", "Rk", "Player", "TEAM"]
    historical_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # drop the target column (3PA) before preprocessing
    x_historical = historical_df.drop(columns=[target_column])
    
    # recreate the imputer and scaler on historical data
    logging.info("Recreating preprocessing steps from training")
    numeric_features = x_historical.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="median")
    x_historical[numeric_features] = imputer.fit_transform(x_historical[numeric_features])
    
    # recreate the scaler
    scaler = StandardScaler()
    scaler.fit(x_historical)
    logging.info("Scaler recreated with historical data")
    
    # load and prepare new player data
    logging.info(f"Loading new player data from {new_data_path}")
    try:
        new_players_df = pd.read_csv(new_data_path)
    except FileNotFoundError:
        logging.error(f"File not found: {new_data_path}")
        raise SystemExit
    
    # store player names if included in the input
    if player_names_included:
        player_names = new_players_df["Player"].copy()
        team_names = new_players_df["TEAM"].copy() if "TEAM" in new_players_df.columns else None
    
    # remove columns not used in training
    new_players_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # remove the target column from new data if it exists
    if target_column in new_players_df.columns:
        logging.info(f"Removing {target_column} column from new data as it's the prediction target")
        new_players_df.drop(columns=[target_column], inplace=True)
    
    # ensure the new data has exactly the same columns as the training data
    missing_cols = set(x_historical.columns) - set(new_players_df.columns)
    if missing_cols:
        logging.warning(f"Missing columns in new data: {missing_cols}")
        for col in missing_cols:
            new_players_df[col] = 0  # Fill with zeros as placeholder
    
    # ensure columns are in the same order
    new_players_df = new_players_df[x_historical.columns]
    
    # handle missing values
    numeric_features = new_players_df.select_dtypes(include=np.number).columns
    new_players_df[numeric_features] = imputer.transform(new_players_df[numeric_features])
    
    # transform and select features
    logging.info("Scaling features and selecting relevant ones")
    new_players_scaled = pd.DataFrame(
        scaler.transform(new_players_df), 
        columns=new_players_df.columns
    )
    
    # ensure only selected features are used
    if set(selected_features).issubset(set(new_players_scaled.columns)):
        new_players_selected = new_players_scaled[selected_features]
    else:
        logging.error(f"Selected features {selected_features} not found in scaled data columns: {new_players_scaled.columns}")
        raise ValueError("Feature mismatch between model and input data")
    
    # make predictions
    logging.info("Making predictions")
    predicted_3pa = best_model.predict(new_players_selected)
    
    # create results dataframe
    if player_names_included:
        results = {
            "Player": player_names,
            "Predicted_3PA": predicted_3pa
        }
        if team_names is not None:
            results["TEAM"] = team_names
        
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.DataFrame({"Predicted_3PA": predicted_3pa})
    
    logging.info("Prediction complete")
    return results_df

if __name__ == "__main__":
    historical_data_path = "nba_historical_stats.csv"  # same file used for training
    new_data_path = "nba_player_stats_nba_api_2024-25.csv"  # data for predictions
    
    predictions = predict_3pa_for_new_season(new_data_path, historical_data_path)
    
    # save predictions
    predictions.to_csv("predicted_3pa_2025-26.csv", index=False)
    
    print("\nTop 10 players by predicted 3PA:")
    print(predictions.sort_values("Predicted_3PA", ascending=False).head(10))

2025-03-26 19:12:06,389 - INFO - Loading trained model
2025-03-26 19:12:06,414 - INFO - Retrieved 10 features from model
2025-03-26 19:12:06,415 - INFO - Loading historical data from nba_historical_stats.csv
2025-03-26 19:12:06,500 - INFO - Recreating preprocessing steps from training
2025-03-26 19:12:06,579 - INFO - Scaler recreated with historical data
2025-03-26 19:12:06,579 - INFO - Loading new player data from nba_player_stats_nba_api_2024-25.csv
2025-03-26 19:12:06,586 - INFO - Removing 3PA column from new data as it's the prediction target
2025-03-26 19:12:06,594 - INFO - Scaling features and selecting relevant ones
2025-03-26 19:12:06,598 - INFO - Making predictions
2025-03-26 19:12:06,606 - INFO - Prediction complete



Top 10 players by predicted 3PA:
               Player  Predicted_3PA TEAM
347       LaMelo Ball      11.035712  CHA
489     Stephen Curry      10.933861  GSW
29    Anthony Edwards      10.275397  MIN
57     Brandon Miller      10.178646  CHA
258      Jayson Tatum       9.707881  BOS
360       Luka Dončić       9.579534  LAL
528       Tyler Herro       9.239410  MIA
370     Malik Beasley       9.206399  DET
287      Jordan Poole       9.130966  WAS
148  Donovan Mitchell       9.115890  CLE


In [None]:
# 3P%

In [7]:
import os
import pandas as pd
import numpy as np
import joblib
import requests
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sqlalchemy import create_engine
import logging
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
import time

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

input_csv = "nba_historical_stats.csv"
try:
    df = pd.read_csv(input_csv)
    logging.info(f"Dataset loaded: {input_csv}")
except FileNotFoundError:
    logging.error(f"Dataset not found: {input_csv}")
    raise SystemExit


# DB_CONNECTION = "mysql+mysqlconnector://root:Adex127!Apple@localhost:3306/nba_analysis" # Change creds
# TABLE_NAME = "historical_data_table"

# engine = create_engine(DB_CONNECTION)
# try:
#     with engine.connect() as connection:
#         df = pd.read_sql(f"SELECT * FROM {TABLE_NAME} WHERE Player != 'Team Totals'", con=connection)
#     logging.info(f"Dataset loaded: {TABLE_NAME}")
# except Exception as e:
#     logging.error(f"Dataset not loaded: {e}")
#     raise SystemExit

df_team_totals = df[df["Player"] == "Team Totals"]
df_players = df[df["Player"] != "Team Totals"]
df = df_players.copy()
logging.info("Team totals separated from player data.")

remove_columns = ["Age","Rk","Player","TEAM"]
df.drop(columns=remove_columns, inplace=True)
logging.info(f"Columns removed: {remove_columns}")

numeric_features = df.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy="median")
df[numeric_features] = imputer.fit_transform(df[numeric_features])
logging.info(f"Missing values handled using median strategy.")

#df.drop_duplicates(inplace=True)
#df.loc[:, df.select_dtypes(include=[np.number]).columns] = df.select_dtypes(include=[np.number]).fillna(df.mean())
#df.fillna(df.mean(), inplace=True)

target_column = "3P%"
if target_column not in df.columns:
    logging.error(f"Column {target_column} not found in dataset")
    raise ValueError("Target column not found")

x = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
logging.info(f"X_train shape: {X_train.shape}")

# Feature Selection
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Forest Regression
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
selector = RFECV(estimator=rf_model, step=5, cv=5, scoring="r2", n_jobs=-1, min_features_to_select=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature names
selected_features = X_train.columns[selector.support_]
logging.info(f"Selected features: {selected_features}")

X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)

print("\n Selected Important Features:")
print(selected_features)

start_time = time.time()
rf_model.fit(X_train_selected, y_train)
rf_time = time.time() - start_time

rf_y_pred = rf_model.predict(X_test_selected)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

# Uncomment to compare forest regression and gradient boosting (Forest regression was best)
# Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

start_time = time.time()
gb_model.fit(X_train_selected, y_train)
gb_time = time.time() - start_time

gb_y_pred = gb_model.predict(X_test_selected)
gb_mae = mean_absolute_error(y_test, gb_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)

#print(f"Training is complete")
#print(f"MAE: {mae}")
#print(f"R2: {r2}")

print(f"Random Forest - MAE: {rf_mae:.4f}, R²: {rf_r2:.4f}, Training Time: {rf_time:.2f} sec")
print(f"Gradient Boosting - MAE: {gb_mae:.4f}, R²: {gb_r2:.4f}, Training Time: {gb_time:.2f} sec")

# Save the Best Model
if rf_r2 > gb_r2:
    joblib.dump(rf_model, "3PP_best_model.pkl")
    logging.info("Best Model (Random Forest) saved as '3PP_best_model.pkl'")
else:
    joblib.dump(gb_model, "3PP_best_model.pkl")
    logging.info("Best Model (Gradient Boosting) saved as '3PP_best_model.pkl'")

# Uncomment these after testing Forest and boosting
#joblib.dump(scaler, "scaler.pkl")
#joblib.dump(model, "model.pkl")

#logging.info(f"Model saved: {model}")
print(f"Training is complete.")

2025-03-26 19:12:12,009 - INFO - Dataset loaded: nba_historical_stats.csv
2025-03-26 19:12:12,019 - INFO - Team totals separated from player data.
2025-03-26 19:12:12,022 - INFO - Columns removed: ['Age', 'Rk', 'Player', 'TEAM']
2025-03-26 19:12:12,087 - INFO - Missing values handled using median strategy.
2025-03-26 19:12:12,104 - INFO - X_train shape: (16531, 20)
2025-03-26 19:14:38,994 - INFO - Selected features: Index(['GS', 'MP', 'FGA', 'FG%', '3P', '3PA', 'FTA', 'FT%', 'DRB', 'TRB',
       'AST', 'STL', 'TOV', 'PF', 'YR'],
      dtype='object')



 Selected Important Features:
Index(['GS', 'MP', 'FGA', 'FG%', '3P', '3PA', 'FTA', 'FT%', 'DRB', 'TRB',
       'AST', 'STL', 'TOV', 'PF', 'YR'],
      dtype='object')


2025-03-26 19:15:10,556 - INFO - Best Model (Random Forest) saved as '3PP_best_model.pkl'


Random Forest - MAE: 0.0515, R²: 0.6387, Training Time: 17.94 sec
Gradient Boosting - MAE: 0.0528, R²: 0.6361, Training Time: 13.38 sec
Training is complete.


In [8]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import logging

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

def predict_3pp_for_new_season(new_data_path, historical_data_path, player_names_included=True):
    """
    Predict 3P% for NBA players in the 2025-26 season without relying on saved scaler.
    
    Parameters:
    new_data_path (str): Path to the CSV file containing new player stats
    historical_data_path (str): Path to the CSV file containing historical stats used for training
    player_names_included (bool): Whether the input CSV contains player names
    
    Returns:
    DataFrame: DataFrame with player names and predicted 3P%
    """
    # load the model that we got from the training program 
    logging.info("Loading trained model")
    best_model = joblib.load("3PP_best_model.pkl")
    
    # get the selected features by looking at the model's feature names
    try:
        selected_features = best_model.feature_names_in_
        logging.info(f"Retrieved {len(selected_features)} features from model")
    except AttributeError:
        # ff feature names aren't stored in the model, we need to reconstruct the feature selection process
        logging.warning("Model doesn't contain feature names, using hardcoded selected features")
        # replace with the actual features printed during training
        selected_features = [
            'GS', 'FG', 'FGA', 'FG%', '3P', '3PA', 'ORB', 'AST', 'PF', 'YR'
        ]
    
    # load historical data to recreate the scaler
    logging.info(f"Loading historical data from {historical_data_path}")
    try:
        historical_df = pd.read_csv(historical_data_path)
    except FileNotFoundError:
        logging.error(f"Historical dataset not found: {historical_data_path}")
        raise SystemExit
    
    # store target column separately
    target_column = "3P%"
    
    # filter out team totals as we did in the training_program
    historical_df = historical_df[historical_df["Player"] != "Team Totals"].copy()
    
    # remove columns that we do not need for prediction
    remove_columns = ["Age", "Rk", "Player", "TEAM"]
    historical_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # drop the target column (3P%) before preprocessing
    x_historical = historical_df.drop(columns=[target_column])
    
    # recreate the imputer and scaler on historical data
    logging.info("Recreating preprocessing steps from training")
    numeric_features = x_historical.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy="median")
    x_historical[numeric_features] = imputer.fit_transform(x_historical[numeric_features])
    
    # recreate the scaler
    scaler = StandardScaler()
    scaler.fit(x_historical)
    logging.info("Scaler recreated with historical data")
    
    # load and prepare new player data
    logging.info(f"Loading new player data from {new_data_path}")
    try:
        new_players_df = pd.read_csv(new_data_path)
    except FileNotFoundError:
        logging.error(f"File not found: {new_data_path}")
        raise SystemExit
    
    # store player names if included in the input
    if player_names_included:
        player_names = new_players_df["Player"].copy()
        team_names = new_players_df["TEAM"].copy() if "TEAM" in new_players_df.columns else None
    
    # remove columns not used in training
    new_players_df.drop(columns=remove_columns, inplace=True, errors='ignore')
    
    # remove the target column from new data if it exists
    if target_column in new_players_df.columns:
        logging.info(f"Removing {target_column} column from new data as it's the prediction target")
        new_players_df.drop(columns=[target_column], inplace=True)
    
    # ensure the new data has exactly the same columns as the training data
    missing_cols = set(x_historical.columns) - set(new_players_df.columns)
    if missing_cols:
        logging.warning(f"Missing columns in new data: {missing_cols}")
        for col in missing_cols:
            new_players_df[col] = 0  # Fill with zeros as placeholder
    
    # ensure columns are in the same order
    new_players_df = new_players_df[x_historical.columns]
    
    # handle missing values
    numeric_features = new_players_df.select_dtypes(include=np.number).columns
    new_players_df[numeric_features] = imputer.transform(new_players_df[numeric_features])
    
    # transform and select features
    logging.info("Scaling features and selecting relevant ones")
    new_players_scaled = pd.DataFrame(
        scaler.transform(new_players_df), 
        columns=new_players_df.columns
    )
    
    # ensure only selected features are used
    if set(selected_features).issubset(set(new_players_scaled.columns)):
        new_players_selected = new_players_scaled[selected_features]
    else:
        logging.error(f"Selected features {selected_features} not found in scaled data columns: {new_players_scaled.columns}")
        raise ValueError("Feature mismatch between model and input data")
    
    # make predictions
    logging.info("Making predictions")
    predicted_3pp = best_model.predict(new_players_selected)
    
    # create results dataframe
    if player_names_included:
        results = {
            "Player": player_names,
            "Predicted_3P%": predicted_3pp
        }
        if team_names is not None:
            results["TEAM"] = team_names
        
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.DataFrame({"Predicted_3P%": predicted_3pp})
    
    logging.info("Prediction complete")
    return results_df

if __name__ == "__main__":
    historical_data_path = "nba_historical_stats.csv"  # same file used for training
    new_data_path = "nba_player_stats_nba_api_2024-25.csv"  # data for predictions
    
    predictions = predict_3pp_for_new_season(new_data_path, historical_data_path)
    
    # save predictions
    predictions.to_csv("predicted_3pp_2025-26.csv", index=False)
    
    print("\nTop 10 players by predicted 3P%:")
    print(predictions.sort_values("Predicted_3P%", ascending=False).head(10))

2025-03-26 19:42:14,283 - INFO - Loading trained model
2025-03-26 19:42:14,400 - INFO - Retrieved 15 features from model
2025-03-26 19:42:14,401 - INFO - Loading historical data from nba_historical_stats.csv
2025-03-26 19:42:14,515 - INFO - Recreating preprocessing steps from training
2025-03-26 19:42:14,588 - INFO - Scaler recreated with historical data
2025-03-26 19:42:14,589 - INFO - Loading new player data from nba_player_stats_nba_api_2024-25.csv
2025-03-26 19:42:14,596 - INFO - Removing 3P% column from new data as it's the prediction target
2025-03-26 19:42:14,603 - INFO - Scaling features and selecting relevant ones
2025-03-26 19:42:14,607 - INFO - Making predictions
2025-03-26 19:42:14,656 - INFO - Prediction complete



Top 10 players by predicted 3P%:
                  Player  Predicted_3P% TEAM
507         Tony Bradley       0.985000  IND
18      Alondes Williams       0.979495  DET
485      Skal Labissiere       0.961005  SAC
433            PJ Dozier       0.573564  MIN
14            Alex Ducas       0.520999  OKC
388        Maxwell Lewis       0.516533  BKN
377       Markelle Fultz       0.511609  SAC
440  Patrick Baldwin Jr.       0.507365  LAC
483         Sidy Cissoko       0.503255  POR
155            Dru Smith       0.492179  MIA
