## Import Packages

In [231]:
import json
import os
import pickle
import random
import re
import time
from datetime import datetime

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import shap
import xgboost as xgb
from bs4 import BeautifulSoup, Comment
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score, brier_score_loss, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

## Sports Reference Advanced Stats Data Collection

In [232]:
# Scrape the team ratings table and save selected columns to nba_rates.csv.
url = "https://www.basketball-reference.com/leagues/NBA_2025_ratings.html"

# Set headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
}

# Highest positional index read from each row below (cols[13]).
MAX_COL_INDEX = 13

# Request the webpage; the timeout keeps a stalled connection from hanging the cell.
response = requests.get(url, headers=headers, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve page. Status Code: {response.status_code}")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table with team ratings
    table = soup.find("table", {"id": "ratings"})

    if table:
        # Extract data rows. The table's header texts are not needed (columns
        # are read by position), so the request `headers` dict above is no
        # longer overwritten with them.
        data = []
        rows = table.find("tbody").find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            # Skip spacer/header rows (no <td>) and any row too short to index safely.
            if len(cols) > MAX_COL_INDEX:
                # Positional indices refer to <td> cells only (any leading <th>
                # cell in the row is not part of `cols`).
                # NOTE(review): the index -> column mapping follows the page
                # layout at the time of writing; re-check if the site changes.
                team_name = cols[0].text.strip()  # Team name (first <td>)
                ortg = cols[7].text.strip()  # Offensive Rating
                drtg = cols[8].text.strip()  # Defensive Rating
                net_a = cols[13].text.strip()  # Adjusted Net Rating
                wins = cols[3].text.strip()  # Wins
                losses = cols[4].text.strip()  # Losses
                record = cols[5].text.strip()  # Win-Loss Percentage

                data.append([team_name, ortg, drtg, net_a, wins, losses, record])

        # Convert to DataFrame
        df = pd.DataFrame(data, columns=["Team", "Offensive Rating", "Defensive Rating", "Adjusted Net Rating", "Wins", "Losses", "Win Percentage"])

        # Save to CSV
        df.to_csv("nba_rates.csv", index=False)
        print(df.head())
    else:
        print("Failed to find the stats table.")

                     Team Offensive Rating Defensive Rating  \
0   Oklahoma City Thunder           120.23           107.77   
1     Cleveland Cavaliers           122.41           112.48   
2          Boston Celtics           120.93           111.73   
3         Houston Rockets           115.39           110.38   
4  Minnesota Timberwolves           116.31           111.62   

  Adjusted Net Rating Wins Losses Win Percentage  
0               12.52   66     14           .825  
1                9.08   63     16           .797  
2                8.58   59     21           .738  
3                5.40   52     28           .650  
4                4.98   46     33           .582  


## Games Web Scrape

In [233]:
# Scrape monthly schedule pages for several seasons and save one row per
# completed game to nba_games.csv.
base_url = "https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
months = ['october', 'november', 'december', 'january', 'february', 'march']
years = [2025, 2024, 2023, 2022]  # Include the current and 3 previous seasons
# Pause after every page request: this loop issues len(years) * len(months)
# requests back to back, which is likely to be throttled by the site.
REQUEST_DELAY_SECONDS = 3.5
all_games_data = []

for year in years:
    for month in months:
        url = base_url.format(year=year, month=month)
        print(f"Scraping data from: {url}")
        response = requests.get(url, timeout=30)
        time.sleep(REQUEST_DELAY_SECONDS)

        if response.status_code != 200:
            print(f"  Request failed with status {response.status_code} for NBA_{year}_games-{month}.html")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        schedule_table = soup.find('table', id='schedule')
        if schedule_table is None:
            print(f"  Schedule table not found on NBA_{year}_games-{month}.html")
            continue

        tbody = schedule_table.find('tbody')
        if tbody is None:
            print(f"  <tbody> not found on NBA_{year}_games-{month}.html")
            continue

        for row in tbody.find_all('tr'):
            th_date = row.find('th', {'data-stat': 'date_game'})
            if not (th_date and th_date.a):  # Ensure it's a game row and has a link
                continue

            date_text = th_date.a.text
            time_element = row.find('td', {'data-stat': 'game_start_time'})
            away_team_element = row.find('td', {'data-stat': 'visitor_team_name'})
            away_score_element = row.find('td', {'data-stat': 'visitor_pts'})
            home_team_element = row.find('td', {'data-stat': 'home_team_name'})
            home_score_element = row.find('td', {'data-stat': 'home_pts'})

            if not all([time_element, away_team_element, away_score_element, home_team_element, home_score_element]):
                continue

            # A score cell can exist but be empty (e.g. a game that has not
            # been played yet); int('') would raise, so skip those rows.
            away_score_text = away_score_element.text.strip()
            home_score_text = home_score_element.text.strip()
            if not away_score_text or not home_score_text:
                continue

            # Named start_time (not `time`) so the imported `time` module is
            # not shadowed for the rest of the notebook.
            start_time = time_element.text
            away_team = away_team_element.text
            away_score = int(away_score_text)
            home_team = home_team_element.text
            home_score = int(home_score_text)

            # NOTE(review): `year` is the year in the page URL, applied to every
            # month from October through March; if the link text already carries
            # a year this appends a second one. Confirm the resulting 'Date'
            # strings in nba_games.csv before parsing them as dates.
            full_date = f"{date_text}, {year}"
            final_score = f"{away_score}-{home_score}"

            if away_score > home_score:
                winning_team = away_team
            elif home_score > away_score:
                winning_team = home_team
            else:
                winning_team = "Tie"  # Handle potential ties (though rare in NBA)

            all_games_data.append({
                'Date': full_date,
                'Time': start_time,
                'Home Team': home_team,
                'Home Score': home_score,
                'Away Team': away_team,
                'Away Score': away_score,
                'Final Score': final_score,
                'Winner': winning_team  # Name of the winning team, or "Tie"
            })

df = pd.DataFrame(all_games_data)
df.to_csv('nba_games.csv', index=False)

print("nba_games.csv created")

Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-october.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-november.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-december.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-january.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-february.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2025_games-march.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2024_games-october.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2024_games-november.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2024_games-december.html
Scraping data from: https://www.basketball-reference.com/leagues/NBA_2024_games-january.html
Scraping data from: https://www.basketball-reference.com/leagues/NB

## Team Stats Web Scrape

In [234]:
# Scrape the league "Per Game" team table and keep a handful of columns.
url = "https://www.basketball-reference.com/leagues/NBA_2025.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the "Per Game" table by its id, then its header section (if any).
per_game_table = soup.find('table', id='per_game-team')
thead = per_game_table.find('thead') if per_game_table else None

if not per_game_table:
    print("Per Game table not found on the page.")
elif not thead:
    print("Table header (thead) not found.")
else:
    # Header texts come from the first <tr> of the header section.
    header_row = thead.find('tr')
    all_headers = [th.text.strip() for th in header_row.find_all('th')]

    desired_columns_onsite = ['Team', 'PTS', 'FG%', 'TOV', 'TRB', 'FTA']
    extracted_headers = []
    header_indices = []

    # Remember the header position of every wanted column that is present.
    for col in desired_columns_onsite:
        if col in all_headers:
            extracted_headers.append(col)
            header_indices.append(all_headers.index(col))
        else:
            print(f"Warning: Column '{col}' not found in the table headers.")

    body_rows = per_game_table.find('tbody').find_all('tr')

    data = []
    for row in body_rows:
        row_cells = row.find_all('td')
        if not row_cells:
            continue
        # Header positions are shifted by one when indexing <td> cells: the code
        # assumes the first header ('Rk') has no matching <td> in the row.
        # Out-of-range positions become None.
        row_data = [
            row_cells[index - 1].text.strip() if 0 <= index - 1 < len(row_cells) else None
            for index in header_indices
        ]
        data.append(row_data)

    # Build the frame, then drop rows where every selected column is missing.
    df = pd.DataFrame(data, columns=extracted_headers)
    df = df.dropna(subset=extracted_headers, how='all')

    # Save to CSV
    df.to_csv('nba_team_stats.csv', index=False)
    print("NBA 2025 team stats (Team, PTS, FG%, TOV, TRB, FTA) scraped and saved to csv")

NBA 2025 team stats (Team, PTS, FG%, TOV, TRB, FTA) scraped and saved to csv


## Odds API Data Collection 

In [235]:
# Pull current NBA odds (moneyline, spreads, totals) from The Odds API and save
# one row per bookmaker/market/outcome to nba_odds.csv.
#
# SECURITY: the API key is read from the ODDS_API_KEY environment variable so
# the secret never lives in the notebook. A real key was previously pasted into
# a comment in this cell; it has been removed and should be treated as
# compromised and rotated.
API_KEY = os.environ.get('ODDS_API_KEY', 'YOUR_API_KEY')
SPORT = 'basketball_nba'
REGION = 'us'  # Bookmaker region to query; this does NOT select the odds format
ODDS_FORMAT = 'decimal'  # Requested explicitly: the EV / Kelly maths later in the notebook treats 'Odds' as decimal odds

# List of markets to pull
markets = ["h2h", "spreads", "totals"]

all_data = []

for market in markets:
    url = f'https://api.the-odds-api.com/v4/sports/{SPORT}/odds/'
    # Passing params separately lets requests handle URL encoding.
    response = requests.get(
        url,
        params={
            'apiKey': API_KEY,
            'regions': REGION,
            'markets': market,
            'oddsFormat': ODDS_FORMAT,
        },
        timeout=30,
    )

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        continue

    odds_data = response.json()

    for game in odds_data:
        home_team = game["home_team"]
        away_team = game["away_team"]
        commence_time = game["commence_time"]

        for bookmaker in game["bookmakers"]:
            bookmaker_name = bookmaker["title"]

            for market_data in bookmaker["markets"]:
                market_type = market_data["key"]  # "h2h", "spreads", or "totals"

                for outcome in market_data["outcomes"]:
                    team = outcome.get("name", "N/A")
                    price = outcome.get("price", "N/A")
                    point = outcome.get("point", "N/A")  # Only applies to spreads and totals

                    all_data.append([home_team, away_team, commence_time, bookmaker_name, market_type, team, price, point])

# Convert to DataFrame
df = pd.DataFrame(all_data, columns=["Home Team", "Away Team", "Commence Time", "Bookmaker", "Market", "Team", "Odds", "Point"])

# NOTE: despite its name, 'Favorite' is simply the outcome's name (the team the
# price applies to), not the betting favorite. The column name is kept because
# later cells read it from nba_odds.csv.
df['Favorite'] = df['Team']

# Drop Commence Time and Team
df1 = df.drop(columns=['Commence Time', 'Team'])

# Save to CSV
df1.to_csv("nba_odds.csv", index=False)
print("NBA odds data saved to nba_odds.csv")

NBA odds data saved to nba_odds.csv


## Merge Data Together into One CSV

In [236]:
# Load the three scraped CSVs: season ratings, per-game team stats, game results.
advanced_df = pd.read_csv('nba_rates.csv')
team_stats_df = pd.read_csv('nba_team_stats.csv')
games_df = pd.read_csv('nba_games.csv')

# Attach team-level columns to every game four times: per-game stats for the
# home and away side, then advanced ratings for the home and away side.
#
# Column naming relies on merge suffixes, which pandas applies only to names
# present on BOTH sides of a merge:
#   * the 1st and 3rd merges bring in columns under their plain names;
#   * the 2nd and 4th merges collide with those plain names, so the earlier copy
#     gets the home suffix and the new copy gets the away suffix.
# The right-hand 'Team' key duplicates the left-hand key and is dropped each time.
# The stats merges are left joins; the ratings merges use the default inner join.
merged_df = (
    games_df
    .merge(team_stats_df, left_on='Home Team', right_on='Team', suffixes=('', '_home'), how='left')
    .drop(columns=['Team'])
    .merge(team_stats_df, left_on='Away Team', right_on='Team', suffixes=('_home', '_away'), how='left')
    .drop(columns=['Team'])
    .merge(advanced_df, left_on='Home Team', right_on='Team', suffixes=('', '_home_adv'))
    .drop(columns=['Team'])
    .merge(advanced_df, left_on='Away Team', right_on='Team', suffixes=('_home_adv', '_away_adv'))
    .drop(columns=['Team'])
)

def is_home_win(row):
    """Return 1 if the row's 'Winner' equals its 'Home Team', otherwise 0.

    `row` is any mapping-like object (e.g. a DataFrame row) exposing the
    'Winner' and 'Home Team' keys.
    """
    return 1 if row['Winner'] == row['Home Team'] else 0

# Target label: 1 if the home team won, 0 otherwise (an away win or a "Tie").
# Vectorised comparison; same result as applying is_home_win to each row, and
# it also works when merged_df has no rows.
merged_df['Home_Win'] = (merged_df['Winner'] == merged_df['Home Team']).astype(int)

# Final margin from the home team's point of view (negative = away win).
merged_df['point_diff'] = merged_df['Home Score'] - merged_df['Away Score']

# to_csv returns None when given a path, so its result is not assigned.
# index=False keeps the row index out of the file (it would otherwise come back
# as an 'Unnamed: 0' column when the CSV is re-read).
merged_df.to_csv('merged_nba.csv', index=False)
print('Merge Complete')

Merge Complete


## Feature Selection and Model Building

In [237]:
# Train an XGBoost home-win classifier on the merged game data, persist it,
# report probability metrics on a held-out split, and save a SHAP summary plot.

# Load dataset
df = pd.read_csv('merged_nba.csv')

# Feature selection: paired home/away team stats and ratings from the merge step.
features = ['PTS_home', 'PTS_away', 'TOV_home', 'TOV_away', 'TRB_home',
            'TRB_away', 'FG%_home', 'FG%_away', 'FTA_home', 'FTA_away',
            'Offensive Rating_home_adv', 'Defensive Rating_home_adv',
            'Offensive Rating_away_adv', 'Defensive Rating_away_adv',
            'Adjusted Net Rating_home_adv', 'Adjusted Net Rating_away_adv',
            'Win Percentage_home_adv', 'Win Percentage_away_adv']
X = df[features]
y = df['Home_Win']

# Split dataset (stratified so both splits keep the same home-win rate)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

# Handle class imbalance using SMOTE (training split only; the test split is untouched).
# NOTE(review): resampling happens before GridSearchCV, so synthetic rows derived
# from a training row can end up in that row's validation fold and the CV score
# used for model selection may be optimistic. Resampling inside each fold (e.g.
# an imblearn Pipeline wrapping SMOTE + the classifier) would avoid this.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Select on log loss because the downstream betting logic consumes probabilities.
grid_search = GridSearchCV(xgb_model, param_grid, scoring='neg_log_loss',
                          cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best model
best_xgb = grid_search.best_estimator_

# Save the best XGBoost model to a file; the context manager guarantees the
# handle is flushed and closed even if pickling fails.
model_filename = 'nba_win_predictor_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_xgb, model_file)

print(f"Trained model saved as: {model_filename}")

# Predict probabilities (column 1 = probability of class 1, i.e. a home win)
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]

# Evaluate model
logloss = log_loss(y_test, y_pred_proba)
brier = brier_score_loss(y_test, y_pred_proba)

print(f'Log Loss: {logloss:.4f}')
print(f'Brier Score: {brier:.4f}')
print(classification_report(y_test, (y_pred_proba > 0.5).astype(int)))

# Feature Analysis using SHAP values
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(X_test)

# Generate the summary plot; show=False leaves the figure open so it can be saved.
shap_plot_filename = "shap_nba_plot.pdf"
plt.figure()
shap.summary_plot(shap_values, X_test, show=False)

# Save the plot as a PDF
plt.savefig(shap_plot_filename)

# Close the plot to free up resources
plt.close()

# Report the file that was actually written (the message previously named a
# different file from the one saved).
print(f"SHAP summary plot saved as {shap_plot_filename}")

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Trained model saved as: nba_win_predictor_model.pkl
Log Loss: 0.6609
Brier Score: 0.2343
              precision    recall  f1-score   support

           0       0.55      0.58      0.56       406
           1       0.65      0.61      0.63       505

    accuracy                           0.60       911
   macro avg       0.60      0.60      0.60       911
weighted avg       0.60      0.60      0.60       911

SHAP summary plot saved as shap_summary_plot.pdf


## Automate Bankroll Strategy

In [238]:
# Load the trained XGBoost model saved by the model-building cell.
model_filename = 'nba_win_predictor_model.pkl'
try:
    with open(model_filename, 'rb') as file:
        # SECURITY: unpickling can execute arbitrary code. Only load model files
        # produced by this notebook, never ones from an untrusted source.
        model = pickle.load(file)
except FileNotFoundError:
    print(f"Error: Model file not found at {model_filename}. Make sure you have trained and saved the model (run the 'Feature Selection and Model Building' cell first).")
    # Re-raise instead of calling exit(): exit() would shut down the notebook
    # kernel, while raising just stops this cell with a clear traceback.
    raise
else:
    print(f"\nTrained model loaded successfully from: {model_filename}")

# Load the upcoming matchups and odds data
upcoming_df = pd.read_csv('nba_odds.csv')
print("\nUpcoming Matchup Data:")
print(upcoming_df.head())

def create_features(home_team, away_team, current_date, stats_df=None):
    """Build the model's feature dict for one matchup.

    Home-side features are taken from the first row of the stats table where
    `home_team` is the 'Home Team'; away-side features from the first row where
    `away_team` is the 'Away Team'.

    Parameters
    ----------
    home_team, away_team : str
        Team names as they appear in the 'Home Team' / 'Away Team' columns.
    current_date : any
        Accepted for interface compatibility; not used.
    stats_df : pandas.DataFrame, optional
        Pre-loaded merged stats. When omitted, 'merged_nba.csv' is read from
        disk on every call; pass a frame to avoid that when calling in a loop.

    Returns
    -------
    dict
        Feature name -> value, in the same order as the training feature list.
        A feature whose column is absent from the stats table maps to None.

    Raises
    ------
    ValueError
        If no row exists for `home_team` as home side or `away_team` as away side.
    """
    if stats_df is None:
        stats_df = pd.read_csv('merged_nba.csv')  # Adjust your data source

    home_rows = stats_df[stats_df['Home Team'] == home_team]
    away_rows = stats_df[stats_df['Away Team'] == away_team]
    # Fail with a descriptive ValueError instead of an opaque IndexError from .iloc[0].
    if home_rows.empty:
        raise ValueError(f"No home-game stats found for team '{home_team}'")
    if away_rows.empty:
        raise ValueError(f"No away-game stats found for team '{away_team}'")

    home_stats = home_rows.iloc[0]
    away_stats = away_rows.iloc[0]

    # Order matters: it must match the column order the model was trained on.
    feature_names = (
        'PTS_home', 'PTS_away',
        'TOV_home', 'TOV_away',
        'TRB_home', 'TRB_away',
        'FG%_home', 'FG%_away',
        'FTA_home', 'FTA_away',
        'Offensive Rating_home_adv', 'Defensive Rating_home_adv',
        'Offensive Rating_away_adv', 'Defensive Rating_away_adv',
        'Adjusted Net Rating_home_adv', 'Adjusted Net Rating_away_adv',
        'Win Percentage_home_adv', 'Win Percentage_away_adv',
    )

    features = {}
    for name in feature_names:
        # '*_home*' columns come from the home team's row, everything else
        # ('*_away*') from the away team's row.
        source_row = home_stats if '_home' in name else away_stats
        features[name] = source_row.get(name)
    return features

def calculate_ev(probability_win, decimal_odds):
    """Expected profit per 1 unit staked at the given decimal odds.

    A win returns a profit of (decimal_odds - 1) with probability
    `probability_win`; a loss costs the 1-unit stake with the remaining
    probability.
    """
    profit_if_win = decimal_odds - 1
    expected_gain = probability_win * profit_if_win
    expected_loss = (1 - probability_win) * 1
    return expected_gain - expected_loss

def calculate_half_kelly_fraction(probability_win, decimal_odds):
    """Return half of the Kelly stake fraction for a bet, or 0 if there is no edge.

    The Kelly fraction is f = p - q / b, where p is the win probability,
    q = 1 - p, and b is the NET odds (profit per unit staked), i.e.
    b = decimal_odds - 1. Dividing by the decimal odds themselves (as this
    function previously did) overstates the stake.

    Parameters
    ----------
    probability_win : float
        Estimated probability that the bet wins.
    decimal_odds : float
        Decimal odds offered (total return per unit staked, stake included).

    Returns
    -------
    float or int
        0.5 * Kelly fraction when the bet has positive expected value
        (probability_win * decimal_odds > 1), otherwise 0.
    """
    net_odds = decimal_odds - 1
    # No edge (this also covers NaN inputs, whose comparison is False), or
    # non-positive net odds that would make the formula meaningless.
    if not (probability_win * decimal_odds > 1) or net_odds <= 0:
        return 0

    q = 1 - probability_win
    kelly_fraction = probability_win - (q / net_odds)
    return 0.5 * kelly_fraction

def calculate_bet_size(bankroll, half_kelly_fraction):
    """Convert a stake fraction into a stake amount: bankroll times fraction."""
    stake = bankroll * half_kelly_fraction
    return stake

# Scan every odds row, estimate the win probability of the priced outcome, and
# record the bets with positive expected value, sized by half Kelly.
#
# NOTE(review): every opportunity is sized against the full `current_bankroll`
# independently, and the same outcome can be listed once per bookmaker, so the
# 'Bet Size' values are not meant to be added together.
current_bankroll = 100
betting_opportunities = []

# Timestamp handed to create_features (timezone-aware local time).
matchup_date = datetime.now().astimezone()

# Model output per (home, away) matchup. The same matchup appears once per
# bookmaker and outcome, so caching avoids rebuilding features and re-running
# the model for identical inputs.
home_win_probabilities = {}

for index, row in upcoming_df.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    bookmaker = row['Bookmaker']
    market = row['Market']
    odds = row['Odds']
    point = row['Point']
    # Despite the column name, this is the team the row's price applies to.
    favorite = row['Favorite']

    if market == 'h2h':
        try:
            odds = float(odds)

            matchup = (home_team, away_team)
            if matchup not in home_win_probabilities:
                # Create features for the current matchup
                matchup_features = create_features(home_team, away_team, matchup_date)
                feature_df = pd.DataFrame([matchup_features])
                home_win_probabilities[matchup] = model.predict_proba(feature_df)[0][1]
            home_win_probability = home_win_probabilities[matchup]
            away_win_probability = 1 - home_win_probability

            # Evaluate the side this row's price belongs to.
            if favorite == home_team:
                team_to_bet_on = 'Home Team Win'
                probability = home_win_probability
            elif favorite == away_team:
                team_to_bet_on = 'Away Team Win'
                probability = away_win_probability
            else:
                continue  # Outcome is neither team; nothing to evaluate

            ev = calculate_ev(probability, odds)
            kelly_fraction = calculate_half_kelly_fraction(probability, odds)
            if ev > 0 and kelly_fraction > 0:
                bet_size = calculate_bet_size(current_bankroll, kelly_fraction)
                betting_opportunities.append({
                    'Home Team': home_team,
                    'Away Team': away_team,
                    'Bet On': team_to_bet_on,
                    'Probability': probability,
                    'Odds': odds,
                    'EV': ev,
                    'Kelly Fraction': kelly_fraction,
                    'Bet Size': bet_size,
                    'Bookmaker': bookmaker,
                    'Market': market
                })

        # IndexError covers a team with no matching row in the stats lookup, so
        # one unknown team no longer aborts the whole scan.
        except (ValueError, IndexError) as e:
            print(f"Warning (h2h): Could not process odds for {home_team} vs {away_team} at {bookmaker}: {e}")

    elif market == 'spreads':
        try:
            odds = float(odds)
            point = float(point)

            # NOTE(review): 0.5 is a placeholder, not a model estimate. Any
            # spread priced above 2.0 will therefore show positive EV; treat
            # spread rows as unvalidated until a real cover probability exists.
            if favorite == home_team:
                team_to_bet_on = f'Home Team Spread ({point})'
                implied_win_probability = 0.5 # Placeholder
            elif favorite == away_team:
                team_to_bet_on = f'Away Team Spread ({point})'
                implied_win_probability = 0.5 # Placeholder
            else:
                continue

            ev = calculate_ev(implied_win_probability, odds)
            kelly_fraction = calculate_half_kelly_fraction(implied_win_probability, odds)

            if ev > 0 and kelly_fraction > 0:
                bet_size = calculate_bet_size(current_bankroll, kelly_fraction)
                betting_opportunities.append({
                    'Home Team': home_team,
                    'Away Team': away_team,
                    'Bet On': team_to_bet_on,
                    'Probability': implied_win_probability,
                    'Odds': odds,
                    'Point': point,
                    'EV': ev,
                    'Kelly Fraction': kelly_fraction,
                    'Bet Size': bet_size,
                    'Bookmaker': bookmaker,
                    'Market': market
                })

        except ValueError as e:
            print(f"Warning (spreads): Could not process odds or point for {home_team} vs {away_team} at {bookmaker}: {e}")

betting_df = pd.DataFrame(betting_opportunities)
print("\nBetting Opportunities (Half Kelly Criterion):")
print(betting_df)
betting_df.to_csv('nba_picks.csv', index=False)


Trained model loaded successfully from: nba_win_predictor_model.pkl

Upcoming Matchup Data:
        Home Team            Away Team    Bookmaker Market  Odds  Point  \
0  Indiana Pacers  Cleveland Cavaliers   DraftKings    h2h  4.60    NaN   
1  Indiana Pacers  Cleveland Cavaliers   DraftKings    h2h  1.21    NaN   
2  Indiana Pacers  Cleveland Cavaliers  MyBookie.ag    h2h  4.58    NaN   
3  Indiana Pacers  Cleveland Cavaliers  MyBookie.ag    h2h  1.20    NaN   
4  Indiana Pacers  Cleveland Cavaliers      FanDuel    h2h  4.40    NaN   

              Favorite  
0  Cleveland Cavaliers  
1       Indiana Pacers  
2  Cleveland Cavaliers  
3       Indiana Pacers  
4  Cleveland Cavaliers  

Betting Opportunities (Half Kelly Criterion):
            Home Team               Away Team         Bet On  Probability  \
0      Indiana Pacers     Cleveland Cavaliers  Away Team Win     0.557650   
1      Indiana Pacers     Cleveland Cavaliers  Away Team Win     0.557650   
2      Indiana Pacers     Cl