In [2]:
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
import pandas as pd
import numpy as np


1. DATA COLLECTION

In [3]:
# Fetches every regular-season game row for the requested season
def download_season(season):
    """Download all regular-season game records for one season.

    Args:
        season: Season identifier forwarded to the endpoint as
            ``season_nullable`` (this notebook passes strings such as
            '2022-23').

    Returns:
        The DataFrame produced by the endpoint's
        ``league_game_finder_results.get_data_frame()``.
    """
    print(f"Downloading season {season}...")
    finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable='Regular Season',
    )
    return finder.league_game_finder_results.get_data_frame()

# Note: the training seasons must be chronologically earlier than the test season;
# learning from future games and then predicting past games would make no sense.
train_games_df = pd.concat(
    [download_season(season) for season in ('2022-23', '2023-24')],
    ignore_index=True,
)
test_games_df = download_season('2024-25')

print(f'Downloaded {len(train_games_df)} train games and {len(test_games_df)} test games!')

train_games_df.head()

Downloading season 2022-23...
Downloading season 2023-24...
Downloading season 2024-25...
Downloaded 4920 train games and 2460 test games!


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22022,1610612739,CLE,Cleveland Cavaliers,22201218,2023-04-09,CLE vs. CHA,L,241,95,...,0.938,11,33,44,27,9,5,16,24,-11.0
1,22022,1610612763,MEM,Memphis Grizzlies,22201226,2023-04-09,MEM @ OKC,L,241,100,...,0.722,11,32,43,25,8,4,12,16,-15.0
2,22022,1610612743,DEN,Denver Nuggets,22201227,2023-04-09,DEN vs. SAC,W,240,109,...,0.72,15,36,51,25,11,2,16,15,14.0
3,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,...,0.75,7,37,44,30,10,3,18,20,13.0
4,22022,1610612752,NYK,New York Knicks,22201220,2023-04-09,NYK vs. IND,L,241,136,...,0.773,19,34,53,29,8,8,15,24,-5.0


2. FEATURE ENGINEERING

In [4]:

def adjust_game_df(games_df):
    """Return a cleaned, chronologically sorted copy of a raw game log.

    The input frame is NOT modified, so the function can safely be re-run on
    the same raw data.

    Steps applied to the copy:
      * ``WIN``  - 1 where ``WL`` equals 'W', otherwise 0.
      * ``HGA``  - 1.0 where ``MATCHUP`` contains 'vs' (home game), otherwise 0.0.
      * the box-score stat columns are cast to float.
      * ``GAME_DATE`` is parsed with ``pd.to_datetime``.
      * rows are sorted by ``GAME_DATE`` and the index is reset.

    Args:
        games_df: DataFrame containing at least ``WL``, ``MATCHUP``,
            ``GAME_DATE`` and the stat columns listed below.

    Returns:
        A new DataFrame with the added/converted columns, sorted by date.
    """
    stat_columns = ['MIN', 'PTS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
                    'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

    # Work on a copy so the caller's raw frame keeps its original columns/dtypes.
    games = games_df.copy()

    # Binary "WIN" column (vectorised instead of a per-row lambda)
    games["WIN"] = (games["WL"] == "W").astype("int64")

    # "HGA" (home game advantage) column, stored as float like the stat columns
    games["HGA"] = games["MATCHUP"].str.contains("vs", regex=False).astype(float)

    # Convert int stat columns to float type
    games = games.astype({column: float for column in stat_columns})

    # Convert game date to pandas datetime
    games["GAME_DATE"] = pd.to_datetime(games["GAME_DATE"])

    # A stable sort keeps same-day rows in their input order, so the result is deterministic.
    return games.sort_values(by="GAME_DATE", kind="stable").reset_index(drop=True)

# Run the same preprocessing on both splits (train first, then test)
train_games_df, test_games_df = adjust_game_df(train_games_df), adjust_game_df(test_games_df)

train_games_df


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,WIN,HGA
0,22022,1610612738,BOS,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,240.0,126.0,...,30.0,36.0,24.0,8.0,3.0,10.0,24.0,9.0,1,1.0
1,22022,1610612755,PHI,Philadelphia 76ers,0022200001,2022-10-18,PHI @ BOS,L,239.0,117.0,...,27.0,31.0,16.0,8.0,3.0,14.0,25.0,-9.0,0,0.0
2,22022,1610612747,LAL,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,241.0,109.0,...,39.0,48.0,23.0,12.0,4.0,21.0,18.0,-14.0,0,0.0
3,22022,1610612744,GSW,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,241.0,123.0,...,37.0,48.0,31.0,11.0,4.0,18.0,23.0,14.0,1,1.0
4,22022,1610612758,SAC,Sacramento Kings,0022200014,2022-10-19,SAC vs. POR,L,241.0,108.0,...,37.0,41.0,27.0,8.0,5.0,15.0,25.0,-7.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,22023,1610612756,PHX,Phoenix Suns,0022301194,2024-04-14,PHX @ MIN,W,244.0,125.0,...,19.0,32.0,28.0,13.0,3.0,12.0,24.0,19.0,1,0.0
4916,22023,1610612751,BKN,Brooklyn Nets,0022301192,2024-04-14,BKN @ PHI,L,240.0,86.0,...,38.0,42.0,19.0,8.0,6.0,12.0,14.0,-21.0,0,0.0
4917,22023,1610612753,ORL,Orlando Magic,0022301191,2024-04-14,ORL vs. MIL,W,240.0,113.0,...,36.0,43.0,27.0,11.0,5.0,11.0,18.0,25.0,1,1.0
4918,22023,1610612742,DAL,Dallas Mavericks,0022301196,2024-04-14,DAL @ OKC,L,241.0,86.0,...,29.0,41.0,19.0,8.0,2.0,14.0,15.0,-49.0,0,0.0


In [None]:
# Function that returns the label and the list of features for a specific game, in this order:
#  label, [win_rate_diff, HGA, avg_points_diff, fg3_percentage_diff, avg_blocks_diff]
def extract_features(game, all_games, n_matches = 10):
    """Build the label and feature vector for one team-game row.

    Every "diff" feature is the team's value minus the opponent's value,
    computed over each side's last ``n_matches`` games played strictly before
    ``game["GAME_DATE"]``.

    Args:
        game: Row with ``TEAM_ID``, ``MATCHUP``, ``GAME_DATE``, ``HGA`` and ``WIN``.
        all_games: DataFrame of games with ``TEAM_ID``, ``GAME_DATE``, ``WIN``,
            ``PTS``, ``FG3M``, ``FG3A`` and ``BLK`` columns. It does not need
            to be pre-sorted.
        n_matches: Number of previous games per team used for the statistics.

    Returns:
        ``None`` if either team has fewer than ``n_matches`` earlier games,
        otherwise ``(label, features)`` where ``label`` is ``game["WIN"]`` and
        ``features`` is ``[win_rate_diff, home_advantage, avg_points_diff,
        fg3_percentage_diff, avg_blocks_diff]``.
    """
    team_id = game["TEAM_ID"]
    opponent_id = get_opponent_id(game)
    game_date = game["GAME_DATE"]

    # Only games played strictly before this one may be used; computed once for both teams.
    played_earlier = all_games["GAME_DATE"] < game_date

    def _last_games(tid):
        # Sort explicitly so "last n" is correct even when all_games is not
        # ordered by date (raw API output is newest-first).
        candidates = all_games[played_earlier & (all_games["TEAM_ID"] == tid)]
        return candidates.sort_values("GAME_DATE", kind="stable").tail(n_matches)

    # Find last n games played by both teams
    team_recent_games = _last_games(team_id)
    opponent_recent_games = _last_games(opponent_id)

    # If there isn't enough information skip
    if len(team_recent_games) < n_matches or len(opponent_recent_games) < n_matches:
        return None

    # FEATURE 1 - Win rate difference
    win_rate_diff = team_recent_games["WIN"].mean() - opponent_recent_games["WIN"].mean()

    # FEATURE 2 - Home advantage
    home_advantage = game["HGA"]

    # FEATURE 3 - Average points difference
    avg_points_diff = team_recent_games["PTS"].mean() - opponent_recent_games["PTS"].mean()

    # FEATURE 4 - Three point percentage difference (made / attempted over the whole window)
    team_fg3_percentage = team_recent_games["FG3M"].sum() / team_recent_games["FG3A"].sum()
    opponent_fg3_percentage = opponent_recent_games["FG3M"].sum() / opponent_recent_games["FG3A"].sum()
    fg3_percentage_diff = team_fg3_percentage - opponent_fg3_percentage

    # FEATURE 5 - Average blocks made difference
    avg_blocks_diff = team_recent_games["BLK"].mean() - opponent_recent_games["BLK"].mean()

    # TODO: maybe add head-to-head history between the two teams or other features, and
    #       compare these features against alternatives.
    # TODO: difference in days since the last game played (teams with fewer rest days play worse).
    # TODO: feel free to brainstorm further features (e.g. with GPT), add them, and check the
    #       accuracy of the baseline model below.

    features = [win_rate_diff, home_advantage, avg_points_diff, fg3_percentage_diff, avg_blocks_diff]

    # LABEL
    label = game["WIN"]

    return label, features

# Function that returns opponent team id
def get_opponent_id(game):
    """Return the team id of the opponent named in ``game["MATCHUP"]``.

    The matchup string is expected to look like ``"AAA vs. BBB"`` (home game)
    or ``"AAA @ BBB"`` (away game); the opponent abbreviation is the part after
    the separator.

    Args:
        game: Mapping/row providing a ``MATCHUP`` string.

    Returns:
        The ``"id"`` entry of the team found for the opponent abbreviation.

    Raises:
        ValueError: If no team matches the parsed abbreviation.
    """
    matchup = game["MATCHUP"]
    separator = ' vs. ' if 'vs' in matchup else ' @ '
    opponent_abbreviation = matchup.split(separator)[-1]

    opponent_team = teams.find_team_by_abbreviation(opponent_abbreviation)
    if opponent_team is None:
        # Fail with a clear message instead of "'NoneType' object is not subscriptable"
        raise ValueError(
            f"Unknown opponent abbreviation {opponent_abbreviation!r} in matchup {matchup!r}"
        )
    return opponent_team["id"]

# Function that creates and returns dataset X and labels y
def create_datasets(games_df: pd.DataFrame, n_matches = 10):
    """Build the feature matrix and label vector for every usable game row.

    Each row of ``games_df`` is passed to ``extract_features`` together with
    the whole frame; rows for which it returns ``None`` are skipped.

    Args:
        games_df: Preprocessed game log (one row per team per game).
        n_matches: Number of previous games per team used for the features.

    Returns:
        ``(X, y)`` as numpy arrays: the collected feature lists and labels,
        in the iteration order of ``games_df``.
    """
    X = []
    y = []

    skipped = 0
    num_of_games = len(games_df)

    # Count rows by position: index labels are only 0..n-1 for a freshly reset index.
    for position, (_, game) in enumerate(games_df.iterrows(), start=1):
        result = extract_features(game, games_df, n_matches)

        if result is None:
            skipped += 1
        else:
            label, features = result
            X.append(features)
            y.append(label)

        # Report progress for every visited row, including skipped ones, so no
        # milestone is silently dropped.
        if position % 500 == 0:
            print(f"Processed {position}/{num_of_games} games...")

    print(f"Created dataset with {num_of_games - skipped} games.")
    print(f"Skipped {skipped} games (games with less than {n_matches} previous matches).\n")

    return np.array(X), np.array(y)

# Build feature matrices and label vectors for both splits (default history window)
X_train, y_train = create_datasets(games_df=train_games_df)
X_test, y_test = create_datasets(games_df=test_games_df)




Processed 500/4920 games...
Processed 1000/4920 games...
Processed 1500/4920 games...
Processed 2000/4920 games...
Processed 2500/4920 games...
Processed 3000/4920 games...
Processed 3500/4920 games...
Processed 4000/4920 games...
Processed 4500/4920 games...
Created dataset with 4606 games.
Skipped 314 games (games with less than 10 previous matches).

Processed 500/2460 games...
Processed 1000/2460 games...
Processed 1500/2460 games...
Processed 2000/2460 games...
Created dataset with 2150 games.
Skipped 310 games (games with less than 10 previous matches).



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# This is only a baseline model; GPT suggests it is a good fit for this kind of classification
# and it also gives us probabilities (useful for evaluating features)

# Scaling features: fit the scaler on the training data, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

predictions = log_reg.predict(X_test_scaled)
prediction_prob = log_reg.predict_proba(X_test_scaled)

# 0 = the team lost, 1 = the team won
# Games in nba_api are stored so that each row holds one team's statistics for a game plus only
# the opponent's name. We will therefore need to remove the duplicated games, because as it stands,
# as soon as we mispredict one row of a game we also mispredict the other one (our accuracy would increase)
df = pd.DataFrame({
    'y_test': y_test,
    'predicted': predictions,
    'winning probability': prediction_prob[:, 1],
})

print(f"Train Accuracy = {accuracy_score(y_true=y_train, y_pred=log_reg.predict(X_train_scaled))}")
print(f"Test Accuracy = {accuracy_score(y_true=y_test, y_pred=predictions)}")
df



Train Accuracy = 0.611376465479809
Test Accuracy = 0.6246511627906977


Unnamed: 0,y_test,predicted,winning probability
0,0,0,0.413945
1,1,1,0.586055
2,0,0,0.495544
3,1,1,0.504456
4,0,0,0.365512
...,...,...,...
2145,1,0,0.366847
2146,0,1,0.561251
2147,1,1,0.719458
2148,1,0,0.441804
