<a href="https://colab.research.google.com/github/fran53759/nba_games_prediction/blob/modelVersions/neumreProjekt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
import pandas as pd
import numpy as np

In [4]:
# Function for getting all regular season games from specified season
def download_season(season):
    """Fetch every regular-season game row of one season via ``LeagueGameFinder``.

    Args:
        season: Season identifier forwarded to the endpoint as
            ``season_nullable`` (the notebook passes strings like ``'2022-23'``).

    Returns:
        The DataFrame produced by ``league_game_finder_results.get_data_frame()``.
    """
    print(f"Downloading season {season}...")
    finder = leaguegamefinder.LeagueGameFinder(
        season_type_nullable='Regular Season',
        season_nullable=season,
    )
    return finder.league_game_finder_results.get_data_frame()

# Chronological split: the training seasons must precede the test season,
# otherwise the model would learn from future games and predict past ones.
train_games_df = pd.concat(
    [download_season(season) for season in ('2022-23', '2023-24')],
    ignore_index=True,
)
test_games_df = download_season('2024-25')

print(f'Downloaded {len(train_games_df)} train games and {len(test_games_df)} test games!')

train_games_df.head()

Downloading season 2022-23...
Downloading season 2023-24...
Downloading season 2024-25...
Downloaded 4920 train games and 2460 test games!


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22022,1610612739,CLE,Cleveland Cavaliers,22201218,2023-04-09,CLE vs. CHA,L,241,95,...,0.938,11,33,44,27,9,5,16,24,-11.0
1,22022,1610612763,MEM,Memphis Grizzlies,22201226,2023-04-09,MEM @ OKC,L,241,100,...,0.722,11,32,43,25,8,4,12,16,-15.0
2,22022,1610612743,DEN,Denver Nuggets,22201227,2023-04-09,DEN vs. SAC,W,240,109,...,0.72,15,36,51,25,11,2,16,15,14.0
3,22022,1610612748,MIA,Miami Heat,22201219,2023-04-09,MIA vs. ORL,W,241,123,...,0.75,7,37,44,30,10,3,18,20,13.0
4,22022,1610612752,NYK,New York Knicks,22201220,2023-04-09,NYK vs. IND,L,241,136,...,0.773,19,34,53,29,8,8,15,24,-5.0


In [5]:
def adjust_game_df(games_df):
    """Return a cleaned, date-sorted copy of a team-game log.

    The input frame is left untouched: all changes are made on a copy, so the
    function can be called repeatedly on the same raw frame with the same result.

    Args:
        games_df: DataFrame containing at least ``WL``, ``MATCHUP``,
            ``GAME_DATE`` and the box-score columns listed in ``stat_columns``.

    Returns:
        A new DataFrame with the added columns ``WIN`` (int, 1 when ``WL`` is
        ``'W'``, else 0) and ``HGA`` (float, 1.0 when ``MATCHUP`` contains
        ``'vs'``, else 0.0), the box-score columns cast to float, ``GAME_DATE``
        parsed to pandas datetimes, rows sorted by ``GAME_DATE`` and the index
        reset to 0..n-1.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    adjusted = games_df.copy()

    # Adding binary "WIN" column
    adjusted["WIN"] = (adjusted["WL"] == 'W').astype("int64")

    # Adding "HGA" (Home game advantage) column; matchups containing "vs" are
    # treated as home games, everything else as away games
    adjusted["HGA"] = adjusted["MATCHUP"].str.contains('vs', regex=False).astype(float)

    # Converting int stat columns to float type
    stat_columns = ['MIN','PTS','FGM','FGA','FG3M','FG3A','FTM','FTA','OREB','DREB','REB','AST','STL','BLK','TOV','PF']
    adjusted[stat_columns] = adjusted[stat_columns].astype(float)

    # Convert game date to pandas datetime
    adjusted["GAME_DATE"] = pd.to_datetime(adjusted["GAME_DATE"])

    # Sorting games by game_date
    return adjusted.sort_values(by='GAME_DATE').reset_index(drop=True)

# Clean both splits with the same routine; each name is rebound to the frame
# that adjust_game_df returns.
train_games_df, test_games_df = map(adjust_game_df, (train_games_df, test_games_df))

train_games_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,WIN,HGA
0,22022,1610612738,BOS,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,240.0,126.0,...,30.0,36.0,24.0,8.0,3.0,10.0,24.0,9.0,1,1.0
1,22022,1610612755,PHI,Philadelphia 76ers,0022200001,2022-10-18,PHI @ BOS,L,239.0,117.0,...,27.0,31.0,16.0,8.0,3.0,14.0,25.0,-9.0,0,0.0
2,22022,1610612747,LAL,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,241.0,109.0,...,39.0,48.0,23.0,12.0,4.0,21.0,18.0,-14.0,0,0.0
3,22022,1610612744,GSW,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,241.0,123.0,...,37.0,48.0,31.0,11.0,4.0,18.0,23.0,14.0,1,1.0
4,22022,1610612758,SAC,Sacramento Kings,0022200014,2022-10-19,SAC vs. POR,L,241.0,108.0,...,37.0,41.0,27.0,8.0,5.0,15.0,25.0,-7.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,22023,1610612751,BKN,Brooklyn Nets,0022301192,2024-04-14,BKN @ PHI,L,240.0,86.0,...,38.0,42.0,19.0,8.0,6.0,12.0,14.0,-21.0,0,0.0
4916,22023,1610612759,SAS,San Antonio Spurs,0022301197,2024-04-14,SAS vs. DET,W,241.0,123.0,...,42.0,54.0,33.0,9.0,3.0,13.0,17.0,28.0,1,1.0
4917,22023,1610612752,NYK,New York Knicks,0022301190,2024-04-14,NYK vs. CHI,W,265.0,120.0,...,37.0,53.0,27.0,7.0,6.0,21.0,17.0,1.0,1,1.0
4918,22023,1610612745,HOU,Houston Rockets,0022301199,2024-04-14,HOU @ LAC,W,240.0,116.0,...,42.0,59.0,31.0,7.0,8.0,18.0,13.0,11.0,1,0.0


In [6]:
# Function that returns label and list of features for specific game in this order:
#  label, [win_rate_diff, HGA, avg_points_diff, fg3_percentage_diff, avg_blocks_diff]
def extract_features(game, all_games, n_matches = 10):
    """Build the label and feature vector for one team-game row.

    Every feature compares the team's last ``n_matches`` games with the
    opponent's last ``n_matches`` games, using only games dated strictly before
    ``game``.

    Args:
        game: Row (mapping/Series) with ``TEAM_ID``, ``GAME_DATE``, ``HGA``,
            ``WIN`` and whatever ``get_opponent_id`` reads from it.
        all_games: DataFrame of team-game rows with ``TEAM_ID``, ``GAME_DATE``,
            ``WIN``, ``PTS``, ``FG3M``, ``FG3A`` and ``BLK``. It does not need
            to be sorted.
        n_matches: Number of previous games required for each side.

    Returns:
        ``None`` when either side has fewer than ``n_matches`` earlier games,
        otherwise ``(label, features)`` where ``label`` is ``game["WIN"]`` and
        ``features`` is ``[win_rate_diff, home_advantage, avg_points_diff,
        fg3_percentage_diff, avg_blocks_diff]``.
    """
    team_id = game["TEAM_ID"]
    opponent_id = get_opponent_id(game)
    game_date = game["GAME_DATE"]

    # Only games strictly before this one are eligible, so no feature can see
    # the result it is supposed to predict.
    earlier_games = all_games[all_games['GAME_DATE'] < game_date]

    def last_n_games(side_id):
        # Sort explicitly: tail() alone only yields the most recent games when
        # the caller happens to pass a frame that is already ordered by date.
        side_games = earlier_games[earlier_games['TEAM_ID'] == side_id]
        return side_games.sort_values('GAME_DATE').tail(n_matches)

    # Find last n games played by both teams
    team_recent_games = last_n_games(team_id)
    opponent_recent_games = last_n_games(opponent_id)

    # If there isn't enough information skip
    if len(team_recent_games) < n_matches or len(opponent_recent_games) < n_matches:
        return None

    # FEATURE 1 - Win rate difference
    team_win_rate = team_recent_games["WIN"].mean()
    opponent_win_rate = opponent_recent_games["WIN"].mean()

    win_rate_diff = team_win_rate - opponent_win_rate

    # FEATURE 2 - Home advantage
    home_advantage = game["HGA"]

    # FEATURE 3 - Average points difference
    team_avg_points = team_recent_games["PTS"].mean()
    opponent_avg_points = opponent_recent_games["PTS"].mean()

    avg_points_diff = team_avg_points - opponent_avg_points

    # FEATURE 4 - Three point percentage difference
    # (made / attempted over the whole window, not a mean of per-game percentages)
    team_fg3_percentage = team_recent_games["FG3M"].sum() / team_recent_games["FG3A"].sum()
    opponent_fg3_percentage = opponent_recent_games["FG3M"].sum() / opponent_recent_games["FG3A"].sum()

    fg3_percentage_diff = team_fg3_percentage - opponent_fg3_percentage

    # FEATURE 5 - Average blocks made difference
    team_avg_blocks = team_recent_games["BLK"].mean()
    opponent_avg_blocks = opponent_recent_games["BLK"].mean()

    avg_blocks_diff = team_avg_blocks - opponent_avg_blocks

    # TODO: maybe add head-to-head history between the two teams or other
    #       features, and compare these features against alternatives.
    # TODO: difference in days since each side's last game (teams with fewer
    #       rest days tend to play worse).
    # TODO: brainstorm further candidate features, add them and re-check the
    #       accuracy of the baseline model below.

    features = [win_rate_diff, home_advantage, avg_points_diff, fg3_percentage_diff, avg_blocks_diff]

    # LABEL
    label = game["WIN"]

    return label, features

# Function that returns opponent team id
def get_opponent_id(game):
    """Look up the id of the team named last in ``game["MATCHUP"]``.

    The matchup text is split on ``' vs. '`` when it contains ``'vs'`` and on
    ``' @ '`` otherwise; the trailing piece is taken as the opponent's
    abbreviation and resolved with ``teams.find_team_by_abbreviation``.

    Args:
        game: Mapping/row exposing a ``"MATCHUP"`` string.

    Returns:
        The ``"id"`` entry of the team record returned by the lookup.
    """
    matchup_text = game["MATCHUP"]
    separator = ' vs. ' if 'vs' in matchup_text else ' @ '
    opponent_code = matchup_text.split(separator)[-1]
    return teams.find_team_by_abbreviation(opponent_code)["id"]

# Function that creates and returns dataset X and labels y
def create_datasets(games_df: pd.DataFrame, n_matches = 10):
    """Turn a team-game log into a feature matrix and a label vector.

    ``extract_features`` is called for every row with ``games_df`` itself as
    the game history; rows for which it returns ``None`` are counted as
    skipped and left out of the result.

    Args:
        games_df: Team-game rows; also used as the history for each row.
        n_matches: Number of previous games required per side, forwarded to
            ``extract_features``.

    Returns:
        ``(X, y)`` as numpy arrays built from the collected feature lists and
        labels, in row order.
    """
    X = []
    y = []

    skipped = 0
    num_of_games = len(games_df)

    # Progress is counted by row position rather than by index label, so it
    # works for any index and is reported for skipped rows as well.
    for processed, (_, game) in enumerate(games_df.iterrows(), start=1):
        result = extract_features(game, games_df, n_matches)

        if result is None:
            skipped += 1
        else:
            label, features = result
            X.append(features)
            y.append(label)

        if processed % 500 == 0:
            print(f"Processed {processed}/{num_of_games} games...")

    print(f"Created dataset with {num_of_games - skipped} games.")
    print(f"Skipped {skipped} games (games with less than {n_matches} previous matches).\n")

    return np.array(X), np.array(y)

# Each split is featurised from its own games only (create_datasets uses the
# frame it receives as the game history), with the default n_matches.
(X_train, y_train), (X_test, y_test) = (
    create_datasets(split_df) for split_df in (train_games_df, test_games_df)
)

Processed 500/4920 games...
Processed 1000/4920 games...
Processed 1500/4920 games...
Processed 2000/4920 games...
Processed 2500/4920 games...
Processed 3000/4920 games...
Processed 3500/4920 games...
Processed 4000/4920 games...
Processed 4500/4920 games...
Created dataset with 4606 games.
Skipped 314 games (games with less than 10 previous matches).

Processed 500/2460 games...
Processed 1000/2460 games...
Processed 1500/2460 games...
Processed 2000/2460 games...
Created dataset with 2150 games.
Skipped 310 games (games with less than 10 previous matches).



In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Standardise features: the scaling statistics are learned on the training
# split only and then reused unchanged for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline model: logistic regression on the scaled features
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions plus the second predict_proba column, which the table
# below reports as the winning probability
predictions_lr = log_reg.predict(X_test_scaled)
prediction_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

print(f"LR Train Accuracy = {accuracy_score(y_train, log_reg.predict(X_train_scaled))}")
print(f"LR Test Accuracy  = {accuracy_score(y_test, predictions_lr)}")

# Side-by-side view of true label, predicted label and predicted probability
df_lr = pd.DataFrame(
    dict(
        y_test=y_test,
        predicted=predictions_lr,
        winning_probability=prediction_prob_lr,
    )
)
display(df_lr.head())

# MLP

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, log_loss

def _get_activation(name: str) -> nn.Module:
    """Create a new activation module from its case-insensitive name.

    Args:
        name: One of ``relu``, ``leakyrelu``, ``tanh`` or ``sigmoid`` in any
            letter case.

    Returns:
        A freshly constructed module; ``leakyrelu`` is built with a negative
        slope of 0.01.

    Raises:
        ValueError: If the lower-cased name is not one of the supported ones.
    """
    factories = {
        "relu": nn.ReLU,
        "leakyrelu": lambda: nn.LeakyReLU(0.01),
        "tanh": nn.Tanh,
        "sigmoid": nn.Sigmoid,
    }
    key = name.lower()
    if key not in factories:
        raise ValueError(f"Nepoznata aktivacija: {key} (probaj: relu, leakyrelu, tanh, sigmoid)")
    return factories[key]()

def _get_optimizer(name: str, params, lr: float, weight_decay: float):
    """Create a torch optimizer from its case-insensitive name.

    Args:
        name: ``adam``, ``sgd`` or ``adamw`` in any letter case.
        params: Parameters to optimise, passed straight to the optimizer.
        lr: Learning rate.
        weight_decay: Weight-decay coefficient.

    Returns:
        ``torch.optim.Adam``, ``torch.optim.SGD`` (with momentum 0.9) or
        ``torch.optim.AdamW`` configured with ``lr`` and ``weight_decay``.

    Raises:
        ValueError: If the lower-cased name is not one of the supported ones.
    """
    key = name.lower()
    shared = dict(lr=lr, weight_decay=weight_decay)

    if key == "sgd":
        return torch.optim.SGD(params, momentum=0.9, **shared)

    adam_family = {"adam": torch.optim.Adam, "adamw": torch.optim.AdamW}
    if key in adam_family:
        return adam_family[key](params, **shared)

    raise ValueError(f"Nepoznat optimizer: {key} (probaj: adam, sgd, adamw)")

class MLP(nn.Module):
    """Fully connected network that outputs a single raw logit per sample.

    The network is ``Linear -> activation [-> Dropout]`` repeated for every
    entry of ``hidden_layers``, followed by a final ``Linear(prev, 1)``. No
    sigmoid is applied here.

    Args:
        input_dim: Number of input features.
        hidden_layers: Iterable with the width of each hidden layer.
        activation: Activation name understood by ``_get_activation``.
        dropout: Dropout probability; a falsy or non-positive value adds no
            dropout layers.
    """

    def __init__(self, input_dim: int, hidden_layers=(32, 16), activation="relu", dropout=0.2):
        super().__init__()
        # Resolve the name once up front so an unknown activation fails
        # immediately, even when hidden_layers is empty.
        _get_activation(activation)
        layers = []
        prev = input_dim

        for h in hidden_layers:
            layers.append(nn.Linear(prev, h))
            # Ask the factory for a fresh module per layer. Re-instantiating
            # via `act.__class__()` would rebuild the activation with default
            # constructor arguments and drop whatever the factory configured.
            layers.append(_get_activation(activation))
            if dropout and dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev = h

        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the logits."""
        return self.net(x)

def train_mlp(
    X_train_scaled, y_train,
    X_test_scaled,  y_test,
    hidden_layers=(32, 16),
    activation="relu",
    optimizer_name="adam",
    lr=1e-3,
    weight_decay=1e-4,
    dropout=0.2,
    batch_size=64,
    epochs=50,
    seed=0,
    threshold=0.5,
    verbose_every=10,
):
    """Train an ``MLP`` with binary cross-entropy and score it on the test split.

    Args:
        X_train_scaled, y_train: Training features and labels; ``y_train`` must
            support ``reshape`` and ``X_train_scaled`` must expose ``shape``.
        X_test_scaled, y_test: Test features and labels used for evaluation.
        hidden_layers, activation, dropout: Forwarded to ``MLP``.
        optimizer_name, lr, weight_decay: Forwarded to ``_get_optimizer``.
        batch_size: Mini-batch size of the shuffled training loader.
        epochs: Number of passes over the training data.
        seed: Seed applied to the torch and numpy global RNGs before training.
        threshold: Probability at or above which a sample is predicted as 1.
        verbose_every: Print the mean epoch loss every this many epochs; a
            falsy value disables printing.

    Returns:
        Dict with ``test_accuracy``, ``test_logloss``, ``prob`` (test
        probabilities), ``pred`` (thresholded predictions) and ``model``.
    """
    # Seed first so weight initialisation and batch shuffling are repeatable
    torch.manual_seed(seed)
    np.random.seed(seed)

    train_inputs = torch.tensor(X_train_scaled, dtype=torch.float32)
    # Targets become a column so they line up with the (batch, 1) logits
    train_targets = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
    test_inputs = torch.tensor(X_test_scaled, dtype=torch.float32)

    train_loader = DataLoader(
        TensorDataset(train_inputs, train_targets),
        batch_size=batch_size,
        shuffle=True,
    )

    network = MLP(
        input_dim=X_train_scaled.shape[1],
        hidden_layers=hidden_layers,
        activation=activation,
        dropout=dropout,
    )

    # The model emits raw logits; this loss applies the sigmoid internally
    loss_fn = nn.BCEWithLogitsLoss()
    optim = _get_optimizer(optimizer_name, network.parameters(), lr=lr, weight_decay=weight_decay)

    network.train()
    for epoch_no in range(1, epochs + 1):
        loss_sum = 0.0
        for batch_x, batch_y in train_loader:
            optim.zero_grad()
            batch_loss = loss_fn(network(batch_x), batch_y)
            batch_loss.backward()
            optim.step()
            # Weight by batch size so the epoch figure is a per-sample mean
            loss_sum += batch_loss.item() * batch_x.size(0)

        if verbose_every and epoch_no % verbose_every == 0:
            print(f"Epoch {epoch_no}/{epochs} - loss: {loss_sum/len(train_loader.dataset):.4f}")

    network.eval()
    with torch.no_grad():
        test_logits = network(test_inputs)
        prob = torch.sigmoid(test_logits).cpu().numpy().reshape(-1)
        pred = (prob >= threshold).astype(int)

    return {
        "test_accuracy": accuracy_score(y_test, pred),
        "test_logloss": log_loss(y_test, prob, labels=[0, 1]),
        "prob": prob,
        "pred": pred,
        "model": network,
    }


LR Train Accuracy = 0.611376465479809
LR Test Accuracy  = 0.6246511627906977


Unnamed: 0,y_test,predicted,winning_probability
0,0,0,0.413945
1,1,1,0.586055
2,0,0,0.495544
3,1,1,0.504456
4,0,0,0.365512


In [10]:
# Experiment 1: two hidden layers (32, 16) with ReLU and dropout 0.2, Adam, 80 epochs
res1 = train_mlp(
    X_train_scaled, y_train, X_test_scaled, y_test,
    hidden_layers=(32,16), activation="relu", dropout=0.2,
    optimizer_name="adam", lr=1e-3, weight_decay=1e-4,
    batch_size=64, epochs=80,
)
print(res1["test_accuracy"], res1["test_logloss"])


Epoch 10/80 - loss: 0.6588
Epoch 20/80 - loss: 0.6567
Epoch 30/80 - loss: 0.6532
Epoch 40/80 - loss: 0.6572
Epoch 50/80 - loss: 0.6536
Epoch 60/80 - loss: 0.6540
Epoch 70/80 - loss: 0.6553
Epoch 80/80 - loss: 0.6517
0.6241860465116279 0.6452909805953095


In [11]:
# Experiment 2: one hidden layer (16) with tanh and no dropout, AdamW, 120 epochs
res2 = train_mlp(
    X_train_scaled, y_train, X_test_scaled, y_test,
    hidden_layers=(16,), activation="tanh", dropout=0.0,
    optimizer_name="adamw", lr=3e-4, weight_decay=1e-4,
    batch_size=128, epochs=120,
)
print(res2["test_accuracy"], res2["test_logloss"])


Epoch 10/120 - loss: 0.6557
Epoch 20/120 - loss: 0.6542
Epoch 30/120 - loss: 0.6538
Epoch 40/120 - loss: 0.6537
Epoch 50/120 - loss: 0.6535
Epoch 60/120 - loss: 0.6534
Epoch 70/120 - loss: 0.6533
Epoch 80/120 - loss: 0.6532
Epoch 90/120 - loss: 0.6531
Epoch 100/120 - loss: 0.6530
Epoch 110/120 - loss: 0.6529
Epoch 120/120 - loss: 0.6528
0.6302325581395349 0.6447623271121387


In [13]:
# Experiment 3: one hidden layer (16) with ReLU and no dropout, Adam, 200 epochs
res = train_mlp(
    X_train_scaled, y_train, X_test_scaled, y_test,
    hidden_layers=(16,), activation="relu", dropout=0.0,
    optimizer_name="adam", lr=1e-3, weight_decay=1e-4,
    batch_size=128, epochs=200,
)

print(res["test_accuracy"], res["test_logloss"])

Epoch 10/200 - loss: 0.6541
Epoch 20/200 - loss: 0.6533
Epoch 30/200 - loss: 0.6528
Epoch 40/200 - loss: 0.6525
Epoch 50/200 - loss: 0.6523
Epoch 60/200 - loss: 0.6521
Epoch 70/200 - loss: 0.6519
Epoch 80/200 - loss: 0.6516
Epoch 90/200 - loss: 0.6515
Epoch 100/200 - loss: 0.6515
Epoch 110/200 - loss: 0.6512
Epoch 120/200 - loss: 0.6510
Epoch 130/200 - loss: 0.6509
Epoch 140/200 - loss: 0.6506
Epoch 150/200 - loss: 0.6507
Epoch 160/200 - loss: 0.6509
Epoch 170/200 - loss: 0.6505
Epoch 180/200 - loss: 0.6505
Epoch 190/200 - loss: 0.6502
Epoch 200/200 - loss: 0.6502
0.6302325581395349 0.6442378125740446


In [15]:
# Experiment 4: two hidden layers (32, 16) with ReLU and no dropout, SGD, 300 epochs
res = train_mlp(
    X_train_scaled, y_train, X_test_scaled, y_test,
    hidden_layers=(32, 16), activation="relu", dropout=0.0,
    optimizer_name="sgd", lr=5e-3, weight_decay=1e-4,
    batch_size=128, epochs=300,
)
print(res["test_accuracy"], res["test_logloss"])

Epoch 10/300 - loss: 0.6563
Epoch 20/300 - loss: 0.6542
Epoch 30/300 - loss: 0.6537
Epoch 40/300 - loss: 0.6532
Epoch 50/300 - loss: 0.6529
Epoch 60/300 - loss: 0.6526
Epoch 70/300 - loss: 0.6524
Epoch 80/300 - loss: 0.6525
Epoch 90/300 - loss: 0.6519
Epoch 100/300 - loss: 0.6517
Epoch 110/300 - loss: 0.6516
Epoch 120/300 - loss: 0.6513
Epoch 130/300 - loss: 0.6512
Epoch 140/300 - loss: 0.6508
Epoch 150/300 - loss: 0.6505
Epoch 160/300 - loss: 0.6503
Epoch 170/300 - loss: 0.6499
Epoch 180/300 - loss: 0.6498
Epoch 190/300 - loss: 0.6495
Epoch 200/300 - loss: 0.6492
Epoch 210/300 - loss: 0.6494
Epoch 220/300 - loss: 0.6491
Epoch 230/300 - loss: 0.6487
Epoch 240/300 - loss: 0.6486
Epoch 250/300 - loss: 0.6486
Epoch 260/300 - loss: 0.6479
Epoch 270/300 - loss: 0.6477
Epoch 280/300 - loss: 0.6475
Epoch 290/300 - loss: 0.6472
Epoch 300/300 - loss: 0.6471
0.6176744186046511 0.6458406383136627
