<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/pow_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import duckdb
import wget
import os

# Download CSV files
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/nba-all-stars.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/nba-mvp.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/all-nba-first-team.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/all-nba-second-team.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/all-nba-third-team.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/player-of-the-week.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/player-statistics.csv')
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/games.csv')

'games.csv'

In [9]:
games_df = pd.read_csv('games.csv')
games_df

  games_df = pd.read_csv('games.csv')


Unnamed: 0,gameId,gameDate,hometeamCity,hometeamName,hometeamId,awayteamCity,awayteamName,awayteamId,homeScore,awayScore,winner,gameType,attendance,arenaId,gameLabel,gameSubLabel,seriesGameNumber
0,22500231,2025-11-15T20:00:00Z,Milwaukee,Bucks,1610612749,Los Angeles,Lakers,1610612747,95,119,1610612747,,17341.0,,,,
1,22500232,2025-11-15T20:00:00Z,Minnesota,Timberwolves,1610612750,Denver,Nuggets,1610612743,112,123,1610612743,,,,,,
2,22500230,2025-11-15T19:00:00Z,Indiana,Pacers,1610612754,Toronto,Raptors,1610612761,111,129,1610612761,,17274.0,,,,
3,22500229,2025-11-15T19:00:00Z,Charlotte,Hornets,1610612766,Oklahoma City,Thunder,1610612760,96,109,1610612760,,19469.0,,,,
4,22500228,2025-11-15T17:00:00Z,Cleveland,Cavaliers,1610612739,Memphis,Grizzlies,1610612763,108,100,1610612739,,19432.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72136,24600083,1946-12-08 19:00:00,New York,Knicks,1610612752,Boston,Celtics,1610612738,62,44,1610612752,Regular Season,,0.0,,,
72137,24600078,1946-12-07 19:00:00,Boston,Celtics,1610612738,New York,Knicks,1610612752,65,90,1610612752,Regular Season,,0.0,,,
72138,24600076,1946-12-05 19:00:00,Philadelphia,Warriors,1610612744,New York,Knicks,1610612752,62,51,1610612744,Regular Season,,0.0,,,
72139,24600063,1946-11-30 19:00:00,New York,Knicks,1610612752,Philadelphia,Warriors,1610612744,64,60,1610612752,Regular Season,,0.0,,,


In [10]:
def build_team_games(df, filter=None):
    df = df.copy()

    # Parse datetime with mixed formats
    df['gamedate'] = pd.to_datetime(
        df['gamedate'],
        utc=True,
        errors='coerce',
        format='mixed',
        dayfirst=True,
    )

    if filter:
        df = df.query(filter)

    # Normalize: make two rows per game (home + away)
    home = df.rename(columns={
        'hometeamname': 'team',
        'hometeamid':   'teamid',
        'homescore':    'team_score',
        'awayscore':    'opp_score'
    })[['gameid','gamedate','team','teamid','team_score','opp_score','winner',
        'awayteamname','awayteamid']].assign(
            home = 1,
            opponent  = lambda x: x['awayteamname'],
            opponentid= lambda x: x['awayteamid'],
        ).drop(columns=['awayteamname','awayteamid'])

    away = df.rename(columns={
        'awayteamname': 'team',
        'awayteamid':   'teamid',
        'awayscore':    'team_score',
        'homescore':    'opp_score'
    })[['gameid','gamedate','team','teamid','team_score','opp_score','winner',
        'hometeamname','hometeamid']].assign(
            home = 0,
            opponent  = lambda x: x['hometeamname'],
            opponentid= lambda x: x['hometeamid'],
        ).drop(columns=['hometeamname','hometeamid'])

    long = pd.concat([home, away], ignore_index=True)

    # Outcome flags
    long['is_win']  = (long['teamid'] == long['winner']).astype(int)
    long['outcome'] = long['is_win'].map({1: 'win', 0: 'loss'})
    long['is_home_win'] = ((long['home'] == 1) & (long['is_win'] == 1)).astype(int)
    long['is_away_win'] = ((long['home'] == 0) & (long['is_win'] == 1)).astype(int)

    # Season (season starts in July)
    year = long['gamedate'].dt.year
    month = long['gamedate'].dt.month
    long['season'] = np.where(month >= 7, year, year - 1)

    # Sort for rolling calculations
    long = long.sort_values(['teamid', 'season', 'gamedate', 'gameid'], kind='mergesort')

    # Group by team + season for all season-based stats
    g = long.groupby(['teamid', 'season'], group_keys=False)

    # Games played prior (per season)
    long['games_prior'] = g.cumcount()

    # Wins / losses prior (per season)
    long['wins_prior'] = g['is_win'].transform(
        lambda s: s.shift().fillna(0).cumsum().astype(int)
    )
    long['losses_prior'] = long['games_prior'] - long['wins_prior']

    # Home / away games prior (per season)
    long['home_games_prior'] = g['home'].transform(
        lambda s: s.shift(fill_value=0).cumsum()
    )
    long['away_games_prior'] = long['games_prior'] - long['home_games_prior']

    # Home / away wins prior (per season)
    long['home_wins_prior'] = g['is_home_win'].transform(
        lambda s: s.shift().fillna(0).cumsum().astype(int)
    )
    long['away_wins_prior'] = g['is_away_win'].transform(
        lambda s: s.shift().fillna(0).cumsum().astype(int)
    )

    # Home / away losses prior (per season)
    long['home_losses_prior'] = long['home_games_prior'] - long['home_wins_prior']
    long['away_losses_prior'] = long['away_games_prior'] - long['away_wins_prior']

    # Win streaks prior (per season)
    def streak_prior(series):
        prior = series.shift().fillna(0).astype(int)
        # reset when we see a 0
        return prior.groupby((prior == 0).cumsum()).cumsum().astype(int)

    long['win_streak_prior'] = g['is_win'].transform(streak_prior)
    long['home_win_streak_prior'] = g['is_home_win'].transform(streak_prior)
    long['away_win_streak_prior'] = g['is_away_win'].transform(streak_prior)

    # Record strings (per season)
    long['record_prior'] = long['wins_prior'].astype(str) + '-' + long['losses_prior'].astype(str)
    long['home_record_prior'] = long['home_wins_prior'].astype(str) + '-' + long['home_losses_prior'].astype(str)
    long['away_record_prior'] = long['away_wins_prior'].astype(str) + '-' + long['away_losses_prior'].astype(str)

    # Opponent's prior record for same game (still season-based because wins_prior/losses_prior are)
    opp_prior = long[['gameid','teamid','wins_prior','losses_prior']].rename(
        columns={
            'teamid': 'opponentid',
            'wins_prior': 'opp_wins_prior',
            'losses_prior': 'opp_losses_prior'
        }
    )

    long = long.merge(opp_prior, on=['gameid','opponentid'], how='left')

    # Opponent winrate prior to the game
    opp_games_prior = long['opp_wins_prior'] + long['opp_losses_prior']
    long['opp_winrate_prior'] = long['opp_wins_prior'] / opp_games_prior.where(opp_games_prior > 0)

    # Flag for wins vs > .500 opponent (per game)
    long['is_win_vs_over_500'] = (
        (long['is_win'] == 1) &
        (long['opp_winrate_prior'] > 0.5)
    ).astype(int)

    # Per-team, per-season cumulative PRIOR wins vs > .500
    long['wins_vs_over_500_prior'] = (
        long.groupby(['teamid', 'season'])['is_win_vs_over_500']
            .transform(lambda s: s.shift().fillna(0).cumsum().astype(int))
    )

    # Week-based (Mon–Sun) record PRIOR, per-season
    iso = long['gamedate'].dt.isocalendar()
    long['week_year'] = iso['year']
    long['week_num']  = iso['week']

    gw = long.groupby(['teamid','season','week_year','week_num'], group_keys=False)

    long['week_games_prior'] = gw.cumcount()
    long['week_wins_prior'] = gw['is_win'].transform(
        lambda s: s.shift().fillna(0).cumsum().astype(int)
    )
    long['week_losses_prior'] = long['week_games_prior'] - long['week_wins_prior']

    long['week_record_prior'] = (
        long['week_wins_prior'].astype(str)
        + '-' +
        long['week_losses_prior'].astype(str)
    )

    # Final output
    out = long[['gameid','gamedate',
                'team','teamid',
                'opponent','opponentid',
                'outcome','home',
                'team_score','opp_score',
                'games_prior','wins_prior','losses_prior','record_prior',
                'home_games_prior','home_wins_prior','home_losses_prior','home_record_prior',
                'away_games_prior','away_wins_prior','away_losses_prior','away_record_prior',
                'win_streak_prior','home_win_streak_prior','away_win_streak_prior',
                'opp_wins_prior','opp_losses_prior','opp_winrate_prior',
                'is_win_vs_over_500','wins_vs_over_500_prior',
                'week_games_prior','week_wins_prior','week_losses_prior','week_record_prior',
                'season'
               ]]
    out.loc[:, 'gameid'] = out['gameid'].astype(int)
    out = out.rename(columns={'gameid': 'gameId'})

    return out.sort_values(['gamedate','gameId','home']).reset_index(drop=True)

In [15]:
games_df.columns = games_df.columns.str.lower()

In [18]:
game = build_team_games(games_df, "gamedate.dt.year >= 1979")
game

Unnamed: 0,gameId,gamedate,team,teamid,opponent,opponentid,outcome,home,team_score,opp_score,...,opp_wins_prior,opp_losses_prior,opp_winrate_prior,is_win_vs_over_500,wins_vs_over_500_prior,week_games_prior,week_wins_prior,week_losses_prior,week_record_prior,season
0,27800397,1979-01-01 20:00:00+00:00,Warriors,1610612744,SuperSonics,1610612760,win,0,110,97,...,0,0,,0,0,0,0,0,0-0,1978
1,27800397,1979-01-01 20:00:00+00:00,SuperSonics,1610612760,Warriors,1610612744,loss,1,97,110,...,0,0,,0,0,0,0,0,0-0,1978
2,27800398,1979-01-02 20:00:00+00:00,Nuggets,1610612743,Kings,1610612758,win,0,99,97,...,0,0,,0,0,0,0,0,0-0,1978
3,27800398,1979-01-02 20:00:00+00:00,Kings,1610612758,Nuggets,1610612743,loss,1,97,99,...,0,0,,0,0,0,0,0,0-0,1978
4,27800399,1979-01-02 20:00:00+00:00,Bulls,1610612741,Bullets,1610612764,loss,0,86,109,...,0,0,,0,0,0,0,0,0-0,1978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115497,22500230,2025-11-15 19:00:00+00:00,Pacers,1610612754,Raptors,1610612761,loss,1,111,129,...,11,7,0.611111,0,3,2,0,2,0-2,2025
115498,22500231,2025-11-15 20:00:00+00:00,Lakers,1610612747,Bucks,1610612749,win,0,119,95,...,11,6,0.647059,1,2,3,2,1,2-1,2025
115499,22500231,2025-11-15 20:00:00+00:00,Bucks,1610612749,Lakers,1610612747,loss,1,95,119,...,10,9,0.526316,0,6,3,2,1,2-1,2025
115500,22500232,2025-11-15 20:00:00+00:00,Nuggets,1610612743,Timberwolves,1610612750,win,0,123,112,...,10,8,0.555556,1,4,2,2,0,2-0,2025


In [39]:
game.isnull().sum()

query = """
WITH CTE AS (
SELECT
season
,team
,MIN(CAST(gamedate AS DATE)) AS min_gamedate
,MAX(CAST(gamedate AS DATE)) AS max_gamedate
FROM game
WHERE opp_winrate_prior IS NULL
GROUP BY season, team
)
SELECT *,
MONTH(min_gamedate) AS min_gamedate_month
,MONTH(max_gamedate) AS max_gamedate_month
FROM CTE
WHERE 1=1
AND min_gamedate_month NOT IN (10,11)
AND gamdedate NOT BETWEEN '2020-07-22' AND '2020-07-28'


"""

duckdb.query(query).df()

# Based on this result, the null columns in game["opp_winrate_prior"]

BinderException: Binder Error: Referenced column "gamdedate" not found in FROM clause!
Candidate bindings: "min_gamedate", "max_gamedate", "team"

In [None]:
game.isnull().sum()

query = """

SELECT
season
,team
,MIN(CAST(gamedate AS DATE)) AS min_gamedate
,MAX(CAST(gamedate AS DATE)) AS max_gamedate
FROM game
WHERE opp_winrate_prior IS NULL
GROUP BY season, team
ORDER BY season DESC

"""

duckdb.query(query).df()

# Based on this result, the null columns in game["opp_winrate_prior"]