In [35]:
import sys
import os
sys.path.insert(0, os.path.abspath(".."))

In [None]:
from src.utils.db import get_nba_db, query, list_tables
import requests
import lxml
from typing import Optional
import cloudscraper
import pandas as pd
from lxml.html import HTMLParser
import re
import requests
from io import StringIO
from lxml import html

In [58]:
def get_csv_df(csv_name):
    url = f"https://storage.googleapis.com/nba_award_predictor/nba_data/{csv_name}"

    r = requests.get(url)
    r.raise_for_status()

    df = pd.read_csv(StringIO(r.text))
    return df

In [57]:
def strip_or_null(text: Optional[str]) -> Optional[str]:
    if text is not None:
        return text.strip() or None

def clean_txt(t: Optional[str]) -> Optional[str]:
    if t is None:
        return None
    t = t.strip()
    # repair common mojibake: UTF-8 misread as latin1
    if any(c in t for c in "ÃÂÄÅ"):
        try:
            t = t.encode("latin-1").decode("utf-8")
        except UnicodeError:
            pass
    return t

MOJI = re.compile(r"[ÃÂÄÅ]")  # markers that tell us mojibake happened

def fix_name(name: str) -> str:
    if not isinstance(name, str):
        return name
    s = name.strip()
    # Try standard mojibake repair
    if MOJI.search(s):
        try:
            s = s.encode("latin1").decode("utf-8")
        except Exception:
            pass  # fall through to dangling fix
    # Fix dangling lead byte "Ä" at the end (lost second byte \x87 => ć)
    if s.endswith("Ä"):
        s = s[:-1] + "ć"
    return s

def extract_weekly_winners(doc):
    def get_names(p, conf=None):
        if conf:
            conf_path = f'[contains(@data-desc, "{conf}")]'
        else:
            conf_path = ""
        names = [strip_or_null(t) for t in p.xpath(f".//a{conf_path}/text()") if strip_or_null(t)]
        return names
    
    def create_rows(out, season, month, date, names, c):
        tie = 0 if len(names) == 1 else 1
        for n in names:
            out.append({
                        "season": season,
                        "month": month,
                        "date": date,
                        "conference": c,
                        "player": n,
                        "tie": tie
                        })

    out = []
    # Each season block
    for group in doc.xpath("//div[@class='data_grid_group']"):
        season = strip_or_null(group.xpath("string(.//h3[1])"))
        if not season:
            continue
        # Each month box within the season group
        for box in group.xpath(".//div[contains(@class,'data_grid_box')]"):
            month = strip_or_null(box.xpath("string(.//div[@class='gridtitle'])"))
            if not month:
                continue
            for p in box.xpath(".//div/p[.//strong] | .//p[.//strong]"):
                date = strip_or_null(p.xpath("string(.//strong[1])"))
                if not date:
                    continue
                
                e_names = get_names(p, 'Eastern')
                if e_names:
                    create_rows(out, season, month, date, e_names, 'E')
                
                w_names = get_names(p, 'Western')
                if w_names:
                    create_rows(out, season, month, date, w_names, 'W')
                
                if not e_names and not w_names:
                    names = get_names(p)
                    create_rows(out, season, month, date, names, 'A')
    return out

MONTH_MAP = {
            "January": 1,
            "February": 2,
            "March": 3,
            "April": 4,
            "May": 5,
            "June": 6,
            "July": 7,
            "August": 8,
            "September": 9,
            "October": 10,
            "November": 11,
            "December": 12
           }
def get_player_of_week_df():
    def extract_year(row):
        if 6 < row['month'] <= 12:
            return int(row['season'][:4])
        else:
            return int(row['season'][:4]) + 1
    
    URL = "https://www.basketball-reference.com/awards/pow.html"
    scraper = cloudscraper.create_scraper(browser={'custom': 'Chrome/124'})  # mimics a browser
    r = scraper.get(URL, timeout=20)
    parser = HTMLParser(encoding="utf-8")
    doc = html.fromstring(r.content, parser=parser)

    out = extract_weekly_winners(doc)
    pow_df = pd.DataFrame(out)
    pow_df['month'] = pow_df['month'].apply(lambda x: MONTH_MAP[x])
    pow_df['day'] = pow_df['date'].str.split().str[-1].astype(int)
    pow_df['year'] = pow_df.apply(extract_year, axis=1)
    pow_df['firstname'] = pow_df['player'].str.split().str[0]
    pow_df['lastname'] = pow_df['player'].str.split().str[-1]

    return pow_df[['season', 'year', 'month', 'day', 'conference', 'player', 'firstname', 'lastname', 'tie']]

def build_team_games(df, filter=None):
    df = df.copy()

    # Parse datetime (your sample is ISO-8601 with Z)
    df['gamedate'] = pd.to_datetime(df['gamedate'], utc=True, errors='coerce')

    if filter:
        df = df.query(filter)

    # Normalize: make two rows per game (home row + away row), attach opponent info
    home = df.rename(columns={
        'hometeamname': 'team',
        'hometeamid':   'teamid',
        'homescore':    'team_score',
        'awayscore':    'opp_score'
    })[['gameid','gamedate','team','teamid','team_score','opp_score','winner',
        'awayteamname','awayteamid']].assign(
            home = 1,
            opponent  = lambda x: x['awayteamname'],
            opponentid= lambda x: x['awayteamid'],
        ).drop(columns=['awayteamname','awayteamid'])

    away = df.rename(columns={
        'awayteamname': 'team',
        'awayteamid':   'teamid',
        'awayscore':    'team_score',
        'homescore':    'opp_score'
    })[['gameid','gamedate','team','teamid','team_score','opp_score','winner',
        'hometeamname','hometeamid']].assign(
            home = 0,
            opponent  = lambda x: x['hometeamname'],
            opponentid= lambda x: x['hometeamid'],
        ).drop(columns=['hometeamname','hometeamid'])

    long = pd.concat([home, away], ignore_index=True)

    # Outcome flags
    long['is_win']  = (long['teamid'] == long['winner']).astype(int)
    long['outcome'] = long['is_win'].map({1: 'win', 0: 'loss'})

    # Flags for home/away and those wins
    long['is_home_win'] = ((long['home'] == 1) & (long['is_win'] == 1)).astype(int)
    long['is_away_win'] = ((long['home'] == 0) & (long['is_win'] == 1)).astype(int)

    # Sort for rolling calculations (stable sort keeps home before away if same keys)
    long = long.sort_values(['teamid', 'gamedate', 'gameid'], kind='mergesort')

    # Rolling-to-date stats (exclude current game via shift before cumsum)
    g = long.groupby('teamid', group_keys=False)

    # Games played prior
    long['games_prior'] = g.cumcount()

    # Wins prior (overall)
    long['wins_prior'] = g['is_win'].apply(lambda s: s.shift().fillna(0).cumsum().astype(int))
    long['losses_prior'] = long['games_prior'] - long['wins_prior']

    # Home/away games prior
    long['home_games_prior'] = g['home'].apply(lambda s: s.shift(fill_value=0).cumsum())
    long['away_games_prior'] = long['games_prior'] - long['home_games_prior']

    # Home/away wins prior
    long['home_wins_prior'] = g['is_home_win'].apply(lambda s: s.shift().fillna(0).cumsum().astype(int))
    long['away_wins_prior'] = g['is_away_win'].apply(lambda s: s.shift().fillna(0).cumsum().astype(int))

    # Home/away losses prior
    long['home_losses_prior'] = long['home_games_prior'] - long['home_wins_prior']
    long['away_losses_prior'] = long['away_games_prior'] - long['away_wins_prior']

    # Win streaks prior (overall + optional home/away)
    def streak_prior(series: pd.Series) -> pd.Series:
        # Count consecutive 1s in the prior games (current game excluded)
        prior = series.shift().fillna(0).astype(int)
        # Reset when we see a 0
        return prior.groupby((prior == 0).cumsum()).cumsum().astype(int)

    long['win_streak_prior'] = g['is_win'].transform(streak_prior)
    long['home_win_streak_prior'] = g['is_home_win'].transform(streak_prior)
    long['away_win_streak_prior'] = g['is_away_win'].transform(streak_prior)

    # record strings
    long['record_prior'] = long['wins_prior'].astype(str) + '-' + long['losses_prior'].astype(str)
    long['home_record_prior'] = long['home_wins_prior'].astype(str) + '-' + long['home_losses_prior'].astype(str)
    long['away_record_prior'] = long['away_wins_prior'].astype(str) + '-' + long['away_losses_prior'].astype(str)

    # 8) Final selection / ordering
    out = long[['gameid','gamedate',
                'team','teamid',
                'opponent','opponentid',
                'outcome','home',
                'team_score','opp_score',
                'games_prior','wins_prior','losses_prior','record_prior',
                'home_games_prior','home_wins_prior','home_losses_prior','home_record_prior',
                'away_games_prior','away_wins_prior','away_losses_prior','away_record_prior',
                'win_streak_prior','home_win_streak_prior','away_win_streak_prior']]

    return out.sort_values(['gamedate','gameid','home']).reset_index(drop=True)

SPECIAL_LASTNAMES = {
                    "Dončić": "Doncic",
                    "Şengün": "Sengun",
                    "Porziņģis": "Porzingis",
                    "Jokić": "Jokic",
                    "Vučević": "Vucevic",
                    "Dragić": "Dragic",
                    "Peković": "Pekovic",
                    "Bogdanović": "Bogdanovic",
                    "Schröder": "Schroder",
                    "Vásquez": "Vasquez",
                    "İlyasova": "Ilyasova",
                    "Ginóbili": "Ginobili",
                    "Türkoğlu": "Turkoglu",
                    "Stojaković": "Stojakovic",
                    "Petrović": "Petrovic",
                    "Peace": "World Peace"
                    }

SPECIAL_FIRSTNAMES = {
                     "Dražen": "Drazen",
                     "Fat": "Lafayette",
                     "J.R.": "JR",
                     "Cliff": "Clifford",
                     "Hot": "John",
                     "Tiny": "Nate",
                     }

def remove_special_names(row):
    if row['lastname'] == "Jr.":
        row['lastname'] = row['player'].split()[-2]
    if row['player'] == "Steve Smith":
        row['firstname'] = "Steven"
    if row['player'] == 'Billy Ray Bates':
        row['firstname'] = "Billyray"
    if row['player'] == "Joe Barry Carroll":
        row['firstname'] = "Joe Barry"
    if row['player'] == "Michael Ray Richardson":
        row['firstname'] = "Micheal Ray"
    if row['player'] == "Cliff Robinson":
        row['firstname'] = "Clifford"
    row['firstname'] = SPECIAL_FIRSTNAMES.get(row['firstname'], row['firstname'])
    row['lastname'] = SPECIAL_LASTNAMES.get(row['lastname'], row['lastname'])
    row['player'] = row['firstname'] + " " + row['lastname']
    return row
    
    


In [40]:
game = query("""SELECT * FROM games""")
game['year'] = game['gamedate'].apply(lambda x: int(x[:4]))
game = game[game['year'] >= 1979]
l1 = len(game)
game = build_team_games(game)
print(f"Original row count: {l1} \nNew row count: {len(game)} \nCorrect output count: {l1 * 2 == len(game)}")

Original row count: 57734 
New row count: 115468 
Correct output count: True


In [41]:
game.head()

Unnamed: 0,gameid,gamedate,team,teamid,opponent,opponentid,outcome,home,team_score,opp_score,...,home_wins_prior,home_losses_prior,home_record_prior,away_games_prior,away_wins_prior,away_losses_prior,away_record_prior,win_streak_prior,home_win_streak_prior,away_win_streak_prior
0,12500008,2025-10-02 12:00:00+00:00,76ers,1610612755,Knicks,1610612752,loss,0,84,99,...,0,0,0-0,0,0,0,0-0,0,0,0
1,12500008,2025-10-02 12:00:00+00:00,Knicks,1610612752,76ers,1610612755,win,1,99,84,...,0,0,0-0,0,0,0,0-0,0,0,0
2,12500009,2025-10-03 05:30:00+00:00,United,15016,Pelicans,1610612740,loss,0,97,107,...,0,0,0-0,0,0,0,0-0,0,0,0
3,12500009,2025-10-03 05:30:00+00:00,Pelicans,1610612740,United,15016,win,1,107,97,...,0,0,0-0,0,0,0,0-0,0,0,0
4,12500001,2025-10-03 22:00:00+00:00,Suns,1610612756,Lakers,1610612747,win,0,103,81,...,0,0,0-0,0,0,0,0-0,0,0,0


In [42]:
player_stats_df = get_csv_df('player-statistics.csv')

  df = pd.read_csv(StringIO(r.text))


In [43]:
player_stats_df.head()

Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Chris,Paul,Chris Paul,101108,22500224,2025-11-12T22:30:00Z,LA,Clippers,Denver,Nuggets,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Kris,Dunn,Kris Dunn,1627739,22500224,2025-11-12T22:30:00Z,LA,Clippers,Denver,Nuggets,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,5.0,1.0,-8.0
2,Jamal,Murray,Jamal Murray,1627750,22500224,2025-11-12T22:30:00Z,Denver,Nuggets,LA,Clippers,...,0.2,4.0,4.0,1.0,4.0,1.0,5.0,0.0,3.0,11.0
3,Ivica,Zubac,Ivica Zubac,1627826,22500224,2025-11-12T22:30:00Z,LA,Clippers,Denver,Nuggets,...,0.0,0.0,0.0,0.0,3.0,6.0,9.0,2.0,2.0,-10.0
4,Derrick,Jones Jr.,Derrick Jones Jr.,1627884,22500224,2025-11-12T22:30:00Z,LA,Clippers,Denver,Nuggets,...,0.0,0.0,0.0,0.0,2.0,2.0,4.0,2.0,2.0,-13.0


In [45]:
player_of_week_df_2 = get_csv_df('player-of-the-week.csv')

In [47]:
player_of_week_rf = player_of_week_df_2[['player_id', 'player']]

In [48]:
player_of_week_rf.head()

Unnamed: 0,player_id,player
0,1630178,Tyrese Maxey
1,1628983,Shai Gilgeous-Alexander
2,1641705,Victor Wembanyama
3,203507,Giannis Antetokounmpo
4,201935,James Harden


In [54]:
pow_df = get_player_of_week_df()
pow_df = pow_df.apply(remove_special_names, axis=1)

In [55]:
pow_df.head()

Unnamed: 0,season,year,month,day,conference,player,firstname,lastname,tie
0,2025-26,2025,10,26,E,Giannis Antetokounmpo,Giannis,Antetokounmpo,0
1,2025-26,2025,10,26,W,Victor Wembanyama,Victor,Wembanyama,0
2,2025-26,2025,11,2,E,Tyrese Maxey,Tyrese,Maxey,0
3,2025-26,2025,11,2,W,Shai Gilgeous-Alexander,Shai,Gilgeous-Alexander,0
4,2025-26,2025,11,9,E,Cade Cunningham,Cade,Cunningham,0


In [578]:
pow_df = get_player_of_week_df()
pow_df = pow_df.apply(remove_special_names, axis=1)
l1 = len(pow_df)

players = query("""SELECT * FROM players""")
player_id = players[['personid', 'firstname', 'lastname']]
pow_df_2 = pd.merge(pow_df, player_id, on=['firstname', 'lastname'], how='inner')
print(f"Successfully added player IDs: {len(pow_df) == l1}")


"""
unique_winners_id = pow_df['personid'].unique()
winners_count = len(unique_winners)
"""
#player_stats = query("""SELECT * FROM playerstatistics""")
"""
winners_stats = player_stats[player_stats['personid'].isin(unique_winners_id)]
print(f"Sucessfully filtered winners stats: {winners_stats['personid'].nunique == winners_count}")
if winners_stats['personid'].nunique != winners_count:
    set1 = set(winners_stats['player'].unique())
    set2 = set(pow_df['player'].unique())
    print(f"Missing players: {list(set2 - set1)}")

l1 = len(winners_stats)
winners_stats = pd.merge(winners_stats, game, on="gameid", how="inner")
print(f"Successfully added team record features: {len(winners_stats) == l1}")
"""





Successfully added player IDs: True


'\nwinners_stats = player_stats[player_stats[\'personid\'].isin(unique_winners_id)]\nprint(f"Sucessfully filtered winners stats: {winners_stats[\'personid\'].nunique == winners_count}")\nif winners_stats[\'personid\'].nunique != winners_count:\n    set1 = set(winners_stats[\'player\'].unique())\n    set2 = set(pow_df[\'player\'].unique())\n    print(f"Missing players: {list(set2 - set1)}")\n\nl1 = len(winners_stats)\nwinners_stats = pd.merge(winners_stats, game, on="gameid", how="inner")\nprint(f"Successfully added team record features: {len(winners_stats) == l1}")\n'

In [574]:
unique_players = pow_df_2['player'].unique()

for p in unique_players:
    c = pow_df_2[pow_df_2['player'] == p]['personid'].nunique()
    if c > 1:
        print(p)

Luka Doncic
Bojan Bogdanovic
Mike James
Glen Rice
Steven Smith
Loy Vaught
Patrick Ewing
Larry Johnson
Scott Skiles
Eddie Johnson
Mike Dunleavy


In [None]:
pow_df = get_player_of_week_df()
pow_df = pow_df.apply(remove_special_names, axis=1)
player_id_unique = (players
    .sort_values('personid')
    .drop_duplicates(subset=['firstname','lastname'], keep='first')
    [['personid','firstname','lastname']])
l1 = len(player_id_unique)

players = query("""SELECT * FROM players""")
player_id = players[['personid', 'firstname', 'lastname']]
pow_df = pd.merge(pow_df, player_id, on=['firstname', 'lastname'], how='left')
print(f"Successfully added player IDs: {len(pow_df) == l1}")

Successfully added player IDs: False


In [558]:
pow_df

Unnamed: 0,season,year,month,day,conference,player,firstname,lastname,tie,personid
0,2025-26,2025,10,26,W,Victor Wembanyama,Victor,Wembanyama,0,1641705
1,2024-25,2024,10,28,E,Jayson Tatum,Jayson,Tatum,0,1628369
2,2024-25,2024,10,28,W,Anthony Davis,Anthony,Davis,0,203076
3,2024-25,2024,11,4,E,Donovan Mitchell,Donovan,Mitchell,0,1628378
4,2024-25,2024,11,4,W,Devin Booker,Devin,Booker,0,1626164
...,...,...,...,...,...,...,...,...,...,...
1619,1979-80,1980,3,2,A,Larry Bird,Larry,Bird,0,1449
1620,1979-80,1980,3,9,A,Clifford Robinson,Clifford,Robinson,0,361
1621,1979-80,1980,3,16,A,Magic Johnson,Magic,Johnson,0,77142
1622,1979-80,1980,3,23,A,Billyray Bates,Billyray,Bates,0,76121


In [559]:
l1

1563