In [21]:
# El obejtivo de este notebook sera definir las variables que necesitamos para los modelos: 
# el de clasificacion y el de regresion
# y comenzar a testear las conexiones con la api para despues generar el .py de get_data

# Modelo de clasificacion: Que Equipo Va a Ganar?

### üèÄ Game & Team Statistics ‚Äî Column Descriptions

| Column | Description |
|---------|-------------|
| **game_id** | Unique identifier for the game. |
| **date.start** | Date and time when the game started (used to calculate rest days). |
| **teams.visitors.id** | ID of the visiting (away) team. |
| **teams.home.id** | ID of the home team. |
| **scores.visitors.points** | Total points scored by the visiting team. |
| **scores.home.points** | Total points scored by the home team. |
| **fastBreakPoints** | Points scored on fast breaks. |
| **pointsInPaint** | Points scored inside the paint. |
| **biggestLead** | Largest lead held during the game. |
| **secondChancePoints** | Points scored after offensive rebounds (second chances). |
| **pointsOffTurnovers** | Points scored from opponent turnovers. |
| **longestRun** | Longest scoring run during the game. |
| **points** | Total team points. |
| **fgm** | Field goals made. |
| **fga** | Field goals attempted. |
| **fgp** | Field goal percentage (FG%). |
| **ftm** | Free throws made. |
| **fta** | Free throws attempted. |
| **ftp** | Free throw percentage (FT%). |
| **tpm** | Three-pointers made. |
| **tpa** | Three-pointers attempted. |
| **tpp** | Three-point percentage (3P%). |
| **offReb** | Offensive rebounds. |
| **defReb** | Defensive rebounds. |
| **totReb** | Total rebounds (offensive + defensive). |
| **assists** | Total assists. |
| **pFouls** | Personal fouls committed. |
| **steals** | Total steals. |
| **turnovers** | Total turnovers. |
| **blocks** | Total blocks. |
| **plusMinus** | Point differential while the team/player is on the court (+/-). |
| **min** | Minutes played. |

### üß† Note ‚Äî Psychological or Contextual Variables to Add Later
Columns that may capture psychological or contextual impact during games:

- **Is_Playoffs** ‚Üí Whether the game is part of the playoffs.  
- **Is_Finals** ‚Üí Whether the game is part of the NBA Finals.  
- **Game_In_Series** ‚Üí Game number within a playoff series (e.g., Game 1, Game 7).  
- **Series_Is_Elimination** ‚Üí Whether it‚Äôs an elimination game (team could be knocked out).  
- **Win_Streak** ‚Üí Current team win streak entering the game.  
- **Days_Since_Loss** ‚Üí Days since the team‚Äôs last loss.  
- **Time_of_Day_Factor** ‚Üí Factor related to the game‚Äôs start time (e.g., early, prime-time, late).  

In [1]:
# La idea sera con estas columnas hacer nuestro FE, el modelo no va a predecir al ganador basado en stats 
# que aun no conocemos (data lekeage) usaremos lags y stadisticas acumuladas hasta la fecha mas recientes antes del juego
# asi el modelo debe aprender a predecir al equipo ganandor, basandose en el comportamiento reciente de los 2 equipos

# Api Conection

In [2]:
import pandas as pd 
import numpy as np
import requests
import time
import os



# Clave personal de API
api_key = "89252707fcd6d430a0e3abbc35dddd7d"

# URL base (v1 para Basketball)
base_url = "https://v2.nba.api-sports.io"

# Headers comunes para todas las peticiones
headers = {
    "x-rapidapi-key": api_key,
    "x-rapidapi-host": "v2.nba.api-sports.io"
} 

# API GENERAL DOCUMENTATION : https://api-sports.io/documentation/nba/v2
# PYTHON CONECTION: https://api-sports.io/documentation/nba/v2#section/Sample-Scripts/Python

# Endpoints

In [3]:
# LEAGUES ENDPOINTS DOC: https://api-sports.io/documentation/nba/v2#tag/Leagues

# We need to know what id NBA league has

url = f"{base_url}/leagues" # '/leagues' is the endpoint

# request to the api
res = requests.get(url, headers=headers)
# we convert the json response to a python dict
data = res.json()
# the response section od the dict is a list od diccionaries with the asnwer, we want that list in a pandas df
leagues_df  = pd.DataFrame(data["response"], columns=['leagues'])

nba_league = leagues_df.query("leagues=='standard'")['leagues'].values[0]


In [4]:
# SEASONS ENDPOINTS DOC: https://api-sports.io/documentation/nba/v2#tag/Seasons

url = f"{base_url}/seasons" # '/seasons' is the endpoint

# request to the api
res = requests.get(url, headers=headers)
# we convert the json response to a python dict
data = res.json()
# the response section od the dict is a list od diccionaries with the asnwer, we want that list in a pandas df
seasons_df  = pd.DataFrame(data["response"], columns=['seasons'])

seasons_list = seasons_df['seasons'].unique().tolist()


In [5]:
# TEAMS ENDPOINT DOC: https://api-sports.io/documentation/nba/v2#tag/Teams
# We want a list of the NBA teams unqiue id's 

# Endpoint
url = f"{base_url}/teams"

# Par√°metros
params = {
    "league": nba_league}

# Request
res = requests.get(url, headers=headers, params=params)
data = res.json()

# Transformar respuesta a DataFrame
teams_df = pd.json_normalize(data["response"])
teams_df = teams_df[(teams_df['nbaFranchise']==True) & (teams_df['allStar']==False)].reset_index(drop=True)
teams_df = teams_df[['id', 'name', 'nickname', 'leagues.standard.conference', 'leagues.standard.division', 'city']]
teams_list = teams_df['id'].unique().tolist()

# Mostrar columnas relevantes
teams_df.head()

Unnamed: 0,id,name,nickname,leagues.standard.conference,leagues.standard.division,city
0,1,Atlanta Hawks,Hawks,East,Southeast,Atlanta
1,2,Boston Celtics,Celtics,East,Atlantic,Boston
2,4,Brooklyn Nets,Nets,East,Atlantic,Brooklyn
3,5,Charlotte Hornets,Hornets,East,Southeast,Charlotte
4,6,Chicago Bulls,Bulls,East,Central,Chicago


In [6]:
# GAMES ENDPOINT DOC: https://api-sports.io/documentation/nba/v2#tag/Games

url = f"{base_url}/games"

games_df = pd.DataFrame()

for season in seasons_list:

    params = {'season': season}
    res = requests.get(url, headers = headers, params = params)
    data = res.json()

    status = res.status_code
    results = data.get('results', 0)
    errors = data.get('errors', {})
    
    ### revision de errors
    if data.get('errors'):
        print(f"Season: {season} | Status: {status} | Results: {results} | Errors: {errors}")

    # ahora solo vamos a concatenar si es que hubo resultados
    if status == 200 and results > 0:

        df = pd.json_normalize(data['response'])
        games_df = pd.concat([games_df, df[['id',
                'season',
                'date.start',
                'teams.visitors.id',
                'teams.home.id',
                'scores.visitors.points',
                'scores.home.points',
                'arena.city']]], ignore_index=True)


# Filtrar solo juegos entre equipos NBA oficiales
games_df = games_df[
    (games_df['teams.home.id'].isin(teams_list)) & 
    (games_df['teams.visitors.id'].isin(teams_list))
].reset_index(drop=True)

games_df = games_df.rename(columns={'teams.visitors.id':'teams.away.id', 'scores.visitors.points':'scores.away.points', 'id':'game_id'})


games_id_list = games_df['game_id'].unique().tolist()

games_df.head(2)

Unnamed: 0,game_id,season,date.start,teams.away.id,teams.home.id,scores.away.points,scores.home.points,arena.city
0,1,2015,2015-10-03T02:30:00.000Z,9.0,16.0,96.0,103.0,Los Angeles
1,2,2015,2015-10-03T23:00:00.000Z,5.0,26.0,106.0,100.0,Orlando


# Raw JSON and Normalize

# Pivot_Wide format (_home _away stats 1 line per game)

In [7]:
gamestats_df = pd.read_parquet("gamestats.parquet")

In [None]:
# yo necesito primero que nada modificar el df de stats, aplanarlo par home y away y despues unirlo con game
# de entrada veo que game_stats, no dice quien es home y quien no es away entonces debo agregar esa info a gamestats antes de aplanarl
# todo lo guardaremos en un df temp para depsues unir con games_df

tmp = games_df[['game_id','teams.away.id','teams.home.id']]
tmp = gamestats_df.merge(tmp, on='game_id', how='left')
tmp['side'] = np.where(tmp['team.id'] == tmp['teams.home.id'],'home','away')

# aqui defino que columnas se deben separar por home y away, que deberian de ser todas estadisticas
stats_cols = ['fastBreakPoints',
       'pointsInPaint', 'biggestLead', 'secondChancePoints',
       'pointsOffTurnovers', 'longestRun', 'points', 'fgm', 'fga', 'fgp',
       'ftm', 'fta', 'ftp', 'tpm', 'tpa', 'tpp', 'offReb', 'defReb', 'totReb',
       'assists', 'pFouls', 'steals', 'turnovers', 'blocks', 'plusMinus']

       
tmp['ftp'] = tmp['ftp'].astype('float')
tmp['fgp'] = tmp['fgp'].astype('float')
tmp['tpp'] = tmp['tpp'].astype('float')
tmp['plusMinus'] = tmp['plusMinus'].astype('float')

stats_pivot = tmp.pivot_table(
    index = 'game_id',
    columns = 'side',
    values = stats_cols)

#wide.columns
#‚Üí MultiIndex([
#   ('fgm', 'home'),
#  ('fgm', 'visitors'),
# ('points', 'home'),
# ('points', 'visitors'),
#   ...])

stats_pivot.columns = [f"{stat}_{side}" for stat, side in stats_pivot.columns]
stats_pivot = stats_pivot.reset_index() # quitamos game_id del index y lo dejamos como columna

games_with_stats = games_df.merge(
    stats_pivot,
    on='game_id',
    how='left')

games_with_stats['winner_id'] = np.where(games_with_stats['scores.home.points'] > games_with_stats['scores.away.points'], games_with_stats['teams.home.id'], games_with_stats['teams.away.id'])

# Team-Game format (2 lines per game, 1 line each team )


In [17]:
# 1. Definir columnas comunes
common_cols = ['game_id', 'season', 'date.start', 'arena.city', 'winner_id']

# 2. Definir columnas por side
stats_cols = [col.replace('_home', '') for col in games_with_stats.columns if col.endswith('_home')]

# es como hacer games_with_stats[['col1, col2']] nada mas que suma listas ['col1'] + ['col2'] = ['col1, col2']
home_df = games_with_stats[common_cols + ['teams.home.id', 'teams.away.id']  + [f"{col}_home" for col in stats_cols]].copy()
# aqui solo renobramos para no tener el prefijo _home que ya no se necesita porque ahira ese df es puro home
home_df.columns = common_cols + ['team_id', 'opponent_id'] + stats_cols
home_df['is_home'] = 1



away_df = games_with_stats[common_cols + ['teams.away.id', 'teams.home.id'] + [f"{col}_away" for col in stats_cols]].copy()
away_df.columns = common_cols + ['team_id', 'opponent_id'] + stats_cols
away_df['is_home'] = 0

team_games_df = pd.concat([home_df, away_df], ignore_index=True)

# (Opcional: ordenar por fecha si quieres)
team_games_df = team_games_df.sort_values('date.start').reset_index(drop=True)



In [18]:
team_games_df

Unnamed: 0,game_id,season,date.start,arena.city,winner_id,team_id,opponent_id,assists,biggestLead,blocks,...,pointsInPaint,pointsOffTurnovers,secondChancePoints,steals,totReb,tpa,tpm,tpp,turnovers,is_home
0,1,2015,2015-10-03T02:30:00.000Z,Los Angeles,16.0,16.0,9.0,22.0,21.0,6.0,...,38.0,23.0,,12.0,39.0,32.0,9.0,28.1,13.0,1
1,1,2015,2015-10-03T02:30:00.000Z,Los Angeles,16.0,9.0,16.0,16.0,0.0,2.0,...,46.0,12.0,,9.0,57.0,18.0,7.0,38.9,23.0,0
2,2,2015,2015-10-03T23:00:00.000Z,Orlando,5.0,26.0,5.0,22.0,12.0,3.0,...,40.0,18.0,,11.0,45.0,26.0,6.0,23.1,12.0,1
3,3,2015,2015-10-03T23:00:00.000Z,Indianapolis,23.0,15.0,23.0,21.0,4.0,16.0,...,38.0,9.0,,10.0,62.0,35.0,10.0,28.6,19.0,1
4,3,2015,2015-10-03T23:00:00.000Z,Indianapolis,23.0,23.0,15.0,16.0,18.0,3.0,...,42.0,25.0,,9.0,55.0,32.0,11.0,34.4,11.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31277,16723,2025,2026-04-13T00:30:00.000Z,Minneapolis,23.0,22.0,23.0,,,,...,,,,,,,,,,1
31278,16722,2025,2026-04-13T00:30:00.000Z,Houston,19.0,14.0,19.0,,,,...,,,,,,,,,,1
31279,16721,2025,2026-04-13T00:30:00.000Z,Dallas,6.0,8.0,6.0,,,,...,,,,,,,,,,1
31280,16727,2025,2026-04-13T00:30:00.000Z,Inglewood,11.0,11.0,16.0,,,,...,,,,,,,,,,0


# Modelo de Regresion: Cuantos puntos habra?

In [3]:
import pandas as pd

In [4]:
import numpy as np