In [None]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
### URL pattern: /players/{first letter of last name}/{first 5 letters of last name}{first 2 letters of first name}01.html

# test_url = r'https://www.basketball-reference.com/players/s/sabondo01.html'

# soup = BeautifulSoup(test_url, 'html.parser')

In [None]:
# from nba_api.stats.endpoints import playercareerstats

# # Nikola Jokić
# career = playercareerstats.PlayerCareerStats(player_id='203999') 

# # pandas data frames (optional: pip install pandas)
# career.get_data_frames()[0]

# # json
# career.get_json()

# # dictionary
# career.get_dict()



In [None]:
# from nba_api.live.nba.endpoints import scoreboard

# # Today's Score Board
# games = scoreboard.ScoreBoard()

# # json
# games.get_json()

# # dictionary
# games.get_dict()

In [None]:
from nba_api.stats.static import players

### Build player lists from nba_api's static player module.

# Every player in the list, active or otherwise
all_players = players.get_players()

# Only the active players
active_players = players.get_active_players()

### does not include rookies
players_df = pd.DataFrame(data=active_players)

players_df.head(n=5)

In [None]:
from nba_api.stats.endpoints import commonplayerinfo

# Look up a single player by ID (IDs are available in the all_players list)
player_id = '2544'  # Example: LeBron James
player_info = commonplayerinfo.CommonPlayerInfo(
    player_id=player_id,
)
# get_data_frames() returns a list; keep its first frame
player_info_df = player_info.get_data_frames()[0]

# Show just the position alongside the player's ID
player_info_df.loc[:, ['POSITION', 'PERSON_ID']]

In [None]:
from nba_api.stats.endpoints import playercareerstats

### Retrieving career statistics for a single player:

# Player to query, identified by player_id
player_id = '2544'  # Example: LeBron James
career_stats = playercareerstats.PlayerCareerStats(
    player_id=player_id,
)
# get_data_frames() returns a list; keep its first frame
career_stats_df = career_stats.get_data_frames()[0]

career_stats_df

In [None]:
from nba_api.stats.endpoints import playergamelog

### Game log for one player in one season, collapsed to per-player totals:

# NOTE(review): this ID differs from the LeBron James ID ('2544') used in the
# earlier example cells -- confirm which player is intended here.
player_id = '1629638'
season_year = '2024-25'
game_logs = playergamelog.PlayerGameLog(
    player_id=player_id,
    season=season_year,
)
# get_data_frames() returns a list; keep its first frame
game_logs_df = game_logs.get_data_frames()[0]

# Remove the identifier/text columns, then sum every remaining column per player
(
    game_logs_df
    .drop(columns=['Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'SEASON_ID', 'VIDEO_AVAILABLE'])
    .groupby('Player_ID')
    .sum()
    .reset_index()
)

In [None]:
import time
import random
import pandas as pd
from tqdm import tqdm
from nba_api.stats.endpoints import playergamelog
from requests.exceptions import ReadTimeout, ConnectionError
from nba_api.stats.static import players

In [None]:
# Retry / pacing settings for the per-player requests below.
MAX_TRIES = 2          # attempts per player before giving up
BASE_DELAY = 0.8       # seconds; doubled on each retry, plus up to 1s of jitter
REQUEST_TIMEOUT = 20   # per-request timeout in seconds
# Identifier/text columns removed from every game log before it is collected.
GAME_LOG_DROP_COLS = ['Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'SEASON_ID', 'VIDEO_AVAILABLE']


def fetch_player_game_log(pid, season, max_tries=MAX_TRIES, base_delay=BASE_DELAY,
                          timeout=REQUEST_TIMEOUT):
    """Fetch one player's game log for ``season`` with a simple retry loop.

    Parameters
    ----------
    pid : value convertible with ``int()``
        Player ID passed to ``PlayerGameLog``.
    season : str
        Season string passed to ``PlayerGameLog`` (e.g. ``'2024-25'``).
    max_tries : int
        Total number of attempts made on ``ReadTimeout`` / ``ConnectionError``.
    base_delay : float
        Base backoff in seconds; the wait before retry ``k`` is
        ``base_delay * 2 ** (k - 1)`` plus a random fraction of a second.
    timeout : int or float
        Timeout forwarded to ``PlayerGameLog``.

    Returns
    -------
    pandas.DataFrame or None
        The first frame of the response with ``GAME_LOG_DROP_COLS`` removed
        (columns that are absent are ignored), or ``None`` when the player
        was skipped. A message is printed for every skipped player; no
        exception (other than ``BaseException`` subclasses such as
        ``KeyboardInterrupt``) propagates to the caller.
    """
    for attempt in range(1, max_tries + 1):
        try:
            stats = playergamelog.PlayerGameLog(
                player_id=int(pid),
                season=season,
                timeout=timeout,
            )
            return stats.get_data_frames()[0].drop(
                columns=GAME_LOG_DROP_COLS,
                errors='ignore',
            )

        except (ReadTimeout, ConnectionError) as e:
            if attempt == max_tries:
                print(f"Skipping player {pid} after {attempt} attempts due to: {type(e).__name__}")
            else:
                # exponential backoff with jitter before the next attempt
                time.sleep(base_delay * (2 ** (attempt - 1)) + random.random())

        except Exception as e:
            # catch-all so one odd case doesn't kill the job; not retried
            print(f"Player {pid} failed with {type(e).__name__}: {e}")
            break

    return None


# Get only active players
active_players = players.get_active_players()

### does not include rookies
players_df = pd.DataFrame(active_players)

players_df = players_df[['id', 'full_name']]
season_year = '2024-25'

BATCH_SIZE = 100
dfs = []
failed_player_ids = []  # players that were skipped, kept so they can be retried later

for batch in range(0, len(players_df), BATCH_SIZE):
    # extra pause between batches
    time.sleep(0.5 + random.random() * 0.5)
    curr_data = players_df.iloc[batch:batch + BATCH_SIZE]

    for pid in tqdm(curr_data['id']):
        df = fetch_player_game_log(pid, season_year)
        if df is None:
            failed_player_ids.append(pid)
            continue

        # Skip empty logs: they add no rows, and concatenating empty frames
        # can change the dtypes of the combined result.
        if not df.empty:
            dfs.append(df)

        # polite pause to reduce rate-limit risk
        time.sleep(0.5 + random.random() * 0.5)

player_stats = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [None]:
# BATCH_SIZE = 100
# dfs = []

# for batch in range(0, len(players_df), BATCH_SIZE):
#     time.sleep(0.75 + random.random() * 0.5)
#     curr_data = players_df.iloc[batch:batch + BATCH_SIZE]

#     for pid in tqdm(curr_data['id']):
#         # simple retry loop with exponential backoff
#         max_tries = 2
#         delay = 1
#         for attempt in range(1, max_tries + 1):
#             try:
#                 player_info = commonplayerinfo.CommonPlayerInfo(player_id=pid, timeout=20)
#                 player_info_df = player_info.get_data_frames()[0][['PERSON_ID', 'POSITION']]
#                 dfs.append(player_info_df)
#                 # polite pause to reduce rate-limit risk
#                 time.sleep(0.5 + random.random() * 0.5)
#                 break  # success -> leave retry loop

#             except (ReadTimeout, ConnectionError) as e:
#                 if attempt == max_tries:
#                     print(f"Skipping player {pid} after {attempt} attempts due to: {type(e).__name__}")
#                 else:
#                     # exponential backoff
#                     sleep_s = delay * (2 ** (attempt - 1)) + random.random()
#                     time.sleep(sleep_s)

#             except Exception as e:
#                 # catch-all so one odd case doesn't kill the job
#                 print(f"Player {pid} failed with {type(e).__name__}: {e}")
#                 break

# player = pd.concat(dfs, axis=0) if dfs else pd.DataFrame()


In [None]:
# Work on a copy so the raw per-game frame in player_stats is left untouched.
testing_groupby = player_stats.copy()

# Per-game fantasy score components:
#   rebounds and assists count 1 each, points count 0.5, turnovers count -1.
rebounds = testing_groupby['REB']
assists = testing_groupby['AST']
points = testing_groupby['PTS'] * 0.5
turnovers = testing_groupby['TOV'] * -1

### double doubles/ triple doubles
# Number of categories per game in which the player reached 10 or more.
cats = ['PTS', 'REB', 'AST', 'STL', 'BLK']
counts = (testing_groupby[cats] >= 10).sum(axis=1)

# Bonus columns: 1 for a double double, 2 more for a triple double
# (a triple double also satisfies counts >= 2, so it earns both bonuses).
testing_groupby['double_double'] = (counts >= 2).astype(int)
testing_groupby['triple_double'] = (counts >= 3).astype(int) * 2

testing_groupby['fantasy_score'] = (
    rebounds
    + assists
    + points
    + turnovers
    + testing_groupby['double_double']
    + testing_groupby['triple_double']
)

# Average every column per player across that player's games.
test = testing_groupby.groupby('Player_ID').mean()

# Attach player names and order by average fantasy score (ascending).
test2 = (
    players_df
    .merge(test, right_on='Player_ID', left_on='id', how='inner')
    .sort_values('fantasy_score')
)

test2['fantasy_score'] = test2['fantasy_score'].round(2)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [None]:
# Peek at the first rows of the merged per-player averages
test2.head(n=5)

In [None]:
# Fixed seed so the train/test split (and therefore the reported score) is
# the same on every run.
RANDOM_STATE = 42

scaler = StandardScaler()

# Features: every column of test2 except the identifiers and the target.
# NOTE(review): the features still include REB, AST, PTS, TOV, double_double
# and triple_double, and fantasy_score is (up to rounding) a linear
# combination of exactly those columns. A near-perfect R^2 here therefore
# reflects target leakage rather than predictive skill; drop those columns if
# the goal is to predict the score from the remaining stats.
X = test2.drop(columns=['id', 'full_name', 'fantasy_score']).values
y = test2['fantasy_score'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

# Fit the scaler on the training rows only, then apply it to the test rows.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linreg = LinearRegression()

linreg.fit(X_train_scaled, y_train)

# R^2 on the held-out 40% of players
linreg.score(X_test_scaled, y_test)