# Imports

In [1]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Definition for Scraping the Player Stats for a Total Season

In [2]:
def get_player_total_season_stats(year):
    """
    Download one season of player totals and store them as a CSV without the 'slug' column.

    The season totals are requested through the API wrapper, which writes them to a CSV file under ./total_stats/. That file is then read back, its 'slug' column is removed, and the trimmed table is written over the same file.

    Parameters:
    -----------
    year : int
        The ending year of the NBA season to fetch. For example, pass 2023 for the 2022-2023 season.

    Returns:
    --------
    None
        The result is stored in ./total_stats/{year-1}_{year}_player_season_totals.csv.

    Example:
    --------
    To retrieve and save player statistics for the 2022-2023 NBA season:
    >>> get_player_total_season_stats(2023)
    """
    csv_path = f"./total_stats/{year-1}_{year}_player_season_totals.csv"
    #the api wrapper writes the raw season totals to csv_path
    client.players_season_totals(
        season_end_year=year,
        output_type=OutputType.CSV,
        output_file_path=csv_path,
    )
    #reload the file, discard the slug column and overwrite the file
    season_totals = pd.read_csv(csv_path).drop(columns='slug')
    return season_totals.to_csv(csv_path, index=False)

# EDA for the 2022/2023 Season Total Stats and Creating Functions to Automate the Cleaning

Not running the below cell again as I have already scraped the data.

In [3]:
#eda for player totals: load the previously scraped 2022-2023 totals from disk
df_total = pd.read_csv('./total_stats/2022_2023_player_season_totals.csv')
#count the missing values in every column
df_total.isnull().sum()

name                                 0
positions                            0
age                                  0
team                                 0
games_played                         0
games_started                        0
minutes_played                       0
made_field_goals                     0
attempted_field_goals                0
made_three_point_field_goals         0
attempted_three_point_field_goals    0
made_free_throws                     0
attempted_free_throws                0
offensive_rebounds                   0
defensive_rebounds                   0
assists                              0
steals                               0
blocks                               0
turnovers                            0
personal_fouls                       0
points                               0
dtype: int64

No null values.

In [4]:
df_total

Unnamed: 0,name,positions,age,team,games_played,games_started,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,Precious Achiuwa,CENTER,23,TORONTO RAPTORS,55,12,1140,196,404,29,...,87,124,100,228,50,31,30,59,102,508
1,Steven Adams,CENTER,29,MEMPHIS GRIZZLIES,42,42,1133,157,263,0,...,47,129,214,271,97,36,46,79,98,361
2,Bam Adebayo,CENTER,25,MIAMI HEAT,75,75,2598,602,1114,1,...,324,402,184,504,240,88,61,187,208,1529
3,Ochai Agbaji,SHOOTING GUARD,22,UTAH JAZZ,59,22,1209,165,386,81,...,56,69,43,78,67,16,15,41,99,467
4,Santi Aldama,POWER FORWARD,22,MEMPHIS GRIZZLIES,77,20,1682,247,525,94,...,108,144,85,286,97,45,48,60,143,696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,Thaddeus Young,POWER FORWARD,34,TORONTO RAPTORS,54,9,795,108,198,6,...,18,26,71,95,75,54,5,42,88,240
605,Trae Young,POINT GUARD,24,ATLANTA HAWKS,73,73,2541,597,1390,154,...,566,639,56,161,741,80,9,300,104,1914
606,Omer Yurtseven,CENTER,24,MIAMI HEAT,9,0,83,16,27,3,...,5,6,8,15,2,2,2,4,16,40
607,Cody Zeller,CENTER,30,MIAMI HEAT,15,2,217,37,59,0,...,24,35,25,39,10,3,4,14,33,98


In [5]:
df_total.describe()

Unnamed: 0,age,games_played,games_started,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
count,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0
mean,25.893268,42.518883,20.197044,976.743842,169.55665,356.681445,49.850575,138.20197,74.298851,95.042693,42.149425,133.315271,102.264368,29.446634,18.79803,54.252874,80.704433,463.262726
std,4.312631,25.081185,25.918279,807.544477,172.127488,356.427911,57.390304,152.025741,100.816793,123.998786,49.555581,132.38197,122.442202,27.244221,25.341943,56.206109,65.177501,479.765348
min,19.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,21.0,0.0,240.0,32.0,70.0,4.0,14.0,10.0,13.0,8.0,30.0,17.0,7.0,4.0,10.0,23.0,85.0
50%,25.0,43.0,5.0,786.0,108.0,230.0,27.0,86.0,34.0,45.0,26.0,92.0,56.0,22.0,11.0,36.0,67.0,288.0
75%,29.0,65.0,38.0,1670.0,257.0,530.0,74.0,216.0,96.0,129.0,57.0,197.0,137.0,45.0,24.0,81.0,125.0,694.0
max,42.0,82.0,82.0,2842.0,728.0,1559.0,301.0,731.0,669.0,772.0,274.0,744.0,741.0,128.0,193.0,300.0,279.0,2225.0


Nothing out of the ordinary.

In [6]:
df_total.shape

(609, 21)

In [7]:
df_total['name'].nunique()

539

The DataFrame has 609 rows but only 539 unique names, so players who were traded midseason appear once per team they played for. We need to combine their statistics into a single row per player.

In [8]:
df_total['name'].value_counts()

name
Dorian Finney-Smith    2
Thomas Bryant          2
Spencer Dinwiddie      2
Moses Brown            2
Eugene Omoruyi         2
                      ..
James Harden           1
Tim Hardaway Jr.       1
Jordan Hall            1
Tyrese Haliburton      1
Ivica Zubac            1
Name: count, Length: 539, dtype: int64

In [92]:
def combine_traded_player_total_stats(df):
    """
    Combines statistics for NBA players who were traded during the season for the player season total stats dataframe.

    Players whose name appears on more than one row are collapsed into a single row: their counting stats are summed, their teams are joined with ', ', and the first listed position and age are kept. Rows of players who appear only once are passed through untouched. The input DataFrame is not modified.

    Parameters:
    -----------
    df: pandas.DataFrame
        A DataFrame containing player total stats for a season. It must contain the columns 'name', 'team', 'positions', 'age' and the counting-stat columns listed in the function body.

    Returns:
    --------
    df: pandas.DataFrame
        A new DataFrame with one row per player name and a fresh 0..n-1 index. Players who appeared once come first in their original order, followed by the combined players sorted by name.

    Example:
    --------
    To combine the stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('2022_2023_player_season_totals.csv')
    >>> df_combined = combine_traded_player_total_stats(df)
    """
    #counting stats that can simply be added up across teams
    summed_columns = [
        'games_played', 'games_started', 'minutes_played',
        'made_field_goals', 'attempted_field_goals',
        'made_three_point_field_goals', 'attempted_three_point_field_goals',
        'made_free_throws', 'attempted_free_throws',
        'offensive_rebounds', 'defensive_rebounds',
        'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points',
    ]

    #names that occur on more than one row
    name_counts = df['name'].value_counts()
    traded_names = name_counts[name_counts > 1].index
    #one vectorized mask splits the frame; no per-row python lookup is needed
    is_traded = df['name'].isin(traded_names)

    #nothing to combine: return a re-indexed copy instead of concatenating an empty frame
    if not is_traded.any():
        return df.reset_index(drop=True)

    traded = df[is_traded]
    untraded = df[~is_traded]

    #every aggregate below is indexed by name, so the column assignments align automatically
    grouped = traded.groupby('name')
    combined = grouped[summed_columns].sum()
    #combine the teams with a ', ' in between
    combined['team'] = grouped['team'].apply(lambda teams: ', '.join(teams.dropna().astype(str)))
    #only take the first position since it should be the same
    combined['positions'] = grouped['positions'].first()
    #only take the first age since we dont want to sum them
    combined['age'] = grouped['age'].first()
    combined = combined.reset_index()

    #concat aligns on column names, so the result keeps the column order of the input
    return pd.concat([untraded, combined], axis=0).reset_index(drop=True)

In [34]:
df_total = combine_traded_player_total_stats(df_total)

In [35]:
def clean_total_stats(df):
    """
    Normalises the casing of the 'team' and 'positions' columns of the player total stats DataFrame.

    Both columns are rewritten in title case (e.g. 'TORONTO RAPTORS' becomes 'Toronto Raptors'). The columns are replaced on the DataFrame that was passed in, and that same object is returned.

    Parameters:
    -----------
    df: pandas.DataFrame
        A DataFrame containing player total stats for a season, with string columns 'team' and 'positions'.
    Returns:
    --------
    df: pandas.DataFrame
        The same DataFrame, with 'team' and 'positions' converted to title case.

    Example:
    --------
    To clean the 'team' and 'positions' columns in the DataFrame:
    >>> df = pd.read_csv('2022_2023_player_season_totals.csv')
    >>> df_cleaned = clean_total_stats(df)
    """
    #title-case each text column in turn
    for column in ('team', 'positions'):
        df[column] = df[column].str.title()
    return df

In [36]:
df_total = clean_total_stats(df_total)
df_total

Unnamed: 0,name,positions,age,team,games_played,games_started,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,Precious Achiuwa,Center,23,Toronto Raptors,55,12,1140,196,404,29,...,87,124,100,228,50,31,30,59,102,508
1,Steven Adams,Center,29,Memphis Grizzlies,42,42,1133,157,263,0,...,47,129,214,271,97,36,46,79,98,361
2,Bam Adebayo,Center,25,Miami Heat,75,75,2598,602,1114,1,...,324,402,184,504,240,88,61,187,208,1529
3,Ochai Agbaji,Shooting Guard,22,Utah Jazz,59,22,1209,165,386,81,...,56,69,43,78,67,16,15,41,99,467
4,Santi Aldama,Power Forward,22,Memphis Grizzlies,77,20,1682,247,525,94,...,108,144,85,286,97,45,48,60,143,696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,T.J. Warren,Power Forward,29,"Brooklyn Nets, Phoenix Suns",42,0,687,132,270,22,...,28,35,26,95,40,22,12,20,65,314
535,Terrence Ross,Shooting Guard,31,"Orlando Magic, Phoenix Suns",63,9,1330,200,465,95,...,27,34,22,132,95,35,10,43,97,522
536,Terry Taylor,Power Forward,23,"Indiana Pacers, Chicago Bulls",31,2,264,39,75,5,...,6,11,27,20,10,3,6,9,26,89
537,Thomas Bryant,Center,25,"Los Angeles Lakers, Denver Nuggets",59,26,1081,230,369,26,...,93,126,86,253,31,16,30,38,102,579


# Scraping player advanced stats

In [93]:
def get_player_advanced_season_stats(year):
    """
    Download one season of advanced player stats and store them as a trimmed CSV.

    The advanced season totals are requested through the API wrapper, which writes them to a CSV file under ./advanced_season_stat_total/. That file is then read back, the 'slug', 'positions', 'age', 'team', 'minutes_played' and 'is_combined_totals' columns are removed, and the trimmed table is written over the same file.

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the advanced player statistics.

    Returns:
    --------
    None
        The result is stored in ./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv.

    Example:
    --------
    To retrieve and save advanced player statistics for the 2022-2023 season:
    >>> get_player_advanced_season_stats(2023)
    """
    csv_path = f"./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv"
    #the api wrapper writes the raw advanced totals to csv_path
    client.players_advanced_season_totals(
        season_end_year=year,
        output_type=OutputType.CSV,
        output_file_path=csv_path,
    )
    #dropping duplicate and not needed columns
    unneeded_columns = ['slug', 'positions', 'age', 'team', 'minutes_played', 'is_combined_totals']
    advanced_totals = pd.read_csv(csv_path).drop(columns=unneeded_columns)
    return advanced_totals.to_csv(csv_path, index=False)

Not running the below cell again as I have already scraped the data.

# EDA for player advanced stats

In [39]:
df_advanced = pd.read_csv('./advanced_season_stat_total/2022_2023_advanced_player_season_totals.csv')
df_advanced.isnull().sum()

name                             0
games_played                     0
player_efficiency_rating         0
true_shooting_percentage         0
three_point_attempt_rate         0
free_throw_attempt_rate          0
offensive_rebound_percentage     0
defensive_rebound_percentage     0
total_rebound_percentage         0
assist_percentage                0
steal_percentage                 0
block_percentage                 0
turnover_percentage              0
usage_percentage                 0
offensive_win_shares             0
defensive_win_shares             0
win_shares                       0
win_shares_per_48_minutes        0
offensive_box_plus_minus         0
defensive_box_plus_minus         0
box_plus_minus                   0
value_over_replacement_player    0
dtype: int64

No null values present.

In [40]:
df_advanced.describe()

Unnamed: 0,games_played,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,assist_percentage,steal_percentage,...,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
count,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,...,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0
mean,42.518883,13.263218,0.55992,0.405148,0.245982,5.172578,14.99688,10.082102,13.333333,1.54335,...,12.622003,18.382759,1.074713,0.990312,2.065846,0.081337,-1.374548,-0.075041,-1.450082,0.487521
std,25.081185,6.237213,0.114687,0.222474,0.181543,4.280231,6.837965,4.806279,8.735297,1.316029,...,7.884842,5.835549,1.688998,0.965262,2.460253,0.088172,3.607156,2.131476,4.774525,1.141405
min,1.0,-20.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.9,0.0,-1.6,-0.517,-22.5,-10.4,-26.5,-1.3
25%,21.0,10.0,0.522,0.268,0.138,2.1,10.7,6.7,7.3,1.0,...,9.2,14.3,0.0,0.2,0.2,0.045,-3.0,-0.9,-3.3,-0.1
50%,43.0,13.0,0.566,0.414,0.227,3.8,13.4,8.9,11.1,1.4,...,11.8,17.7,0.5,0.7,1.2,0.086,-1.4,-0.1,-1.3,0.1
75%,65.0,16.3,0.61,0.553,0.323,7.1,18.7,12.6,17.7,1.8,...,15.1,21.3,1.5,1.6,3.1,0.129,0.5,0.8,0.6,0.7
max,82.0,65.6,1.064,1.0,2.0,28.8,55.4,29.6,47.6,24.2,...,100.0,52.5,11.2,4.8,14.9,0.626,17.0,32.7,48.6,8.8


In [41]:
df_advanced.shape

(609, 22)

In [42]:
df_advanced['name'].nunique()

539

Same issue as before. We have duplicate rows for players that were traded midseason.

In [94]:
def combine_traded_player_advanced_stats(df):
    """
    Combines advanced stats for players who were traded during a season.

    Players whose name appears on more than one row are collapsed into a single row. Each advanced stat is replaced by its average across the player's rows, weighted by the 'games_played' value of each row. Rows of players who appear only once are passed through untouched. The 'games_played' column is removed from the result. The input DataFrame is not modified.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing advanced player statistics. It must contain 'name', 'games_played' and the advanced-stat columns listed in the function body.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with one row per player name, no 'games_played' column and a fresh 0..n-1 index. Players who appeared once come first in their original order, followed by the combined players sorted by name.

    Example:
    --------
    To combine the stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('2022_2023_advanced_player_season_totals.csv')
    >>> df_combined = combine_traded_player_advanced_stats(df)
    """
    #NOTE(review): win shares and VORP are usually cumulative season values; confirm that
    #a games-weighted average (rather than a sum) is what is wanted for those columns
    averaged_columns = [
        'player_efficiency_rating', 'true_shooting_percentage',
        'three_point_attempt_rate', 'free_throw_attempt_rate',
        'offensive_rebound_percentage', 'defensive_rebound_percentage',
        'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
        'block_percentage', 'turnover_percentage', 'usage_percentage',
        'offensive_win_shares', 'defensive_win_shares', 'win_shares',
        'win_shares_per_48_minutes', 'offensive_box_plus_minus',
        'defensive_box_plus_minus', 'box_plus_minus',
        'value_over_replacement_player',
    ]

    #names that occur on more than one row
    name_counts = df['name'].value_counts()
    traded_names = name_counts[name_counts > 1].index
    is_traded = df['name'].isin(traded_names)
    untraded = df[~is_traded]

    #nothing to combine: skip the aggregation and just drop the helper column
    if not is_traded.any():
        return untraded.reset_index(drop=True).drop(columns='games_played')

    traded = df[is_traded]

    #weight every stat by the games played on that row, then add the rows up per player.
    #doing this on the whole frame avoids relying on how groupby.apply labels its result
    weighted = traded[averaged_columns].multiply(traded['games_played'], axis=0)
    weighted_sums = weighted.groupby(traded['name']).sum()
    #combine total games for each player
    total_games = traded.groupby('name')['games_played'].sum()

    #calculate the weighted averages (both operands are indexed by name)
    combined = weighted_sums.divide(total_games, axis=0).reset_index()

    #concat the dataframe with the combined stats with the untouched rows
    result = pd.concat([untraded, combined], axis=0).reset_index(drop=True)
    return result.drop(columns='games_played')

In [44]:
df_advanced = combine_traded_player_advanced_stats(df_advanced)

In [45]:
#combine the total and advanced stats dataframes, matching rows on the player name
df = pd.merge(df_advanced, df_total, on = 'name')

In [46]:
#confirming the correct shape
df.shape

(539, 41)

# Scraping player per-game stats per season

In [47]:
def get_per_game_stat(year):
    """
    Scrapes per-game basketball stats from Basketball Reference for a specific year, saves them to a CSV file and returns them as a pandas DataFrame.

    The per-game table of the season page is read row by row. For each row the player name (taken from the link inside the 'player' cell) and the following cells are collected: minutes per game (mpg), field goals made/attempted per game (fgm_per_g, fga_per_g), field goal percentage (fg_pct),
    three-point field goals made/attempted per game (fg3m_per_g, fg3a_per_g), three-point percentage (fg3_pct), two-point field goals made/attempted per game (fg2m_per_g, fg2a_per_g), two-point percentage (fg2_pct), effective field goal percentage (efg_pct),
    free throws made/attempted per game (ftm_per_g, fta_per_g), free throw percentage (ft_pct), offensive, defensive and total rebounds per game (orb_per_g, drb_per_g, trb_per_g), assists (ast_per_g), steals (stl_per_g), blocks (blk_per_g),
    turnovers (tov_per_g) and personal fouls per game (pf_per_g), points per game (ppg) and the team (team). Rows that are missing any of these values are dropped. All values are kept as the text found on the page.

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the per-game stats.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing per-game stats for NBA players in the specified year. The same data is written to ./per_game_stat/per_game_stat_{year-1}_{year}.csv.

    Raises:
    -------
    requests.HTTPError
        If the page request is answered with an error status (for example when the request is rejected).
    ValueError
        If the downloaded page does not contain a table body.

    Example:
    --------
    To scrape per-game stats for the 2022-2023 season:
    >>> per_game_stats_2023 = get_per_game_stat(2023)
    """
    req = requests.get(
        f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html',
        timeout=30,
    )
    #stop here on an error response instead of failing later on a page without a table
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    table_body = soup.find('tbody')
    if table_body is None:
        raise ValueError(f'no per-game table found on the {year} season page')

    #the stats that I want in my dataframe: html data-stat attribute -> column name
    stats = {
        'mp_per_g' : 'mpg',
        'fg_per_g' : 'fgm_per_g',
        'fga_per_g' : 'fga_per_g',
        'fg_pct' : 'fg_pct',
        'fg3_per_g' : 'fg3m_per_g',
        'fg3a_per_g' : 'fg3a_per_g',
        'fg3_pct' : 'fg3_pct',
        'fg2_per_g' : 'fg2m_per_g',
        'fg2a_per_g' : 'fg2a_per_g',
        'fg2_pct' : 'fg2_pct',
        'efg_pct' : 'efg_pct',
        'ft_per_g' : 'ftm_per_g',
        'fta_per_g' : 'fta_per_g',
        'ft_pct' : 'ft_pct',
        'orb_per_g': 'orb_per_g',
        'drb_per_g' : 'drb_per_g',
        'trb_per_g' : 'trb_per_g',
        'ast_per_g' : 'ast_per_g',
        'stl_per_g' : 'stl_per_g',
        'blk_per_g' : 'blk_per_g',
        'tov_per_g' : 'tov_per_g',
        'pf_per_g' : 'pf_per_g',
        'pts_per_g' : 'ppg',
        #added team so that I can deal with traded players data
        'team_id': 'team',
    }

    per_game_stat = []
    #finding each player and their specific stat
    for row in table_body.find_all('tr'):
        per_game_stat_dict = {}
        #the name is only recorded when the player cell exists and contains a link
        player = row.find('td', {'data-stat': 'player'})
        player_found = player.find('a') if player else None
        if player_found:
            per_game_stat_dict['name'] = player_found.getText()
        #look up every wanted stat cell in this row by its data-stat attribute
        for key, value in stats.items():
            stat_value = row.find('td', {'data-stat': key})
            if stat_value:
                per_game_stat_dict[value] = stat_value.getText()
        #incomplete rows are still appended; they are removed by dropna below
        per_game_stat.append(per_game_stat_dict)

    #create a dataframe of our stats and drop the rows that are missing a value
    df = pd.DataFrame(per_game_stat)
    df.dropna(inplace = True)
    df.reset_index(drop=True, inplace=True)
    df.to_csv(f'./per_game_stat/per_game_stat_{year-1}_{year}.csv', index = False)
    return df

# EDA on per game stats

Not running the below cell again as I have already scraped the data.

In [49]:
df_per = pd.read_csv('./per_game_stat/per_game_stat_2022_2023.csv')

In [50]:
def combine_traded_player_per_game_stats(df):
    """
    Reduces the per-game stats of players who were traded during the season to a single row.

    Players whose name appears on more than one row are treated as traded. For those players only the rows whose 'team' value is 'TOT' are kept; their other rows are discarded. A multi-row player without any 'TOT' row is dropped entirely. Rows of players who appear only once are passed through untouched. The 'team' column is removed from the result. The input DataFrame is not modified.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing per-game stats of NBA players, including the columns 'name' and 'team'.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame without the 'team' column and with a fresh 0..n-1 index. The 'TOT' rows of traded players come first, sorted by name, followed by the single-row players in their original order. If there are no 'TOT' rows the result consists of the single-row players only.

    Example:
    --------
    To combine the per-game stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('per_game_stat_2022_2023.csv')
    >>> df_combined = combine_traded_player_per_game_stats(df)
    """
    #names that occur on more than one row
    name_counts = df['name'].value_counts()
    traded_names = name_counts[name_counts > 1].index
    is_traded = df['name'].isin(traded_names)
    traded = df[is_traded]
    untraded = df[~is_traded]

    #keep only the season-total row of each traded player. The filter simply yields an
    #empty frame when there is no such row, so concat never receives an empty list.
    #The stable sort reproduces a per-name ordering without changing the order inside a name
    season_totals = traded[traded['team'] == 'TOT'].sort_values('name', kind='mergesort')

    combined = pd.concat([season_totals, untraded], axis = 0).reset_index(drop = True)
    return combined.drop(columns = 'team')

In [51]:
df_per = combine_traded_player_per_game_stats(df_per)

# Combine to one big df and do EDA

In [53]:
#combine the per game stats with the already merged total/advanced stats dataframe, matching rows on the player name
df = pd.merge(df_per, df, on = 'name')

In [54]:
#Need to one hot encode positions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 64 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   name                               539 non-null    object 
 1   mpg                                539 non-null    float64
 2   fgm_per_g                          539 non-null    float64
 3   fga_per_g                          539 non-null    float64
 4   fg_pct                             537 non-null    float64
 5   fg3m_per_g                         539 non-null    float64
 6   fg3a_per_g                         539 non-null    float64
 7   fg3_pct                            523 non-null    float64
 8   fg2m_per_g                         539 non-null    float64
 9   fg2a_per_g                         539 non-null    float64
 10  fg2_pct                            534 non-null    float64
 11  efg_pct                            537 non-null    float64

In [55]:
df = pd.get_dummies(df, columns=['positions'], dtype = int)

# Scraping the all_nba category

In [56]:
# Download the All-League (All-NBA) awards page and build df_all_nba with one row per
# selected player: the season, the player name and one indicator column per team value
req = requests.get(f'https://www.basketball-reference.com/awards/all_league.html')
soup = BeautifulSoup(req.content, 'html.parser')
awards = []

# Extracting each player and their specific season
for row in soup.find('tbody').find_all('tr'):
    season = row.find('th', {'data-stat': 'season'})
    #need to make sure the season is there in order to get text and avoid errors
    #NOTE(review): when a row has no season cell, `seasons` keeps the value of the previous row - confirm this carry-over is intended
    if season:
        seasons = season.get_text()
    team = row.find('td', {'data-stat': 'all_team'})
    #need to make sure the team is there in order to get text and avoid errors
    if team:
        team_text = team.get_text()
        #was getting some blank values, so `teams` is only updated when the cell actually contains text
        #NOTE(review): a blank cell therefore leaves `teams` at the previous row's value rather than producing a null - confirm this is intended
        if team_text:
            teams = team_text

    #the player cells are identified by data-stat values '1' to '15', so loop through them
    for i in range(1, 16):
        player = row.find('td', {'data-stat': str(i)})
        #same need to make sure the player cell and its link are there in order to get text and avoid errors
        if player:
            player_found = player.find('a')
            if player_found:
                player_name = player_found.get_text()
                #one record per selected player, tagged with the current season and team
                player_dict = {
                    'season' : seasons,
                    'team' : teams,
                    'name' : player_name}
                #appending back to my list
                awards.append(player_dict)
#create a dataframe of list
df_all_nba = pd.DataFrame(awards)
df_all_nba.dropna(inplace = True)
#turn the team value into indicator columns named team_<value>
df_all_nba = pd.get_dummies(df_all_nba, columns= ['team'], dtype = int)
df_all_nba

Unnamed: 0,season,name,team_1st,team_2nd,team_3rd
0,2023-24,Nikola Jokić,1,0,0
1,2023-24,Giannis Antetokounmpo,1,0,0
2,2023-24,Jayson Tatum,1,0,0
3,2023-24,Luka Dončić,1,0,0
4,2023-24,Shai Gilgeous-Alexander,1,0,0
...,...,...,...,...,...
1045,1946-47,Frankie Baumholtz,0,1,0
1046,1946-47,Ernie Calverley,0,1,0
1047,1946-47,Chick Halbert,0,1,0
1048,1946-47,John Logan,0,1,0


In [57]:
def get_award(award_list):
    """
    Scrape basketball reference for player awards.

    For every award in `award_list` the page https://www.basketball-reference.com/awards/{award}.html is downloaded and the rows of its first table body are read. A row contributes a record only when it has both a season cell and a player cell containing a link; other rows are skipped.

    Parameters:
    -----------
    award_list : list
        A list of strings containing the names of awards as used in the page address (for example 'mvp').

    Returns:
    --------
    pandas.DataFrame
        DataFrame with the columns 'season' (season in which the award was received), 'name' (name of the player receiving the award) and one integer column per award, named after the upper-cased award.
        Each row comes from one award page: the column of that award holds 1 and the columns of the other awards hold 0. An empty `award_list` gives an empty DataFrame.

    Raises:
    -------
    requests.HTTPError
        If a page request is answered with an error status.
    ValueError
        If a downloaded page does not contain a table body.

    Example:
    --------
    To scrape basketball-reference.com for MVP and Rookie of the Year awards:
    >>> awards_df = get_award(['mvp', 'roy'])
    """
    award_frames = []
    for award in award_list:
        req = requests.get(f'https://www.basketball-reference.com/awards/{award}.html', timeout=30)
        #stop here on an error response instead of failing later on a page without a table
        req.raise_for_status()
        soup = BeautifulSoup(req.content, 'html.parser')
        table_body = soup.find('tbody')
        if table_body is None:
            raise ValueError(f'no award table found on the {award} page')

        # Extracting each player and their specific season
        winners = []
        for row in table_body.find_all('tr'):
            season = row.find('th', {'data-stat': 'season'})
            player = row.find('td', {'data-stat': 'player'})
            player_found = player.find('a') if player is not None else None
            #a row without a season or a linked player cannot be matched to anyone, so skip it
            #rather than recording an award for a missing season/name
            if season is None or player_found is None:
                continue
            winners.append({'season': season.get_text(), 'name': player_found.get_text()})

        #explicit columns keep 'season' and 'name' present even when no row qualified
        df = pd.DataFrame(winners, columns=['season', 'name'])
        #adding a binary column to match award
        df[award.upper()] = 1
        award_frames.append(df)

    if not award_frames:
        return pd.DataFrame()

    #a single concat at the end; awards a row did not come from are missing there and become 0
    all_awards = pd.concat(award_frames, ignore_index=True)
    award_columns = [column for column in all_awards.columns if column not in ('season', 'name')]
    all_awards[award_columns] = all_awards[award_columns].fillna(0).astype(int)
    return all_awards

In [None]:
#get awards dataframe: one award page is scraped per entry of the list
#note that this rebinds both `awards` and `df`, which were used for other data in earlier cells
awards = ['mvp', 'dpoy', 'smoy', 'mip']
df = get_award(awards)

In [59]:
#merge all_nba and awards dataframes on season and player name.
#the outer join keeps players that appear in only one of the two, and their missing indicator values are filled with 0
all_awards = pd.merge(df, df_all_nba, how = 'outer', on = ['season', 'name']).fillna(0)
all_awards

Unnamed: 0,season,name,MVP,DPOY,SMOY,MIP,team_1st,team_2nd,team_3rd
0,2023-24,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2022-23,Joel Embiid,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2021-22,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020-21,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2019-20,Giannis Antetokounmpo,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1132,1946-47,Frankie Baumholtz,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1133,1946-47,Ernie Calverley,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1134,1946-47,Chick Halbert,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1135,1946-47,John Logan,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Putting together the scrape year function

In [65]:
def scrape_year(year, award_list):
    """
    Scrape and combine basketball player statistics and awards for a specific year.

    This function scrapes the total stats, advanced stats and per game stats of one season, collapses the rows of traded players in each of them, merges the three tables on the player name and one hot encodes the positions. It then scrapes the requested awards, merges them with the all-NBA selections and attaches them to the players of that season. Missing values in the final table are filled with 0 and the result is saved to ./total/total_{year-1}_{year}.csv.

    The all-NBA selections are read from the module-level DataFrame `df_all_nba`, which must already exist when this function is called.

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the data to be scraped and combined.
    award_list : list of str
        A list of strings containing the names of awards.

    Returns:
    --------
    pandas.DataFrame
        The combined dataframe containing player statistics and awards for the specified year.

    Example:
    --------
    To scrape and combine player stats and the MVP and Rookie of the Year awards for the 2022-2023 season:
    >>> scrape_year(2023, ['mvp', 'roy'])
    """
    #scrape and clean total stats
    get_player_total_season_stats(year)
    df_total = pd.read_csv(f"./total_stats/{year-1}_{year}_player_season_totals.csv")
    df_total = combine_traded_player_total_stats(df_total)
    df_total = clean_total_stats(df_total)
    #scrape and clean advanced stats
    get_player_advanced_season_stats(year)
    df_advanced = pd.read_csv(f"./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv")
    df_advanced = combine_traded_player_advanced_stats(df_advanced)
    #scrape and clean per game stats. The csv is read back instead of using the returned
    #dataframe so that pandas parses the scraped text into numbers
    get_per_game_stat(year)
    df_per = pd.read_csv(f'./per_game_stat/per_game_stat_{year-1}_{year}.csv')
    df_per = combine_traded_player_per_game_stats(df_per)

    #combine the total and advanced stats dataframes
    df = pd.merge(df_advanced, df_total, on = 'name')
    #combine the per game stats with the merged total/advanced stats
    df = pd.merge(df_per, df, on = 'name')
    df = pd.get_dummies(df, columns=['positions'], dtype = int)
    #adding season column for merge purposes with awards, formatted like '2022-23'
    df['season'] = f'{year-1}-{str(year)[-2:]}'

    #scrape awards: get_award already handles the whole list, so it is called exactly once
    accolade = get_award(award_list)
    all_awards = pd.merge(accolade, df_all_nba, how = 'outer', on = ['season', 'name']).fillna(0)
    #merge award and big df; players without any award get 0 in the award columns
    df = pd.merge(df, all_awards, how = 'left', on = ['season', 'name'])
    df.fillna(0, inplace = True)

    df.to_csv(f'./total/total_{year-1}_{year}.csv', index = False)
    return df

Not running the below cell again as I have already scraped the data needed.

In [None]:
#scrape every season from 1979-1980 onwards, as that was the first year of the three point line.
#sleep 60 seconds between seasons because basketball reference has a strict per minute scrape policy
award_list = ['mvp', 'dpoy', 'smoy', 'mip']
for season_end_year in range(1980, 2024):
    scrape_year(season_end_year, award_list)
    time.sleep(60)