# Imports

In [1]:
#from basketball_reference web_scraper 
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Scraping Player Stats for a Total Season

In [253]:
def get_player_total_season_stats(year):
    """
    Retrieves player season total stats for a given NBA season and saves them to a CSV file.
    
    This function uses an API wrapper to fetch player season total statistics for the specified NBA season, saves the data to a CSV file, drops the 'slug' column from the dataset, and then saves the updated dataset back to the CSV file.
    
    Parameters:
    -----------
    year : int
        The ending year of the NBA season for which to retrieve player statistics. For example, to get statistics for the 2022-2023 season, pass 2023 as the year.
    
    Returns:
    --------
    None
        It saves the player season total statistics to a CSV file.

    Example:
    --------
    To retrieve and save player statistics for the 2022-2023 NBA season:
    >>> get_player_total_season_stats(2023)
    """
    #from the api wrapper
    client.players_season_totals(
        season_end_year= year, 
        output_type=OutputType.CSV, 
        output_file_path=f"./total_stats/{year-1}_{year}_player_season_totals.csv"
    )
    #dropping slug column
    df = pd.read_csv(f'./total_stats/{year-1}_{year}_player_season_totals.csv')
    df.drop(columns = 'slug', inplace = True)
    df['team'].fillna('San Diego Clippers', inplace=True)
    df.to_csv(f"./total_stats/{year-1}_{year}_player_season_totals.csv", index = False)
    return df

# 2022/2023 Season Total Stats EDA

Not running the below cell again as I have already scraped the data.

In [256]:
df_total = get_player_total_season_stats(2023)

In [257]:
#eda for player totals
df_total.isnull().sum()

name                                 0
positions                            0
age                                  0
team                                 0
games_played                         0
games_started                        0
minutes_played                       0
made_field_goals                     0
attempted_field_goals                0
made_three_point_field_goals         0
attempted_three_point_field_goals    0
made_free_throws                     0
attempted_free_throws                0
offensive_rebounds                   0
defensive_rebounds                   0
assists                              0
steals                               0
blocks                               0
turnovers                            0
personal_fouls                       0
points                               0
dtype: int64

No null values.

In [237]:
df_total['team'].value_counts()

team
UTAH JAZZ                 20
WASHINGTON BULLETS        17
DETROIT PISTONS           17
INDIANA PACERS            16
NEW JERSEY NETS           16
LOS ANGELES LAKERS        15
PORTLAND TRAIL BLAZERS    15
CLEVELAND CAVALIERS       15
ATLANTA HAWKS             15
DENVER NUGGETS            15
SAN ANTONIO SPURS         15
HOUSTON ROCKETS           15
CHICAGO BULLS             15
GOLDEN STATE WARRIORS     14
PHILADELPHIA 76ERS        14
KANSAS CITY KINGS         14
NEW YORK KNICKS           13
MILWAUKEE BUCKS           12
BOSTON CELTICS            12
PHOENIX SUNS              12
SEATTLE SUPERSONICS       11
Name: count, dtype: int64

In [238]:
df_total.describe()

Unnamed: 0,age,games_played,games_started,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,26.167183,54.789474,9.393189,1350.650155,243.76161,506.256966,4.343653,15.489164,118.780186,155.455108,84.102167,166.873065,144.170279,52.609907,29.600619,104.012384,136.074303,610.647059
std,3.236521,27.282834,24.875033,947.129504,209.717531,416.488991,11.298993,32.572171,114.140147,145.123358,78.698845,159.529586,141.010017,46.415717,41.422961,78.923423,91.370827,526.854612
min,19.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.0,28.0,0.0,448.5,69.0,153.0,0.0,1.0,32.0,44.0,24.0,45.0,38.0,15.0,5.0,36.0,54.0,177.0
50%,26.0,67.0,0.0,1286.0,191.0,419.0,1.0,4.0,92.0,121.0,63.0,121.0,106.0,45.0,14.0,95.0,128.0,486.0
75%,28.0,79.5,0.0,2124.0,370.5,760.0,3.0,12.5,171.5,224.5,130.5,240.5,208.0,76.5,37.0,158.5,208.0,913.5
max,36.0,82.0,82.0,3226.0,1024.0,1940.0,90.0,239.0,572.0,783.0,573.0,864.0,832.0,265.0,280.0,359.0,328.0,2585.0


Nothing out of the ordinary.

In [239]:
df_total.shape

(323, 21)

In [240]:
df_total['name'].nunique()

286

It seems we have duplicate players if they got traded midseason. We need to combine these statistics.

In [241]:
df_total['name'].value_counts()

name
John Shumate      3
Steve Malovic     3
Kenny Carr        2
Brad Davis        2
George Johnson    2
                 ..
Bob Gross         1
Ernie Grunfeld    1
Roy Hamilton      1
James Hardy       1
Tony Zeno         1
Name: count, Length: 286, dtype: int64

In [258]:
def combine_traded_player_total_stats(df):
    """
    Combines statistics for NBA players who were traded during the season for the player season total stats dataframe.
    
    This function identifies players who were traded during the season (i.e., appear more than once in the dataset with different teams), combines their statistics into a single row per player, and updates the DataFrame accordingly.
    
    Parameters:
    -----------
    df: pandas.DataFrame
        A DataFrame containing player total stats for a season.
        
    Returns:
    --------
    df: pandas.DataFrame
        The updated DataFrame with combined statistics for traded players.
        
    Example:
    --------
    To combine the stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('2022_2023_player_season_totals.csv')
    >>> df_combined = combine_traded_player_total_stats(df)
    """
    player = df['name'].value_counts()
    multi_player = player[player > 1].index
    #create a new dataframe that just has the duplicate payers in it
    multi_df = df[df['name'].isin(multi_player)]
    df = df[df['name'].apply(lambda x: x not in multi_player)]

    #combine the stats from the duplicate players
    combo_stats = multi_df.groupby('name')[['games_played', 'games_started', 'minutes_played', 'made_field_goals', 'attempted_field_goals', 'made_three_point_field_goals', 'attempted_three_point_field_goals', 'made_free_throws', 'attempted_free_throws', 'offensive_rebounds', 'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points']].sum().reset_index()
    team_names = multi_df.groupby('name')['team'].apply(list).reset_index()
    team_names['team_1'] = team_names['team'].apply(lambda x: x[0])
    team_names['team_2'] = team_names['team'].apply(lambda x: x[1] if len(x) > 1 else None)
    team_names = team_names.drop(columns='team')

    # Merge the combined stats with team names
    combo_df = pd.merge(combo_stats, team_names, on='name')
    #only take the first position since it should be the same
    combo_positions = multi_df.groupby('name')['positions'].first().reset_index()
    #combine the dataframes
    combo_df = pd.merge(combo_df, combo_positions, on='name')
    #only take the first age since we dont want to sum them
    age = multi_df.groupby('name')['age'].first().reset_index()
    #combine dataframe
    combo_df = pd.merge(combo_df, age, on='name')
    #concat the dataframe with the combined stats with the old dataframe.
    df = pd.concat([df, combo_df], axis = 0).reset_index(drop = True)
    return df

In [259]:
df_total = combine_traded_player_total_stats(df_total)

In [260]:
df_total

Unnamed: 0,name,positions,age,team,games_played,games_started,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,team_1,team_2
0,Precious Achiuwa,CENTER,23,TORONTO RAPTORS,55,12,1140,196,404,29,...,100,228,50,31,30,59,102,508,,
1,Steven Adams,CENTER,29,MEMPHIS GRIZZLIES,42,42,1133,157,263,0,...,214,271,97,36,46,79,98,361,,
2,Bam Adebayo,CENTER,25,MIAMI HEAT,75,75,2598,602,1114,1,...,184,504,240,88,61,187,208,1529,,
3,Ochai Agbaji,SHOOTING GUARD,22,UTAH JAZZ,59,22,1209,165,386,81,...,43,78,67,16,15,41,99,467,,
4,Santi Aldama,POWER FORWARD,22,MEMPHIS GRIZZLIES,77,20,1682,247,525,94,...,85,286,97,45,48,60,143,696,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,T.J. Warren,POWER FORWARD,29,,42,0,687,132,270,22,...,26,95,40,22,12,20,65,314,BROOKLYN NETS,PHOENIX SUNS
535,Terrence Ross,SHOOTING GUARD,31,,63,9,1330,200,465,95,...,22,132,95,35,10,43,97,522,ORLANDO MAGIC,PHOENIX SUNS
536,Terry Taylor,POWER FORWARD,23,,31,2,264,39,75,5,...,27,20,10,3,6,9,26,89,INDIANA PACERS,CHICAGO BULLS
537,Thomas Bryant,CENTER,25,,59,26,1081,230,369,26,...,86,253,31,16,30,38,102,579,LOS ANGELES LAKERS,DENVER NUGGETS


In [261]:
def clean_total_stats(df):
    """
    Cleans the 'team' and 'positions' columns in the player total stats DataFrame.

    This function converts the values in the 'team' and 'positions' columns to title case to ensure consistent formatting.

    Parameters:
    -----------
    df: pandas.DataFrame
        A DataFrame containing player total stats for a season.
    Returns:
    --------
    df: pandas.DataFrame
        The updated DataFrame with the 'team' and 'positions' columns converted to title case.

    Example:
    --------
    To clean the 'team' and 'positions' columns in the DataFrame:
    >>> df = pd.read_csv('2022_2023_player_season_totals.csv')
    >>> df_cleaned = clean_total_stats(df)
    """
    #make the positions and team columns title case
    df['team'] = df['team'].str.title()
    df['team_1'] = df['team_1'].str.title()
    df['team_2'] = df['team_2'].str.title()
    df['positions'] = df['positions'].str.title()
    
    #Deal with 76Ers and correcting it to 76ers
    misspell = {
        'Philadelphia 76Ers':'Philadelphia 76ers',
        'Seattle Supersonics': 'Seattle SuperSonics'
    }
    df['team'] = df['team'].replace(misspell)
    df['team_1'] = df['team_1'].replace(misspell)
    df['team_2'] = df['team_2'].replace(misspell)
    return df

In [262]:
#clean the dataframe
df_total = clean_total_stats(df_total)


In [263]:
df_total['team'].value_counts()

team
Sacramento Kings          19
Washington Wizards        19
Portland Trail Blazers    18
San Antonio Spurs         18
Dallas Mavericks          17
Memphis Grizzlies         17
Toronto Raptors           17
Utah Jazz                 17
Milwaukee Bucks           17
Cleveland Cavaliers       16
Miami Heat                16
Boston Celtics            16
Indiana Pacers            16
Orlando Magic             16
Philadelphia 76ers        16
Charlotte Hornets         16
Golden State Warriors     16
Detroit Pistons           16
Atlanta Hawks             15
Chicago Bulls             15
New Orleans Pelicans      15
Oklahoma City Thunder     15
Minnesota Timberwolves    15
Los Angeles Clippers      14
Denver Nuggets            14
Houston Rockets           14
New York Knicks           13
Brooklyn Nets             13
Phoenix Suns              12
Los Angeles Lakers        11
Name: count, dtype: int64

# Scraping Player Advanced Stats

In [68]:
def get_player_advanced_season_stats(year):
    """
    Retrieves and processes advanced player stats for a given season.

    This function uses an API to retrieve advanced player stats for the specified season, saves the data to a CSV file, and then processes the data by removing unnecessary columns.

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the advanced player statistics.

    Returns:
    --------
    None
        It saves the player season total statistics to a CSV file.

    Example:
    --------
    To retrieve and save advanced player statistics for the 2022-2023 season:
    >>> get_player_advanced_season_stats(2023)
    """
    client.players_advanced_season_totals(
        season_end_year=year,
        output_type=OutputType.CSV,
        output_file_path=f"./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv"
)
#dropping duplicate and not needed columns 
    df = pd.read_csv(f'./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv')
    df.drop(columns = ['slug', 'positions', 'age', 'team', 'minutes_played', 'is_combined_totals'] , inplace = True)
    return df.to_csv(f"./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv", index = False)

Not running the below cell again as I have already scraped the data.

# Player Advanced Stats EDA

In [69]:
df_advanced = pd.read_csv('./advanced_season_stat_total/2022_2023_advanced_player_season_totals.csv')
df_advanced.isnull().sum()

name                             0
games_played                     0
player_efficiency_rating         0
true_shooting_percentage         0
three_point_attempt_rate         0
free_throw_attempt_rate          0
offensive_rebound_percentage     0
defensive_rebound_percentage     0
total_rebound_percentage         0
assist_percentage                0
steal_percentage                 0
block_percentage                 0
turnover_percentage              0
usage_percentage                 0
offensive_win_shares             0
defensive_win_shares             0
win_shares                       0
win_shares_per_48_minutes        0
offensive_box_plus_minus         0
defensive_box_plus_minus         0
box_plus_minus                   0
value_over_replacement_player    0
dtype: int64

No null values present.

In [70]:
df_advanced.describe()

Unnamed: 0,games_played,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,assist_percentage,steal_percentage,...,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
count,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,...,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0,609.0
mean,42.518883,13.263218,0.55992,0.405148,0.245982,5.172578,14.99688,10.082102,13.333333,1.54335,...,12.622003,18.382759,1.074713,0.990312,2.065846,0.081337,-1.374548,-0.075041,-1.450082,0.487521
std,25.081185,6.237213,0.114687,0.222474,0.181543,4.280231,6.837965,4.806279,8.735297,1.316029,...,7.884842,5.835549,1.688998,0.965262,2.460253,0.088172,3.607156,2.131476,4.774525,1.141405
min,1.0,-20.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.9,0.0,-1.6,-0.517,-22.5,-10.4,-26.5,-1.3
25%,21.0,10.0,0.522,0.268,0.138,2.1,10.7,6.7,7.3,1.0,...,9.2,14.3,0.0,0.2,0.2,0.045,-3.0,-0.9,-3.3,-0.1
50%,43.0,13.0,0.566,0.414,0.227,3.8,13.4,8.9,11.1,1.4,...,11.8,17.7,0.5,0.7,1.2,0.086,-1.4,-0.1,-1.3,0.1
75%,65.0,16.3,0.61,0.553,0.323,7.1,18.7,12.6,17.7,1.8,...,15.1,21.3,1.5,1.6,3.1,0.129,0.5,0.8,0.6,0.7
max,82.0,65.6,1.064,1.0,2.0,28.8,55.4,29.6,47.6,24.2,...,100.0,52.5,11.2,4.8,14.9,0.626,17.0,32.7,48.6,8.8


Nothing out of the ordinary.

In [71]:
df_advanced.shape

(609, 22)

In [72]:
df_advanced['name'].nunique()

539

Same issue as before. We have duplicate rows for players that were traded midseason.

In [73]:
def combine_traded_player_advanced_stats(df):
    """
    Combines advanced stats for players who were traded during a season.

    This function takes a DataFrame containing advanced player stats and combines the stats for players who were traded during the season. It then calculates the weighted average of the combined stats based on the number of games played on each team.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing advanced player statistics.

    Returns:
    --------
    pandas.DataFrame
        DataFrame with combined/weighted averaged advanced player statistics for players who were traded during the season.

    Example:
    --------
    To combine the stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('2022_2023_advanced_player_season_totals.csv')
    >>> df_combined = combine_traded_player_total_stats(df)
    """
    player = df['name'].value_counts()
    multi_player = player[player > 1].index
    #create a new dataframe that just has the duplicate payers in it
    multi_df = df[df['name'].isin(multi_player)]
    df = df[df['name'].apply(lambda x: x not in multi_player)]

    #combine the stats from the duplicate players
    combo_stats = multi_df.groupby('name')[['player_efficiency_rating', 'true_shooting_percentage', 'three_point_attempt_rate', 'free_throw_attempt_rate', 'offensive_rebound_percentage', 'defensive_rebound_percentage', 'total_rebound_percentage', 'assist_percentage', 'steal_percentage', 'block_percentage', 'turnover_percentage', 'usage_percentage', 'offensive_win_shares', 'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes', 'offensive_box_plus_minus', 'defensive_box_plus_minus', 'box_plus_minus', 'value_over_replacement_player']].apply(lambda x: x.multiply(multi_df.loc[x.index, 'games_played'], axis=0)).groupby('name').sum()
    #combine total games for each player
    total_games = multi_df.groupby('name')['games_played'].sum()
    
    #calculate the averages
    combo_stats = combo_stats.divide(total_games, axis=0).reset_index()
    #concat the dataframe with the combined stats with the old dataframe.
    df = pd.concat([df, combo_stats], axis = 0).reset_index(drop = True)
    df.drop(columns = 'games_played', inplace = True)
    return df

In [74]:
df_advanced = combine_traded_player_advanced_stats(df_advanced)

In [75]:
#combine the total and advanced stats dataframes
df = pd.merge(df_advanced, df_total, on = 'name')

In [76]:
#confirming the correct shape
df.shape

(539, 43)

# Scrape Player Per Game Stats Per Season

In [77]:
def get_per_game_stat(year):
    """
    Scrapes per-game basketball stats from Basketball Reference for a specific year and saves to as a pandas DataFrame.

    This function scrapes per-game stats of NBA players from Basketball Reference for a specified year. It retrieves stats such as player name, minutes per game (mpg), field goals made per game (fgm_per_g),
    field goals attempted per game (fga_per_g), field goal percentage (fg_pct), three-point field goals made per game (fg3m_per_g), three-point field goals attempted per game (fg3a_per_g), three-point field goal percentage (fg3_pct),
    two-point field goals made per game (fg2m_per_g), two-point field goals attempted per game (fg2a_per_g), two-point field goal percentage (fg2_pct), effective field goal percentage (efg_pct), free throws made per game (ftm_per_g), 
    free throws attempted per game (fta_per_g), free throw percentage (ft_pct), offensive rebounds per game (orb_per_g), defensive rebounds per game (drb_per_g), total rebounds per game (trb_per_g), assists per game (ast_per_g),
    steals per game (stl_per_g), blocks per game (blk_per_g), turnovers per game (tov_per_g), personal fouls per game (pf_per_g), and points per game (ppg).

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the per-game stats.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing per-game stats for NBA players in the specified year.

    Example:
    --------
    To scrape per-game stats for the year 2022-2023:
    >>> per_game_stats_2022 = get_per_game_stat(2023)
    """
    req = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html')
    soup = BeautifulSoup(req.content, 'html.parser')
    per_game_stat = []
    
    #the stats that I want in my dataframe
    stats = {
        'player': 'name',
        'mp_per_g' : 'mpg',
        'fg_per_g' : 'fgm_per_g',
        'fga_per_g' : 'fga_per_g',
        'fg_pct' : 'fg_pct',
        'fg3_per_g' : 'fg3m_per_g',
        'fg3a_per_g' : 'fg3a_per_g',
        'fg3_pct' : 'fg3_pct',
        'fg2_per_g' : 'fg2m_per_g',
        'fg2a_per_g' : 'fg2a_per_g',
        'fg2_pct' : 'fg2_pct',
        'efg_pct' : 'efg_pct',
        'ft_per_g' : 'ftm_per_g',
        'fta_per_g' : 'fta_per_g',
        'ft_pct' : 'ft_pct',
        'orb_per_g': 'orb_per_g',
        'drb_per_g' : 'drb_per_g',
        'trb_per_g' : 'trb_per_g',
        'ast_per_g' : 'ast_per_g',
        'stl_per_g' : 'stl_per_g',
        'blk_per_g' : 'blk_per_g',
        'tov_per_g' : 'tov_per_g',
        'pf_per_g' : 'pf_per_g',
        'pts_per_g' : 'ppg',
        #added team so that I can deal with traded players data
        'team_id': 'team'
    }
        
    #finding each player and their specific stat
    for stat in soup.find('tbody').find_all('tr'):
        per_game_stat_dict = {}
        player = stat.find('td', {'data-stat':'player'})
        #need to add if statement so it only recognizes player names that are filled out
        if player:
            player_found = player.find('a')
            if player_found:
                per_game_stat_dict['name'] = player_found.getText()
        #need to add a for loop to loop through the stats and find each key in the html dictionary
        for key, value in stats.items():
            if key != 'player':
                stat_value = stat.find('td', {'data-stat': key})
                if stat_value:
                    per_game_stat_dict[value] = stat_value.getText()  
        per_game_stat.append(per_game_stat_dict)
    #create a dataframe of our stats and drop any null values
    df_per_game = pd.DataFrame(per_game_stat)
    df_per_game.reset_index(drop=True, inplace=True)
    df_per_game.to_csv(f'./per_game_stat/per_game_stat_{year-1}_{year}.csv', index = False)
    return df_per_game

# EDA on Per Game Stats

Not running the below cell again as I have already scraped the data.

In [78]:
df_per = pd.read_csv('./per_game_stat/per_game_stat_2022_2023.csv')

In [79]:
def combine_traded_player_per_game_stats(df):
    """
    Combines per-game stats of players who were traded during the season.

    This function takes a DataFrame containing per-game stats and combines the stats of players who were traded during the season. It identifies players with multiple entries (indicating they were traded) 
    and combines their stats into a single entry.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing per-game stats of NBA players.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing per-game stats with combined stats for traded players.

    Example:
    --------
    To combine the per-game stats for players who were traded during the 2022-2023 season:
    >>> df = pd.read_csv('per_game_stat_2022_2023.csv')
    >>> df_combined = combine_traded_player_per_game_stats(df)
    """
    #Identify players that show up multiple times
    player = df['name'].value_counts()
    multi_player = player[player > 1].index
    #create a new dataframe that just has the duplicate payers in it
    multi_df = df[df['name'].isin(multi_player)]
    df = df[df['name'].apply(lambda x: x not in multi_player)]
    grouped = multi_df.groupby('name')
    
    total_list = []
    #If the team is "total" then grab those values
    for player, stats in grouped:
        if 'TOT' in stats['team'].values:
            total = stats[stats['team'] == 'TOT']
            total_list.append(total)
            
    total = pd.concat(total_list, axis = 0)
    
    df = pd.concat([total, df], axis = 0).reset_index(drop = True)
    df.drop(columns = 'team', inplace = True)
    return df

In [80]:
df_per = combine_traded_player_per_game_stats(df_per)

# Scrape NBA Team Data

In [321]:
req = requests.get(f'https://www.basketball-reference.com/leagues/NBA_1980.html')
soup = BeautifulSoup(req.content, 'html.parser')
soup


<!DOCTYPE html>

<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
<link href="https://cdn.ssref.net/req/202406111" rel="dns-prefetch"/>
<!-- yes-inmobi-ssi -->
<!-- InMobi Choice. Consent Manager Tag v3.0 (for TCF 2.2) -->
<script async="true" type="text/javascript">
(function() {
  var host = window.location.hostname;
  var element = document.createElement('script');
  var firstScript = document.getElementsByTagName('script')[0];
  var url = 'https://cmp.inmobi.com'
    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js?tag_version=V3');
  var uspTries = 0;
  var uspTriesLimit = 3;
  element.async = true;
  element.type = 'text/javascript';
  element.src = url;

  firstScript.parentNode.insertBefore(element, firstScript);

  function makeStub() {
    var TCF_LOCATOR_NAM

In [62]:
req = requests.get(f'https://www.basketball-reference.com/leagues/NBA_1980.html')
soup = BeautifulSoup(req.content, 'html.parser')
team_stats = []
table_ids = ['per_game-team']
    #Loop through both conferences
for tables in table_ids:
    table = soup.find('table', {'id': tables})
    if table:
        headers = [th.getText() for th in table.find('thead').find_all('th')]
        # Looping through the team names
        for stat in table.find('tbody').find_all('tr'):
            team_stat_dicts = {}
            team_name_tag = stat.find('a')
            if team_name_tag:
                team_name = re.sub(r'[^\w\s]+', '', team_name_tag.text)
                team_stat_dicts['team'] = team_name
                #Loop through the stats for each team
                for header in headers:
                    if header.lower() == 'team':
                        continue
                    header = header.lower().replace(' ', '_').replace('%', 'pct')
                    stat_value = stat.find('td', {'data-stat': header})
                    if stat_value:
                        team_stat_dicts[header] = stat_value.getText()
                #Append back to our list
                team_stats.append(team_stat_dicts)
                

In [108]:
def scrape_team_data(year):
    #Scrape Basktball Reference
    req = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{year}.html')
    soup = BeautifulSoup(req.content, 'html.parser')
    team_stat = []
    
    #Specify the stats we are interested in
    stats = {
        'wins': 'wins',
        'losses': 'losses',
        'win_loss_pct': 'win_loss_pct',
        'pts_per_g': 'team_pts_per_g',
        'opp_pts_per_g': 'opp_pts_per_g',
        'srs': 'team_simple_rating_system'
    }
    
    champion = None
    for p in soup.find_all('p'):
        if 'League Champion' in p.text:
            champion = p.find('a').text
    
    #Loop through both conferences
    for conference in ['E', 'W']:
        conference_table = soup.find('table', {'id': f'divs_standings_{conference}'})
        # Looping through the team names
        if conference_table:
            for stat in conference_table.find('tbody').find_all('tr'):
                team_stat_dict = {}
                team_name_tag = stat.find('a')
                if team_name_tag:
                    team_name = team_name_tag.text
                    team_stat_dict['team'] = team_name
                #Loop through the stats for each team
                    for key, value in stats.items():
                        stat_value = stat.find('td', {'data-stat': key})
                        if stat_value:
                            team_stat_dict[value] = stat_value.getText()
                        else:
                            team_stat_dict[key] = None
                #Append back to our list
                    team_stat.append(team_stat_dict)
    
    team_stats = []
    table_ids = ['per_game-team', 'totals-team']
    #Loop through both conferences
    for tables in table_ids:
        table = soup.find('table', {'id': tables})
        if table:
            headers = [th.getText() for th in table.find('thead').find_all('th')]
            # Looping through the team names
            for stat in table.find('tbody').find_all('tr'):
                team_stat_dicts = {}
                team_name_tag = stat.find('a')
                if team_name_tag:
                    team_name = re.sub(r'[^\w\s]+', '', team_name_tag.text)
                    team_stat_dicts['team'] = team_name
                    #Loop through the stats for each team
                    for header in headers:
                        if header.lower() == 'team':
                            continue
                        header = header.lower().replace(' ', '_').replace('%', 'pct')
                        stat_value = stat.find('td', {'data-stat': header})
                        if stat_value:
                            team_stat_dicts[header] = stat_value.getText()
                #Append back to our list
                    team_stats.append(team_stat_dicts)
    
    #Create dataframe
    df_team = pd.DataFrame(team_stat)
    df_per_game_team = pd.DataFrame(team_stats)
    df_team = pd.merge(df_team, df_per_game_team, on = 'team')
    # Add a column for the champion
    df_team['champion'] = df_team['team'].apply(lambda x: 1 if x == champion else 0)
    df_team.reset_index(drop=True, inplace=True)
    df_team.to_csv(f'./team_stats/team_stats_{year-1}_{year}.csv', index=False)
    return df_team

def scrape_team_data(year):
    #Scrape Basktball Reference
    req = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{year}.html')
    soup = BeautifulSoup(req.content, 'html.parser')
    
    champion = None
    for p in soup.find_all('p'):
        if 'League Champion' in p.text:
            champion = p.find('a').text
    
    team_stats = []
    table_ids = ['divs_standings_E', 'divs_standings_W', 'per_game-team']
    #Loop through both conferences
    for table_id in table_ids:
        table = soup.find('table', {'id': table_id})
        # Looping through the team names
        if table:
            headers = [th.getText() for th in table.find('thead').find_all('th')]
            for stat in table.find('tbody').find_all('tr'):
                team_stat_dicts = {}
                team_name_tag = stat.find('a')
                if team_name_tag:
                    team_name = re.sub(r'[^\w\s]+', '', team_name_tag.text)
                    team_stat_dicts['team'] = team_name
                #Loop through the stats for each team
                    for header in headers:
                        if header.lower() == 'team':
                            continue
                        header = header.lower().replace(' ', '_').replace('%', 'pct')
                        stat_value = stat.find('td', {'data-stat': header})
                        if stat_value:
                            team_stat_dicts[header] = stat_value.getText()
                #Append back to our list
                team_stats.append(team_stat_dicts)
    
    #Create dataframe
    df_team = pd.DataFrame(team_stats)
    # Add a column for the champion
    df_team['champion'] = df_team['team'].apply(lambda x: 1 if x == champion else 0)
    df_team.reset_index(drop=True, inplace=True)
    df_team.to_csv(f'./team_stats/team_stats_{year-1}_{year}.csv', index=False)
    return df_team

In [109]:
scrape_team_data(2005)

Unnamed: 0,team,wins,losses,win_loss_pct,team_pts_per_g,opp_pts_per_g,team_simple_rating_system,g,mp,fg,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,champion
0,Boston Celtics,45,37,0.549,101.3,100.4,0.35,82,242.4,37.1,...,11.1,29.7,40.8,22.1,8.1,5.2,15.8,24.4,101.3,0
1,Boston Celtics,45,37,0.549,101.3,100.4,0.35,82,19880.0,3046.0,...,909.0,2438.0,3347.0,1810.0,667.0,423.0,1297.0,2000.0,8304.0,0
2,Philadelphia 76ers,43,39,0.524,99.1,99.9,-1.07,82,242.1,35.9,...,11.1,30.9,42.0,20.9,9.2,3.9,15.5,22.9,99.1,0
3,Philadelphia 76ers,43,39,0.524,99.1,99.9,-1.07,82,19855.0,2946.0,...,909.0,2536.0,3445.0,1710.0,756.0,321.0,1274.0,1878.0,8128.0,0
4,New Jersey Nets,42,40,0.512,91.4,92.9,-1.82,82,242.7,33.6,...,10.4,29.1,39.5,21.6,7.9,3.8,14.2,24.2,91.4,0
5,New Jersey Nets,42,40,0.512,91.4,92.9,-1.82,82,19905.0,2753.0,...,855.0,2387.0,3242.0,1772.0,650.0,308.0,1164.0,1985.0,7496.0,0
6,Toronto Raptors,33,49,0.402,99.7,101.4,-1.81,82,241.5,36.0,...,10.3,29.8,40.1,20.4,7.6,3.9,13.3,22.9,99.7,0
7,Toronto Raptors,33,49,0.402,99.7,101.4,-1.81,82,19805.0,2952.0,...,844.0,2444.0,3288.0,1670.0,621.0,317.0,1087.0,1876.0,8178.0,0
8,New York Knicks,33,49,0.402,97.3,99.7,-2.72,82,242.4,36.3,...,11.8,29.2,41.0,20.3,7.7,3.2,14.7,23.7,97.3,0
9,New York Knicks,33,49,0.402,97.3,99.7,-2.72,82,19880.0,2978.0,...,965.0,2393.0,3358.0,1665.0,629.0,260.0,1204.0,1942.0,7977.0,0


# Combine to Make One DataFrame

In [83]:
#combine the total and per year stats dataframes
df = pd.merge(df_per, df, on = 'name')

In [84]:
#Need to one hot encode positions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 66 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   name                               539 non-null    object 
 1   mpg                                539 non-null    float64
 2   fgm_per_g                          539 non-null    float64
 3   fga_per_g                          539 non-null    float64
 4   fg_pct                             537 non-null    float64
 5   fg3m_per_g                         539 non-null    float64
 6   fg3a_per_g                         539 non-null    float64
 7   fg3_pct                            523 non-null    float64
 8   fg2m_per_g                         539 non-null    float64
 9   fg2a_per_g                         539 non-null    float64
 10  fg2_pct                            534 non-null    float64
 11  efg_pct                            537 non-null    float64

In [85]:
df = pd.get_dummies(df, columns=['positions'], dtype = int)
df

Unnamed: 0,name,mpg,fgm_per_g,fga_per_g,fg_pct,fg3m_per_g,fg3a_per_g,fg3_pct,fg2m_per_g,fg2a_per_g,...,turnovers,personal_fouls,points,team_1,team_2,positions_Center,positions_Point Guard,positions_Power Forward,positions_Shooting Guard,positions_Small Forward
0,A.J. Lawson,7.2,1.5,2.9,0.500,0.7,1.7,0.400,0.8,1.3,...,3,11,56,Minnesota Timberwolves,Dallas Mavericks,0,0,0,1,0
1,Bones Hyland,19.4,4.1,10.2,0.399,2.1,5.6,0.371,2.0,4.6,...,82,96,659,Denver Nuggets,Los Angeles Clippers,0,1,0,0,0
2,Bruno Fernando,10.4,1.5,2.9,0.527,0.0,0.1,0.000,1.5,2.8,...,25,74,153,Houston Rockets,Atlanta Hawks,1,0,0,0,0
3,Cam Reddish,24.8,3.5,7.8,0.446,1.1,3.6,0.313,2.4,4.2,...,47,72,387,New York Knicks,Portland Trail Blazers,0,0,0,0,1
4,Cameron Johnson,28.5,5.3,11.3,0.470,2.5,6.1,0.404,2.9,5.2,...,37,83,650,Phoenix Suns,Brooklyn Nets,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Thaddeus Young,14.7,2.0,3.7,0.545,0.1,0.6,0.176,1.9,3.0,...,42,88,240,,,0,0,1,0,0
535,Trae Young,34.8,8.2,19.0,0.429,2.1,6.3,0.335,6.1,12.7,...,300,104,1914,,,0,1,0,0,0
536,Omer Yurtseven,9.2,1.8,3.0,0.593,0.3,0.8,0.429,1.4,2.2,...,4,16,40,,,1,0,0,0,0
537,Cody Zeller,14.5,2.5,3.9,0.627,0.0,0.1,0.000,2.5,3.8,...,14,33,98,,,1,0,0,0,0


In [86]:
#merge the players who werent traded with the df_team
df_merge = pd.merge(df.dropna(subset=['team']), df_team, on='team', how = 'outer')

In [87]:
def combine_traded_player_team_data(df, df_team, df_merge):
    df_traded = df[df['team'].isna()]
    # Merge player_df with team_stats_df for both team_1 and team_2
    df_traded = pd.merge(df_traded, df_team, left_on='team_1', right_on='team')
    df_traded = pd.merge(df_traded, df_team, left_on='team_2', right_on='team', suffixes=('', '_team_2'))

    columns = ['wins', 'wins_team_2', 'losses', 'losses_team_2', 'win_loss_pct', 'win_loss_pct_team_2', 'team_pts_per_g', 'team_pts_per_g_team_2', 'opp_pts_per_g', 'opp_pts_per_g_team_2', 'team_simple_rating_system', 'team_simple_rating_system_team_2']
    df_traded[columns] = df_traded[columns].astype(float)

    # Calculate the average stats
    df_traded['wins'] = (df_traded['wins'] + df_traded['wins_team_2']) / 2
    df_traded['losses'] = (df_traded['losses'] + df_traded['losses_team_2']) / 2
    df_traded['win_loss_pct'] = (df_traded['win_loss_pct'] + df_traded['win_loss_pct_team_2']) / 2
    df_traded['team_pts_per_g'] = (df_traded['team_pts_per_g'] + df_traded['team_pts_per_g_team_2']) / 2
    df_traded['opp_pts_per_g'] = (df_traded['opp_pts_per_g'] + df_traded['opp_pts_per_g_team_2']) / 2
    df_traded['team_simple_rating_system'] = (df_traded['team_simple_rating_system'] + df_traded['team_simple_rating_system_team_2']) / 2

    df_traded['team'] = df_traded.apply(lambda row: ', '.join([str(row['team_1']), str(row['team_2'])]), axis=1)
    df_traded.drop(columns = ['team_1', 'team_2', 'team_y', 'wins_team_2', 'losses_team_2', 'win_loss_pct_team_2', 'team_pts_per_g_team_2', 'opp_pts_per_g_team_2', 'team_simple_rating_system_team_2'], inplace = True)

    #concat the df merge with the players who were traded
    df_merge = pd.concat([df_merge, df_traded])
    df_merge.drop(columns = ['team_x', 'team_1', 'team_2'], inplace = True)
    return df_merge

In [88]:
df_merge = combine_traded_player_team_data(df, df_team, df_merge)

In [92]:
df_merge

Unnamed: 0,name,mpg,fgm_per_g,fga_per_g,fg_pct,fg3m_per_g,fg3a_per_g,fg3_pct,fg2m_per_g,fg2a_per_g,...,positions_Point Guard,positions_Power Forward,positions_Shooting Guard,positions_Small Forward,wins,losses,win_loss_pct,team_pts_per_g,opp_pts_per_g,team_simple_rating_system
0,Precious Achiuwa,20.7,3.6,7.3,0.485,0.5,2.0,0.269,3.0,5.4,...,0,0,0,0,41,41,.500,112.9,111.4,1.59
1,OG Anunoby,35.6,6.3,13.2,0.476,2.1,5.5,0.387,4.2,7.7,...,0,0,0,1,41,41,.500,112.9,111.4,1.59
2,Dalano Banton,9.0,1.8,4.2,0.423,0.5,1.6,0.294,1.3,2.5,...,1,0,0,0,41,41,.500,112.9,111.4,1.59
3,Scottie Barnes,34.8,6.0,13.2,0.456,0.8,2.9,0.281,5.2,10.3,...,0,0,0,1,41,41,.500,112.9,111.4,1.59
4,Khem Birch,8.1,1.0,1.6,0.594,0.1,0.1,0.500,0.9,1.5,...,0,0,0,0,41,41,.500,112.9,111.4,1.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Will Barton,17.7,2.5,6.5,0.379,1.2,3.2,0.367,1.3,3.3,...,0,0,1,0,38.0,44.0,0.4635,113.05,112.9,0.265
66,Josh Richardson,23.5,3.7,8.5,0.431,1.6,4.5,0.365,2.0,4.0,...,0,0,1,0,32.0,50.0,0.39,113.7,117.8,-4.095
67,Kessler Edwards,10.7,1.0,2.6,0.387,0.5,1.5,0.309,0.5,1.1,...,0,0,0,1,46.5,35.5,0.567,117.05,115.3,1.665
68,Kevin Love,20.0,2.7,6.8,0.389,1.6,4.8,0.334,1.1,2.1,...,0,1,0,0,47.5,34.5,0.5795,110.9,108.35,2.55


# Scraping the all_nba Category

In [93]:
req = requests.get(f'https://www.basketball-reference.com/awards/all_league.html')
soup = BeautifulSoup(req.content, 'html.parser')
awards = []

# Extracting each player and their specific season
for row in soup.find('tbody').find_all('tr'):
    season = row.find('th', {'data-stat': 'season'})
    #need to make sure the season is there in order to get text and avoid errors
    if season:
        seasons = season.get_text()
    team = row.find('td', {'data-stat': 'all_team'})
    #need to make sure the team is there in order to get text and avoid errors
    if team:
        team_text = team.get_text()
        #was getting some blank values so needed to make sure there was something there otherwise it would be a null value that I can drop later
        if team_text:
            teams = team_text
     
    #the player were set up and counted from 1-15 so had to loop through it
    for i in range(1, 16):
        player = row.find('td', {'data-stat': str(i)})
        #same need to make sure the team is there in order to get text and avoid errors
        if player:
            player_found = player.find('a')
            if player_found:
                player_name = player_found.get_text()
                #need 
                player_dict = {
                    'season' : seasons,
                    'team' : teams,
                    'name' : player_name}
                #appending back to my list
                awards.append(player_dict)
#create a dataframe of list
df_all_nba = pd.DataFrame(awards)
df_all_nba.dropna(inplace = True)
df_all_nba = pd.get_dummies(df_all_nba, columns= ['team'], dtype = int)
df_all_nba

Unnamed: 0,season,name,team_1st,team_2nd,team_3rd
0,2023-24,Nikola Jokić,1,0,0
1,2023-24,Giannis Antetokounmpo,1,0,0
2,2023-24,Jayson Tatum,1,0,0
3,2023-24,Luka Dončić,1,0,0
4,2023-24,Shai Gilgeous-Alexander,1,0,0
...,...,...,...,...,...
1045,1946-47,Frankie Baumholtz,0,1,0
1046,1946-47,Ernie Calverley,0,1,0
1047,1946-47,Chick Halbert,0,1,0
1048,1946-47,John Logan,0,1,0


In [94]:
def get_award(award_list):
    """
    Scrape basketball reference for player awards.

    This function takes a list of basketball awards and scrapes basketball-reference.com to retrieve the players who received those awards for each season.

    Parameters:
    -----------
    award_list : list
        A list of strings containing the names of awards.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing information about the players who received the specified awards for each season. The DataFrame has columns for 'season' (season in which the award was received),
        'name' (name of the player receiving the award), and each award in `award_list`, where a value of 1 indicates that the player received the award for that season, and 0 indicates
        that they did not receive the award.

    Example:
    --------
    To scrape basketball-reference.com for MVP and Rookie of the Year awards:
    >>> awards_df = get_award(['mvp', 'roy'])
    """
    all_awards = pd.DataFrame()
    for award in award_list:
        req = requests.get(f'https://www.basketball-reference.com/awards/{award}.html')
        soup = BeautifulSoup(req.content, 'html.parser')
        awards = []
    
        # Extracting each player and their specific season
        for row in soup.find('tbody').find_all('tr'):
            award_dict = {}
            season = row.find('th', {'data-stat': 'season'})
            if season:
                award_dict['season'] = season.get_text()

            player = row.find('td', {'data-stat': 'player'})
            if player:
                player_found = player.find('a')
                if player_found:
                    award_dict['name'] = player_found.getText()
            #appending back to my list
            awards.append(award_dict)
        #create a dataframe of list
        df = pd.DataFrame(awards)
        df.reset_index(drop=True, inplace=True)
        #adding a binary column to match award
        df[award.upper()] = 1
        all_awards = pd.concat([all_awards, df], ignore_index=True)
        all_awards.fillna(0, inplace = True)
        df[award.upper()].astype(int)
    return all_awards

In [95]:
#get awards dataframe
awards = ['mvp', 'dpoy', 'smoy', 'mip']
df = get_award(awards)

In [96]:
#merge all_nba and awards dataframes
all_awards = pd.merge(df, df_all_nba, how = 'outer', on = ['season', 'name']).fillna(0)
all_awards

Unnamed: 0,season,name,MVP,DPOY,SMOY,MIP,team_1st,team_2nd,team_3rd
0,2023-24,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2022-23,Joel Embiid,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2021-22,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2020-21,Nikola Jokić,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2019-20,Giannis Antetokounmpo,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1132,1946-47,Frankie Baumholtz,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1133,1946-47,Ernie Calverley,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1134,1946-47,Chick Halbert,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1135,1946-47,John Logan,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Scrape Per Year

In [264]:
def scrape_year(year, award_list):
    """
    Scrape and combine basketball player statistics and awards for a specific year.

    This function scrapes and combines player statistics and awards for a specific year, including total stats, advanced stats, and per game stats. It then merges these dataframes and saves the result to a CSV file.

    Parameters:
    -----------
    year : int
        The ending year of the season for which to retrieve the data to be scraped and combined.
    award_list : list of str
        A list of strings containing the names of awards.

    Returns:
    --------
    pandas.DataFrame
        The combined dataframe containing player statistics and awards for the specified year.

    Example:
    --------
    To scrape and combine player stats and the MVP and Rookie of the Year awards for the 2022-2023 season:
    >>> scrape_year(2023, ['mvp', 'roy'])
    """
    #scrape and clean total stats
    get_player_total_season_stats(year)
    df_total = pd.read_csv(f"./total_stats/{year-1}_{year}_player_season_totals.csv")
    df_total = combine_traded_player_total_stats(df_total)
    df_total = clean_total_stats(df_total)
    #scrape and clean advanced stats
    get_player_advanced_season_stats(year)
    df_advanced = pd.read_csv(f"./advanced_season_stat_total/{year-1}_{year}_advanced_player_season_totals.csv")
    df_advanced = combine_traded_player_advanced_stats(df_advanced)
    #scrape and clean per year stats
    get_per_game_stat(year)
    df_per = pd.read_csv(f'./per_game_stat/per_game_stat_{year-1}_{year}.csv')
    df_per = combine_traded_player_per_game_stats(df_per)
    
    #combine the total and advanced stats dataframes
    df = pd.merge(df_advanced, df_total, on = 'name')
    #combine the big dataframe and per year stats dataframes
    df = pd.merge(df_per, df, on = 'name')
    df = pd.get_dummies(df, columns=['positions'], dtype = int)
    #Combine team stats with big dataframe
    df_team = scrape_team_data(year)
    df_merge = pd.merge(df.dropna(subset=['team']), df_team, on='team', how = 'outer')
    df = combine_traded_player_team_data(df, df_team, df_merge)
    
    
    #adding season column for merge purposes with awards
    df['season'] = f'{year-1}-{str(year)[-2:]}'
    
    #for award in award_list:
    #scrape awards
     #   accolade = get_award(award_list)
   # all_awards = pd.merge(accolade, df_all_nba, how = 'outer', on = ['season', 'name']).fillna(0)
    #merge award and big df
   # df = pd.merge(df, all_awards, how = 'left', on = ['season', 'name'])
    df.fillna(0, inplace = True)    
    
    df.to_csv(f'./concat_df/total_{year-1}_{year}.csv', index = False)
    return df

In [266]:
scrape_year(1985, ['mvp', 'dpoy', 'smoy', 'mip'])

Unnamed: 0,name,mpg,fgm_per_g,fga_per_g,fg_pct,fg3m_per_g,fg3a_per_g,fg3_pct,fg2m_per_g,fg2a_per_g,...,positions_Power Forward,positions_Shooting Guard,positions_Small Forward,wins,losses,win_loss_pct,team_pts_per_g,opp_pts_per_g,team_simple_rating_system,season
0,Kareem Abdul-Jabbar,33.3,9.2,15.3,0.599,0.0,0.0,0.000,9.2,15.3,...,0,0,0,62,20,.756,118.2,110.9,6.48,1984-85
1,Michael Cooper,26.7,3.4,7.2,0.465,0.4,1.5,0.285,2.9,5.7,...,0,1,0,62,20,.756,118.2,110.9,6.48,1984-85
2,Magic Johnson,36.1,6.5,11.7,0.561,0.1,0.5,0.189,6.5,11.2,...,0,0,0,62,20,.756,118.2,110.9,6.48,1984-85
3,Earl Jones,3.5,0.0,0.5,0.000,0.0,0.0,0.000,0.0,0.5,...,0,0,0,62,20,.756,118.2,110.9,6.48,1984-85
4,Mitch Kupchak,12.3,2.1,4.2,0.504,0.0,0.0,0.000,2.1,4.2,...,1,0,0,62,20,.756,118.2,110.9,6.48,1984-85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,Kenny Natt,3.6,0.3,0.8,0.333,0.0,0.0,0.000,0.3,0.8,...,0,1,0,36.0,46.0,0.439,111.9,113.3,-1.52,1984-85
16,Ron Brewer,16.3,3.1,5.9,0.525,0.0,0.1,0.000,3.1,5.8,...,0,1,0,41.5,40.5,0.506,112.15,111.55,0.635,1984-85
17,Michael Wilson,14.1,1.9,4.1,0.468,0.0,0.0,0.000,1.9,4.1,...,0,1,0,39.0,43.0,0.4755,109.05,110.25,-0.815,1984-85
18,Larry Micheaux,9.9,1.6,2.8,0.580,0.0,0.1,0.000,1.6,2.7,...,1,0,0,53.5,28.5,0.6525,111.05,106.75,4.035,1984-85


Not running the below cell again as I have already scraped the data needed.

In [267]:
#scraping every season since 1979-1980 as that was the first year of the three point line. need to make the sleep time 60 because basketball reference has a strict per minute scrape policy
for i in range(1980, 2024):
    award_list = ['mvp', 'dpoy', 'smoy', 'mip']
    scrape_year(i, award_list)
    time.sleep(60)