In [3]:
'''
We scrape from two different sources for the historical data: 
    Basketball Reference: 1979-80 to 1996-97
    Official NBA website: 1997-98 to 2023-24

There are a couple reasons why we split up the sources for this data. The main reason is to deal with timeouts when it came to scraping. 
Splitting the data between two sources decreases the likelihood that we would be timed out while testing the scraping scripts. 

The reason we landed on these specific years for the two sources has to do with the NBA's stats in particular. Unfortunately, 
the NBA does not track the general stats for players prior to the 1996-97 season. Therefore, we use basketball-reference for every season before that,
limited by the season in which the 3PT shot was introduced (1979-80)

Note: these scripts were run using a Jupyter Notebook, but for ease-of-access the scripts have been put in this python file. 

'''

# scraping basketball reference (1979-80 to 1996-97)
import pandas as pd 

# range of seasons to scrape from basketball-reference. each season is identified by
# its leading year (1979 -> 1979-80, 1996 -> 1996-97) and the end value is inclusive,
# because the scraping loop iterates up to end_season_br + 1
start_season_br = 1979 
end_season_br = 1996 

# create an empty list to store the dataframes of each season
all_seasons_data = []

# function responsible for scraping the data from basketball-reference
def fetch_basketball_reference(year):
    """Scrape one season of per-game player stats from basketball-reference.

    Parameters
    ----------
    year : int
        Leading year of the season (1979 means the 1979-80 season).

    Returns
    -------
    pandas.DataFrame or None
        The first table found on the season page, without the repeated header
        rows and without rows that lack a player name. The 'GS' and 'Pos'
        columns are removed when present and a 'YR' column holding the closing
        year of the season (year + 1) is added. None is returned, after
        printing the error, when anything goes wrong while fetching or cleaning.
    """
    # seasons are keyed by their leading year, but both the label and the page
    # address need the closing year as well
    closing_year = year + 1
    season = f"{year}-{str(closing_year)[-2:]}"     # formatted as YYYY-YY

    # let the user know which season is currently being scraped
    print(f"Fetching data from Basketball-Reference for season: {season}")

    # the per-game page is addressed by the closing year of the season
    url = f"https://www.basketball-reference.com/leagues/NBA_{closing_year}_per_game.html"

    try:
        # only the first table on the page is used
        stats = pd.read_html(url)[0]

        # the header is repeated inside the table body: those rows carry the
        # literal text 'Rk' in the 'Rk' column. rows without a player name are
        # discarded as well
        stats = stats[stats['Rk'].ne('Rk')].dropna(subset=['Player'])

        # games started ('GS') and position ('Pos') are not part of the target
        # format, so remove whichever of the two is present
        unwanted = [name for name in ('GS', 'Pos') if name in stats.columns]
        if unwanted:
            stats = stats.drop(columns=unwanted)

        # tag every row with the closing year (a 1979-80 row gets 1980)
        stats["YR"] = closing_year
        return stats

    except Exception as e:
        # report the failure and signal it to the caller with None
        print(f"Error fetching data for {season}: {e}")
        return None
    
# loop through each season within the range of 1979-80 to 1996-97
for year in range(start_season_br, end_season_br + 1):
    df = fetch_basketball_reference(year)
    if df is None:
        # the fetch function already printed the error; skip this season
        continue
    all_seasons_data.append(df)

if all_seasons_data:
    # source column name -> database column name. note that games played ('G')
    # is stored under the 'GS' name used by the database
    column_mapping = {
        "Player": "Player",
        "Age": "Age",
        "Team": "TEAM",
        "G": "GS",     
        "MP": "MP",
        "FG": "FG",
        "FGA": "FGA",
        "FG%": "FG%",
        "3P": "3P",
        "3PA": "3PA",
        "3P%": "3P%",
        "FT": "FT",
        "FTA": "FTA",
        "FT%": "FT%",
        "ORB": "ORB",
        "DRB": "DRB",
        "TRB": "TRB",
        "AST": "AST",
        "STL": "STL",
        "BLK": "BLK",
        "TOV": "TOV",
        "PF": "PF",
        "PTS": "PTS",
    }

    # column order expected by the database schema
    schema_columns = [
        "Rk", "Player", "Age", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%",
        "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL",
        "BLK", "TOV", "PF", "PTS", "YR", "TEAM"
    ]

    # stack every season into a single frame and apply the database names
    combined_df = pd.concat(all_seasons_data, ignore_index=True)
    combined_df = combined_df.rename(columns=column_mapping)

    # replace the per-season rank with a running rank over the whole frame
    combined_df['Rk'] = range(1, len(combined_df) + 1)

    # keep only the schema columns, in schema order
    combined_df = combined_df[schema_columns]

    # display the first few rows
    print(combined_df.head())

    # save to CSV
    combined_df.to_csv("nba_player_stats_basketball_reference_1979_to_1996.csv", index=False)
else:
    # nothing could be scraped, so there is nothing to combine or save
    print("No data fetched for any season.")
    

Fetching data from Basketball-Reference for season: 1979-80
Fetching data from Basketball-Reference for season: 1980-81
Fetching data from Basketball-Reference for season: 1981-82
Fetching data from Basketball-Reference for season: 1982-83
Fetching data from Basketball-Reference for season: 1983-84
Fetching data from Basketball-Reference for season: 1984-85
Fetching data from Basketball-Reference for season: 1985-86
Fetching data from Basketball-Reference for season: 1986-87
Fetching data from Basketball-Reference for season: 1987-88
Fetching data from Basketball-Reference for season: 1988-89
Fetching data from Basketball-Reference for season: 1989-90
Fetching data from Basketball-Reference for season: 1990-91
Fetching data from Basketball-Reference for season: 1991-92
Fetching data from Basketball-Reference for season: 1992-93
Fetching data from Basketball-Reference for season: 1993-94
Fetching data from Basketball-Reference for season: 1994-95
Fetching data from Basketball-Reference 

In [5]:
'''
We scrape from two different sources for the historical data: 
    Basketball Reference: 1979-80 to 1996-97
    Official NBA website: 1997-98 to 2023-24

There are a couple reasons why we split up the sources for this data. The main reason is to deal with timeouts when it came to scraping. 
Splitting the data between two sources decreases the likelihood that we would be timed out while testing the scraping scripts. 

The reason we landed on these specific years for the two sources has to do with the NBA's stats in particular. Unfortunately, 
the NBA does not track the general stats for players prior to the 1996-97 season. Therefore, we use basketball-reference for every season before that,
limited by the season in which the 3PT shot was introduced (1979-80)

Note: these scripts were run using a Jupyter Notebook, but for ease-of-access the scripts have been put in this python file. 

'''

# scraping NBA using the nba_api library (1997-98 to 2023-24)
from nba_api.stats.endpoints import leaguedashplayerstats
import pandas as pd
import time
from requests.exceptions import ReadTimeout

# range of seasons to scrape through the nba_api. each season is identified by its
# leading year (1997 -> 1997-98, 2023 -> 2023-24) and the end value is inclusive,
# because the scraping loop iterates up to end_season_nba + 1
start_season_nba = 1997  
end_season_nba = 2023    

# create an empty list to store the dataframes of each season
# (rebinding the name here starts this cell from an empty list)
all_seasons_data = []

# function responsible for scraping the data with the nba_api. set retries and delays to prevent timeouts
def fetch_nba_api_data(year, retries=3, delay=5):
    """Download one regular season of per-game player stats through nba_api.

    Parameters
    ----------
    year : int
        Leading year of the season (1997 means the 1997-98 season).
    retries : int
        Maximum number of attempts made when the request times out.
    delay : int or float
        Seconds to wait after a timed-out attempt before trying again.

    Returns
    -------
    pandas.DataFrame or None
        The first data frame returned by the endpoint, with a 'YR' column
        holding the closing year of the season (year + 1). None is returned
        when every attempt timed out, when any other error occurred (such
        errors are not retried), or when retries is 0.
    """
    # seasons are keyed by their leading year; the API wants the YYYY-YY label
    closing_year = year + 1
    season = f"{year}-{str(closing_year)[-2:]}"  # Format: YYYY-YY (e.g., 1996-97)

    # let the user know which season is currently being scraped
    print(f"Fetching data from NBA API for season: {season}")

    # attempts are counted from 1 so the number can be printed directly
    for attempt_number in range(1, retries + 1):
        try:
            # request the per-game regular season stats with a 60 second timeout
            player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
                season=season,
                season_type_all_star="Regular Season",  # Options: Regular Season, Playoffs, Pre Season, All Star
                per_mode_detailed="PerGame",  # Options: PerGame, Totals, Per36, etc.
                timeout=60
            )

            # the endpoint exposes a list of frames; only the first one is used
            season_stats = player_stats.get_data_frames()[0]

            # tag every row with the closing year (a 1997-98 row gets 1998)
            season_stats["YR"] = closing_year
            return season_stats

        except ReadTimeout as e:
            print(f"Attempt {attempt_number} failed for season {season}: {e}")
            if attempt_number >= retries:
                # that was the last allowed attempt: give up on this season
                print(f"Max retries reached for season {season}. Skipping...")
                return None
            # wait before the next attempt
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)

        except Exception as e:
            # any error other than a timeout is reported and not retried
            print(f"Error fetching data for season {season}: {e}")
            return None
    
# loop through each season for NBA API
for year in range(start_season_nba, end_season_nba + 1):
    df = fetch_nba_api_data(year)
    # pause after every request, successful or not, to avoid overwhelming the server
    time.sleep(2)
    if df is None:
        # the fetch function already printed the reason; skip this season
        continue
    all_seasons_data.append(df)

if all_seasons_data:
    # nba_api column name -> database column name. note that games played ('GP')
    # is stored under the 'GS' name used by the database
    column_mapping = {
        "PLAYER_NAME": "Player",
        "AGE": "Age",
        "TEAM_ABBREVIATION": "TEAM",
        "GP": "GS",
        "MIN": "MP",
        "FGM": "FG",
        "FGA": "FGA",
        "FG_PCT": "FG%",
        "FG3M": "3P",
        "FG3A": "3PA",
        "FG3_PCT": "3P%",
        "FTM": "FT",
        "FTA": "FTA",
        "FT_PCT": "FT%",
        "OREB": "ORB",
        "DREB": "DRB",
        "REB": "TRB",
        "AST": "AST",
        "STL": "STL",
        "BLK": "BLK",
        "TOV": "TOV",
        "PF": "PF",
        "PTS": "PTS",
    }

    # column order expected by the database schema
    schema_columns = [
        "Rk", "Player", "Age", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%",
        "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL",
        "BLK", "TOV", "PF", "PTS", "YR", "TEAM"
    ]

    # stack every season into a single frame and apply the database names
    combined_df = pd.concat(all_seasons_data, ignore_index=True)
    combined_df = combined_df.rename(columns=column_mapping)

    # add a running rank over the whole frame
    combined_df['Rk'] = range(1, len(combined_df) + 1)

    # keep only the schema columns, in schema order
    combined_df = combined_df[schema_columns]

    # display the first few rows
    print(combined_df.head())

    # save to CSV
    combined_df.to_csv("nba_player_stats_nba_api_1997_to_2023.csv", index=False)
else:
    # nothing could be scraped, so there is nothing to combine or save
    print("No data fetched for any season.")
    

Fetching data from NBA API for season: 1997-98
Fetching data from NBA API for season: 1998-99
Fetching data from NBA API for season: 1999-00
Fetching data from NBA API for season: 2000-01
Fetching data from NBA API for season: 2001-02
Fetching data from NBA API for season: 2002-03
Fetching data from NBA API for season: 2003-04
Fetching data from NBA API for season: 2004-05
Fetching data from NBA API for season: 2005-06
Fetching data from NBA API for season: 2006-07
Fetching data from NBA API for season: 2007-08
Fetching data from NBA API for season: 2008-09
Fetching data from NBA API for season: 2009-10
Fetching data from NBA API for season: 2010-11
Fetching data from NBA API for season: 2011-12
Fetching data from NBA API for season: 2012-13
Fetching data from NBA API for season: 2013-14
Fetching data from NBA API for season: 2014-15
Fetching data from NBA API for season: 2015-16
Fetching data from NBA API for season: 2016-17
Fetching data from NBA API for season: 2017-18
Fetching data

In [7]:
import pandas as pd

# load the two per-source CSV files written by the scraping cells
df_br = pd.read_csv("nba_player_stats_basketball_reference_1979_to_1996.csv")
df_nba = pd.read_csv("nba_player_stats_nba_api_1997_to_2023.csv")

# combine the DataFrames: basketball-reference rows first, then the nba_api rows
combined_df = pd.concat([df_br, df_nba], ignore_index=True)

# each source file numbers its own rows from 1, so after stacking them the "Rk"
# column would restart part-way through and contain duplicate values.
# renumber it so every row of the combined file has a unique, running rank
combined_df["Rk"] = range(1, len(combined_df) + 1)

# save the combined DataFrame to a new CSV file
combined_df.to_csv("nba_player_stats_combined_1979_to_2023.csv", index=False)
print(combined_df.head())

   Rk          Player   Age    GS    MP    FG  ...  BLK  TOV   PF   PTS    YR  TEAM
0   1   George Gervin  27.0  78.0  37.6  13.1  ...  1.0  3.3  2.7  33.1  1980   SAS
1   2   World B. Free  26.0  68.0  38.0  10.8  ...  0.5  3.4  2.9  30.2  1980   SDC
2   3  Adrian Dantley  24.0  68.0  39.3  10.7  ...  0.2  3.4  3.1  28.0  1980   UTA
3   4   Julius Erving  29.0  78.0  36.1  10.7  ...  1.8  3.6  2.7  26.9  1980   PHI
4   5    Moses Malone  24.0  82.0  38.3   9.5  ...  1.3  3.7  2.6  25.8  1980   HOU

[5 rows x 25 columns]
