<h1><center>NBA Game Attendance Prediction Main Data Collection</center></h1>

This is a separate data collection notebook for the NBA Game Attendance Prediction project. This was done separately so that the large amount of data would only have to be collected once.

In [None]:
#analytics stack
import pandas as pd
from pandasql import sqldf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import datetime 
import geopy.distance

#nba_api package
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import leaguegamelog, boxscoresummaryv2

#meteostat weather package
from meteostat import Point, Daily, units

### NBA Box Score and Attendance Data Collection

In [None]:
#helpers that tag every column label of a game_data row as home or away
def col_home(df):
    """Append '_HOME' to every column label of ``df`` in place and return ``df``."""
    suffix = '_HOME'
    df.columns = list(map(lambda label: label + suffix, df.columns))
    return df

def col_away(df):
    """Append '_AWAY' to every column label of ``df`` in place and return ``df``."""
    suffix = '_AWAY'
    df.columns = list(map(lambda label: label + suffix, df.columns))
    return df

#function to get and transform game data by season
def game_data(seasons):
    """Return one row per game with home and away team stats side by side.

    Parameters
    ----------
    seasons
        Season identifier, forwarded unchanged to
        ``leaguegamelog.LeagueGameLog(season=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns are ``GAME_ID``, ``SEASON_ID``, ``GAME_DATE``, followed by
        every remaining game-log column suffixed ``_HOME`` and then the same
        columns suffixed ``_AWAY``. Rows keep the order in which the home
        rows appear in the game log and carry a fresh 0..n-1 index.

    Notes
    -----
    The two rows of a game are matched on ``GAME_ID`` rather than on being
    adjacent in the log, so the result does not depend on the row order the
    API returns. A game that lacks either its home or its away row is
    dropped instead of shifting every later pairing.
    """
    #get game data from nba api
    games = leaguegamelog.LeagueGameLog(season=seasons, timeout = 100).get_data_frames()[0]
    games['TEAM_ID'] = games['TEAM_ID'].astype(str)

    #columns describing the game itself; everything else is per-team
    id_cols = ['GAME_ID', 'SEASON_ID', 'GAME_DATE']
    stat_cols = [c for c in games.columns if c not in id_cols]

    #a MATCHUP containing "vs" marks the home team's row; the other row is the away team
    is_home = games['MATCHUP'].str.contains('vs', regex=False, na=False)
    home_rows = games[is_home].drop_duplicates('GAME_ID').set_index('GAME_ID')
    away_rows = games[~is_home].drop_duplicates('GAME_ID').set_index('GAME_ID')

    #rename the per-team columns (copies, so the slices of `games` are not modified)
    home_stats = col_home(home_rows[stat_cols].copy())
    away_stats = col_away(away_rows[stat_cols].copy())

    #combine both sides on GAME_ID; the inner join keeps the home rows' order
    df = (
        home_rows[['SEASON_ID', 'GAME_DATE']]
        .join(home_stats)
        .join(away_stats, how='inner')
        .reset_index()
    )

    cols = id_cols + list(home_stats.columns) + list(away_stats.columns)
    return df[cols]

#### 2022-2023 Season

In [None]:
# build the one-row-per-game table for the season requested as 2022
games_2022 = game_data(2022)

In [None]:
# First pass: request the box-score summary of every game and keep its
# fifth data frame (index 4), tagged with the game id.
missed_games = []     # ids whose first request raised
attendance_dfs = []
for game_id in games_2022['GAME_ID']:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        attendance_dfs.append(b)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        # track games that returned no response object
        missed_games.append(game_id)
        print(f'passed {len(missed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# check if missed games were due to timeout or faulty data
print(f'DOING {len(missed_games)} MISSED GAMES')
missed_dfs = []
failed_games = []     # ids that failed again on the retry
for game_id in missed_games:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        missed_dfs.append(b)
    except Exception as err:
        # record in a separate list: appending to missed_games while iterating
        # over it would retry a permanently failing game forever
        failed_games.append(game_id)
        print(f'passed {len(failed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# pd.concat raises on an empty list, e.g. when no game needed a retry
missed_games_2022 = pd.concat(missed_dfs) if missed_dfs else pd.DataFrame()
attendance_2022 = pd.concat(attendance_dfs) if attendance_dfs else pd.DataFrame()

In [None]:
# stack first-pass and retry results, parse GAME_DATE, order by date, save
final_2022 = (
    pd.concat([attendance_2022, missed_games_2022])
    .assign(GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']))
    .sort_values('GAME_DATE')
)
final_2022.to_csv('data/attendance_2022.csv')

#### 2021-2022 Season

In [None]:
# build the one-row-per-game table for the season requested as 2021
games_2021 = game_data(2021)

In [None]:
# First pass: request the box-score summary of every game and keep its
# fifth data frame (index 4), tagged with the game id.
missed_games = []     # ids whose first request raised
attendance_dfs = []
for game_id in games_2021['GAME_ID']:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        attendance_dfs.append(b)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        # track games that returned no response object
        missed_games.append(game_id)
        print(f'passed {len(missed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# check if missed games were due to timeout or faulty data
print(f'DOING {len(missed_games)} MISSED GAMES')
missed_dfs = []
failed_games = []     # ids that failed again on the retry
for game_id in missed_games:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        missed_dfs.append(b)
    except Exception as err:
        # record in a separate list: appending to missed_games while iterating
        # over it would retry a permanently failing game forever
        failed_games.append(game_id)
        print(f'passed {len(failed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# pd.concat raises on an empty list, e.g. when no game needed a retry
missed_games_2021 = pd.concat(missed_dfs) if missed_dfs else pd.DataFrame()
attendance_2021 = pd.concat(attendance_dfs) if attendance_dfs else pd.DataFrame()

In [None]:
# stack first-pass and retry results, parse GAME_DATE, order by date,
# attach the per-game stats on GAME_ID, then save
final_2021 = (
    pd.concat([attendance_2021, missed_games_2021])
    .assign(GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']))
    .sort_values('GAME_DATE')
    .merge(games_2021, how='left', on='GAME_ID')
)
final_2021.to_csv('data/attendance_2021.csv')

#### 2019-2020 Season

In [None]:
# build the one-row-per-game table for the season requested as 2019
games_2019 = game_data(2019)

In [None]:
# account for Covid Bubble games with no attendance
games_2019 = games_2019[games_2019['GAME_DATE'] < '2020-03-12']

# First pass: request the box-score summary of every remaining game and keep
# its fifth data frame (index 4), tagged with the game id.
missed_games = []     # ids whose first request raised
attendance_dfs = []
for game_id in games_2019['GAME_ID']:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        attendance_dfs.append(b)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        missed_games.append(game_id)
        print(f'passed {len(missed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# retry each missed game once
print(f'DOING {len(missed_games)} MISSED GAMES')
missed_dfs = []
failed_games = []     # ids that failed again on the retry
for game_id in missed_games:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        missed_dfs.append(b)
    except Exception as err:
        # record in a separate list: appending to missed_games while iterating
        # over it would retry a permanently failing game forever
        failed_games.append(game_id)
        print(f'passed {len(failed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# pd.concat raises on an empty list, e.g. when no game needed a retry
attendance_2019 = pd.concat(attendance_dfs) if attendance_dfs else pd.DataFrame()
missed_games_2019 = pd.concat(missed_dfs) if missed_dfs else pd.DataFrame()

In [None]:
# stack first-pass and retry results, parse GAME_DATE, order by date,
# attach the per-game stats on GAME_ID, then save
final_2019 = (
    pd.concat([attendance_2019, missed_games_2019])
    .assign(GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']))
    .sort_values('GAME_DATE')
    .merge(games_2019, how='left', on='GAME_ID')
)
final_2019.to_csv('data/attendance_2019.csv')

#### 2018-2019 Season

In [None]:
# build the one-row-per-game table for the season requested as 2018
games_2018 = game_data(2018)

In [None]:
# First pass: request the box-score summary of every game and keep its
# fifth data frame (index 4), tagged with the game id.
missed_games = []     # ids whose first request raised
attendance_dfs = []
for game_id in games_2018['GAME_ID']:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        attendance_dfs.append(b)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        missed_games.append(game_id)
        print(f'passed {len(missed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# retry each missed game once
print(f'DOING {len(missed_games)} MISSED GAMES')
missed_dfs = []
failed_games = []     # ids that failed again on the retry
for game_id in missed_games:
    try:
        time.sleep(.75) #prevent connection time-out
        b = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id, timeout = 100).get_data_frames()[4]
        b['GAME_ID'] = game_id
        missed_dfs.append(b)
    except Exception as err:
        # record in a separate list: appending to missed_games while iterating
        # over it would retry a permanently failing game forever
        failed_games.append(game_id)
        print(f'passed {len(failed_games)}, id: {game_id} ({type(err).__name__}: {err})')

# pd.concat raises on an empty list, e.g. when no game needed a retry
attendance_2018 = pd.concat(attendance_dfs) if attendance_dfs else pd.DataFrame()
missed_games_2018 = pd.concat(missed_dfs) if missed_dfs else pd.DataFrame()

In [None]:
# stack first-pass and retry results, parse GAME_DATE, order by date,
# attach the per-game stats on GAME_ID, then save
final_2018 = (
    pd.concat([attendance_2018, missed_games_2018])
    .assign(GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']))
    .sort_values('GAME_DATE')
    .merge(games_2018, how='left', on='GAME_ID')
)
final_2018.to_csv('data/attendance_2018.csv')

#### American Community Survey (ACS) Census Data

In [None]:
# load the arena lookup table and left-pad its FIPS codes with zeros:
# 'State' to two characters, 'County' to three
state_fips = pd.read_csv('data/state_fips.csv').assign(
    State=lambda frame: frame['State'].astype(str).str.zfill(2),
    County=lambda frame: frame['County'].astype(str).str.zfill(3),
)

In [None]:
# function to convert json output to dataframe for clean output
def json_to_dataframe(response, columns=None):
    """Turn a JSON response into a DataFrame, skipping its first (header) row.

    Parameters
    ----------
    response
        Object whose ``.json()`` returns a list of rows; the first row is
        discarded and the remaining rows become the data.
    columns : list of str, optional
        Column labels for the result. When omitted, the notebook-level
        ``col_names`` list is used, which must already be defined at call
        time (this keeps existing ``json_to_dataframe(r)`` calls working).

    Returns
    -------
    pandas.DataFrame
        One row per data row of the response, labelled with ``columns``.
    """
    if columns is None:
        columns = col_names  # fall back to the notebook-level list
    rows = response.json()
    return pd.DataFrame(rows[1:], columns=columns)

# ACS 5-year vintages and the season label stored with each (paired by position)
# NOTE(review): attendance is collected for four seasons above but only three
# vintages are requested here - confirm every season is covered as intended.
census_years = ['2021','2020','2019']
seasons = [2023,2022,2020]

# Census variables to request; the order must match col_names below
get_vars = ["NAME",  # variable codes go here and below
            "B01003_001E", # total population
            "B23025_002E", # total in labor force
            "B23025_005E", # total unemployed
            "B06009_003E", # Total pop high school graduate
            "B06009_004E", # Total pop w/ some college or associates
            "B06009_005E", # Total pop w/ bachelors
            "B06009_006E", # Total pop w/ grad degree
            # NOTE(review): B13002_001E may be the universe total (all women
            # 15 to 50) rather than only those with a birth - confirm against
            # the ACS variable list
            "B13002_001E", # Total women 15 TO 50y who birthed in last 12m
            'B19001_001E', # Total number of households in household income data
            'B19001_002E', # household income bracket variables start here
            'B19001_003E',
            'B19001_004E',
            'B19001_005E',
            'B19001_006E',
            'B19001_007E',
            'B19001_008E',
            'B19001_009E',
            'B19001_010E',
            'B19001_011E',
            'B19001_012E',
            'B19001_013E',
            'B19001_014E',
            'B19001_015E',
            'B19001_016E',
            'B19001_017E',
            'B25064_001E', # median gross rent
            'B17001_001E', # total for poverty level
            'B17001_002E' # total under poverty level
]

# readable labels for the requested variables, followed by the two geography
# columns; json_to_dataframe reads this notebook-level list
col_names = ['Name',
         'Total Population',
         'Total labor force',
         'Total Unemployed',
         'Total pop high school graduate',
         'Total pop w/ some college or associates',
         'Total pop w/ bachelors',
         'Total pop w/ grad degree',
         'Total women 15 TO 50y who birthed in last 12m',
         'Total number of households in household income data',
         'Total with household income <$10k', # all household income data for past 12m
         'Total with household income $10-15k',
         'Total with household income $15-20k',
         'Total with household income $20-25k',
         'Total with household income $25-30k',
         'Total with household income $30-35k',
         'Total with household income $35-40k',
         'Total with household income $40-45k',
         'Total with household income $45-50k',
         'Total with household income $50-60k',
         'Total with household income $60-75k',
         'Total with household income $75-100k',
         'Total with household income $100-125k',
         'Total with household income $125-150k',
         'Total with household income $150-200k',
         'Total with household income $200k+',
         'Median gross rent',
         'Total for poverty level',
         'Total under poverty level',
         'state',
         'county']

# labels and geography codes stay strings; every other column becomes float
non_numeric_cols = ('Name', 'state', 'county', 'tract')

census_dfs = []
for x, (census_year, season) in enumerate(zip(census_years, seasons)):
    base_url = f'http://api.census.gov/data/{census_year}/acs/acs5'
    for _, arena in state_fips.iterrows():
        # one request per row of state_fips: a single county within its state
        predicates = {
            "get": ",".join(get_vars),
            "for": f"county:{str(arena['County'])}",
            "in": f"state:{str(arena['State'])}",
        }

        # Execute the request; fail loudly on an HTTP error instead of
        # trying to parse an error page as JSON
        r = requests.get(base_url, params=predicates, timeout=100)
        r.raise_for_status()

        df_census = json_to_dataframe(r)

        # the API returns every value as a string; convert the counts to float
        # (the previous `c == "state" or "county" or "tract"` test was always
        # true, so no column was ever converted)
        for c in col_names:
            if c in non_numeric_cols:
                continue
            df_census[c] = df_census[c].astype(float)

        df_census['Season'] = season
        team = arena['Team Code']
        df_census['Team'] = team
        census_dfs.append(df_census)
    print('finished season ' + str(x))

In [None]:
# stack every per-county frame under a fresh 0..n-1 index and save without it
final_census_df = pd.concat(census_dfs, ignore_index=True)
final_census_df.to_csv('data/census_data.csv', index = False)