# Scraping all team stats tables

In the previous notebook, "Accumulating all games since 2004-2005 season", we gathered the dates and matchup IDs for all NBA games (regular and postseason) since the 2004-2005 season. I will build upon this in this notebook by gathering the team stats for all of these games (at least the games since the 2009-2010 season) in a DataFrame. I will then save this DataFrame for future use. This will employ functions written in the notebook "Scraping a season- May24". 

We begin by importing the necessary libraries.

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime #find date of games
import time #keep track of lengths of program run time

In [2]:
def teamstats(matchup_id):
    '''
    Scrape the line score and team stats tables of one game and store them in a DataFrame.
    
    Input:
    matchup_id: game ID that is substituted into the ESPN matchup page URL
    
    Output:
    DataFrame with 2 records (33 columns for the seasons this notebook scrapes successfully)
        Record 0 comes from the first row of the line score table (treated as the visitor team),
        record 1 from the second row (treated as the home team).
        Columns: team name, points per quarter, total points, one column per entry of the
        team stats table, 'number_of_ot_periods', 'ot_points' (list of points per overtime
        period), 'won' (1 if won, 0 if lost), 'away_or_home' and 'matchup_id'.
        
    Raises:
    ValueError if the line score table does not have exactly two rows, or if the number of
        values scraped for a team differs from the number of column names
        ("cannot set a row with mismatched columns").
    '''
    
    url = 'http://www.espn.com/nba/matchup?gameId={0}'.format(str(matchup_id))
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    tables = soup.find_all('table')
    
    tb0 = tables[0].tbody #table with team names, points scored each quarter, total points
    tb1 = tables[1].tbody #table with traditional team stats (assists, total rebounds, etc.)
    
    visitor_team_row, home_team_row = tb0.find_all('tr')
    #lists of team name, points scored each quarter, total points
    visitor_team_name_points = [val.contents[0].strip() for val in visitor_team_row.find_all('td')]
    home_team_name_points = [val.contents[0].strip() for val in home_team_row.find_all('td')]
    
    #A regulation game has 6 cells: team, four quarters, total.
    #Every extra cell is an overtime period, sitting between the fourth quarter and the total.
    num_overtimes = max(len(visitor_team_name_points) - 6, 0)
    overtime_cells = slice(5, 5 + num_overtimes) #empty slice when there was no overtime
    
    #List of int's for visitor and home teams of points scored per overtime period
    visitor_overtime_points = list(map(int, visitor_team_name_points[overtime_cells]))
    home_overtime_points = list(map(int, home_team_name_points[overtime_cells]))
    
    #Remove all overtime cells in one step. Popping index 5 + period one at a time is wrong
    #because the list shrinks after each pop: with 2 overtimes the second pop removed the
    #total, and with 3 or more it ran past the end of the list.
    del visitor_team_name_points[overtime_cells]
    del home_team_name_points[overtime_cells]
    
    #create 3 lists for Team Stats table
    #List 1: names of stats in Team Stats table
    #List 2: corresponding visitor team's stat
    #List 3: corresponding home team's stat
    tb1_stat_names, tb1_visitor_stats, tb1_home_stats = [], [], []
    
    #cycle over different stats; a 'Made-Attempted' row is split on '-' into two stats
    for row in tb1.find_all('tr'):
        cells = row.find_all('td')
        
        tb1_stat_names += cells[0].contents[0].strip().split('-')
        tb1_visitor_stats += cells[1].contents[0].strip().split('-')
        tb1_home_stats += cells[2].contents[0].strip().split('-')
    
    #precede each stat 'Attempted' with type of shot attempted
    tb1_stat_names[1] = 'fg_attempted'
    tb1_stat_names[4] = 'threept_attempted'
    tb1_stat_names[7] = 'ft_attempted'
    
    tb0_stat_names = ['team', 'first_qtr_points', 'second_qtr_points',
                      'third_qtr_points', 'fourth_qtr_points', 'total_points']
    
    #names of all stats, including team name, rebounds, etc.
    stat_names = tb0_stat_names + tb1_stat_names + ['number_of_ot_periods', 'ot_points']
    #corresponding stats for teams
    visitor_stats = visitor_team_name_points + tb1_visitor_stats + [num_overtimes, visitor_overtime_points]
    home_stats = home_team_name_points + tb1_home_stats + [num_overtimes, home_overtime_points]
    
    #create DataFrame of all stats; the scraped values are text, so they start out as strings
    stats_df = pd.DataFrame(columns=stat_names)
    stats_df.loc[0] = visitor_stats
    stats_df.loc[1] = home_stats
    
    #append column of which team won (1 if won, 0 if lost)
    visitor_won = int(stats_df.loc[0, 'total_points']) > int(stats_df.loc[1, 'total_points'])
    stats_df['won'] = [1, 0] if visitor_won else [0, 1]
    
    stats_df['away_or_home'] = ['Away', 'Home']
    
    #convert all entries from string type to (int or float) type
    #(except for team name and the list of overtime points)
    for stat in stat_names:
        if stat in ('team', 'ot_points'):
            continue
        #percentage stats become float, every other stat becomes int
        converter = float if '%' in stat else int
        stats_df[stat] = stats_df[stat].apply(converter)
    
    stats_df['matchup_id'] = [matchup_id, matchup_id]
    
    return stats_df

We now import the previously written CSV file that contains data for all of the NBA games played since the 2004-2005 season.

In [3]:
#load the table of all games; the CSV's 'Unnamed: 0' column is used as the index
all_games_played = pd.read_csv('all_games_04_on.csv', index_col='Unnamed: 0')

#(number of rows, number of columns)
print(all_games_played.shape)

#preview the first rows
all_games_played.head()

(36140, 9)


Unnamed: 0,team,season_start_year,season_end_year,season_type,game_month,game_day,game_year,game_date,matchup_id
0,bos,2004,2005,regular,11,3,2004,11/3/2004,241103002
1,bos,2004,2005,regular,11,5,2004,11/5/2004,241105002
2,bos,2004,2005,regular,11,6,2004,11/6/2004,241106018
3,bos,2004,2005,regular,11,10,2004,11/10/2004,241110002
4,bos,2004,2005,regular,11,12,2004,11/12/2004,241112002


In [4]:
#one smaller DataFrame per season, keyed by the year in which the season ended
all_games_played_by_year = {
    season_end: all_games_played[all_games_played['season_end_year'] == season_end]
    for season_end in range(2005, 2019)
}

In [5]:
#distinct team abbreviations found in the season ending in 2018, alphabetized
abbrevs = sorted(set(all_games_played_by_year[2018]['team']))

print(abbrevs)

['atl', 'bkn', 'bos', 'cha', 'chi', 'cle', 'dal', 'den', 'det', 'gs', 'hou', 'ind', 'lac', 'lal', 'mem', 'mia', 'mil', 'min', 'no', 'ny', 'okc', 'orl', 'phi', 'phx', 'por', 'sa', 'sac', 'tor', 'utah', 'wsh']


In [6]:
#season end year -> list of distinct matchup ID's, starting with the 2018 season
all_matchup_id = {2018: list(set(all_games_played_by_year[2018]['matchup_id']))}

#compare the raw count (one entry per team per game) with the de-duplicated count
print("Original number of matchup ID's: " + str(len(all_games_played_by_year[2018]['matchup_id'])))
print("Number of distinct matchup ID's: " + str(len(all_matchup_id[2018])))

Original number of matchup ID's: 2624
Number of distinct matchup ID's: 1312


By considering only the distinct matchup ID's, we have cut the number of matchup ID's in half. This is because we stored each matchup ID exactly twice when compiling all of the information on the games, once for the home team and once for the away team.

We will now compile team stats for all of the games played during the 2017-2018 season.

In [7]:
#season end year -> DataFrame of team stats for every game of that season
all_team_stats_by_season = {}

#NOTE(review): the bare string below is never executed; it appears to preserve an earlier
#2018-only version of the scraping loop -- confirm whether it can be dropped
"\nstart_time = time.time()\n\nall_team_stats_by_season[2018] = pd.concat(teamstats(all_matchup_id[2018][idx]) for idx in range(len(all_matchup_id[2018])))\n\nprint('Took ' + str(time.time() -start_time) + ' seconds.')\n"

In [10]:
#distinct matchup ID's for every season, from the most recent season back to 2005
for year in reversed(range(2005, 2019)):
    all_matchup_id[year] = list(set(all_games_played_by_year[year]['matchup_id']))

In [12]:
#scrape every game of every season, most recent season first, saving one CSV per season
for year in reversed(range(2005, 2019)):
    print('Start of ' + str(year) + ' season.')
    start_time = time.time()
    #stack the two-row DataFrames returned by teamstats, one per game, into a season table
    all_team_stats_by_season[year] = pd.concat([teamstats(game_id) for game_id in all_matchup_id[year]])
    all_team_stats_by_season[year].to_csv('all_games_' + str(year) + '.csv')
    print(str(year) + ' took ' + str(time.time() - start_time) + ' seconds.')

Start of 2007 season.


ValueError: cannot set a row with mismatched columns

We were able to successfully run the code until the 2008-2009 season. But no matter. We have the team stats table of all the games from the 2009-2010 season on. There are over 10,000 games among these seasons, which should be more than enough data to gain useful insights about the present state of basketball. It should be noted that each season took between 10-18 minutes to scrape.

We combine these in a single DataFrame and show the first 20 rows. 

In [18]:
#reload each saved season, keep the columns from 'team' onward, and stack the seasons
all_team_stats_2009_to_2018 = pd.concat(
    [pd.read_csv('all_games_' + str(year) + '.csv').loc[:, 'team':] for year in range(2009, 2019)]
)

print(all_team_stats_2009_to_2018.shape)

all_team_stats_2009_to_2018.head(20)

(25782, 33)


Unnamed: 0,team,first_qtr_points,second_qtr_points,third_qtr_points,fourth_qtr_points,total_points,FG Made,fg_attempted,Field Goal %,3PT Made,...,Fast Break Points,Points in Paint,Personal Fouls,Technical Fouls,Flagrant Fouls,number_of_ot_periods,ot_points,won,away_or_home,matchup_id
0,DAL,21,23,25,27,96,39,78,50.0,9,...,8,0,16,1,0,0,[],1,Away,281217028
1,TOR,33,16,14,23,86,36,84,42.9,4,...,8,0,14,0,0,0,[],0,Home,281217028
2,GS,26,19,17,26,88,36,90,40.0,5,...,10,38,21,1,0,0,[],0,Away,290304004
3,CHI,20,31,24,35,110,39,86,45.3,5,...,18,46,16,0,0,0,[],1,Home,290304004
4,MIL,20,19,19,15,73,28,75,37.3,4,...,10,24,23,0,0,0,[],0,Away,290304005
5,CLE,22,26,24,19,91,31,74,41.9,9,...,17,30,20,2,0,0,[],1,Home,290304005
6,SA,31,26,19,26,102,39,85,45.9,7,...,11,36,22,1,0,0,[],0,Away,290304006
7,DAL,29,27,27,24,107,37,69,53.6,9,...,20,28,19,0,0,0,[],1,Home,290304006
8,ORL,27,18,21,35,101,36,70,51.4,13,...,5,34,20,0,0,0,[],1,Away,290517002
9,BOS,17,21,23,21,82,29,74,39.2,4,...,17,30,22,0,0,0,[],0,Home,290517002


We note that some of the column names, such as `FG Made` and `Field Goal %`, are not in lowercase underscore format. This will make it harder to run SQL on this DataFrame. We will conclude this notebook by changing all of the column names to this format and saving our DataFrame.

In [28]:
#show the current column names before they are renamed
print(all_team_stats_2009_to_2018.columns.tolist())

['team', 'first_qtr_points', 'second_qtr_points', 'third_qtr_points', 'fourth_qtr_points', 'total_points', 'FG Made', 'fg_attempted', 'Field Goal %', '3PT Made', 'threept_attempted', 'Three Point %', 'FT Made', 'ft_attempted', 'Free Throw %', 'Total Rebounds', 'Offensive Rebounds', 'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Total Turnovers', 'Points Off Turnovers', 'Fast Break Points', 'Points in Paint', 'Personal Fouls', 'Technical Fouls', 'Flagrant Fouls', 'number_of_ot_periods', 'ot_points', 'won', 'away_or_home', 'matchup_id']


In [36]:
#changing all column names to lowercase underscore format
#Each display name taken from the ESPN table is mapped explicitly to its new name, so the
#renaming does not depend on the order of the columns. Columns that are not listed here
#are already in lowercase underscore format and keep their names.
espn_to_snake_case = {
    'FG Made': 'fg_made',
    'Field Goal %': 'fg_percentage',
    '3PT Made': 'threept_made',
    'Three Point %': 'threept_percentage',
    'FT Made': 'ft_made',
    'Free Throw %': 'ft_percentage',
    'Total Rebounds': 'total_rebounds',
    'Offensive Rebounds': 'offensive_rebounds',
    'Defensive Rebounds': 'defensive_rebounds',
    'Assists': 'assists',
    'Steals': 'steals',
    'Blocks': 'blocks',
    'Total Turnovers': 'total_turnovers',
    'Points Off Turnovers': 'points_off_turnovers',
    'Fast Break Points': 'fast_break_points',
    'Points in Paint': 'points_in_paint',
    'Personal Fouls': 'personal_fouls',
    'Technical Fouls': 'technical_fouls',
    'Flagrant Fouls': 'flagrant_fouls',
}

#new name for every column, in the DataFrame's current column order
new_column_names = [espn_to_snake_case.get(name, name) for name in all_team_stats_2009_to_2018.columns]

#replace column names
all_team_stats_2009_to_2018.columns = new_column_names

#see DataFrame with new columns
all_team_stats_2009_to_2018.head(10)

Unnamed: 0,team,first_qtr_points,second_qtr_points,third_qtr_points,fourth_qtr_points,total_points,fg_made,fg_attempted,fg_percentage,threept_made,...,fast_break_points,points_in_paint,personal_fouls,technical_fouls,flagrant_fouls,number_of_ot_periods,ot_points,won,away_or_home,matchup_id
0,DAL,21,23,25,27,96,39,78,50.0,9,...,8,0,16,1,0,0,[],1,Away,281217028
1,TOR,33,16,14,23,86,36,84,42.9,4,...,8,0,14,0,0,0,[],0,Home,281217028
2,GS,26,19,17,26,88,36,90,40.0,5,...,10,38,21,1,0,0,[],0,Away,290304004
3,CHI,20,31,24,35,110,39,86,45.3,5,...,18,46,16,0,0,0,[],1,Home,290304004
4,MIL,20,19,19,15,73,28,75,37.3,4,...,10,24,23,0,0,0,[],0,Away,290304005
5,CLE,22,26,24,19,91,31,74,41.9,9,...,17,30,20,2,0,0,[],1,Home,290304005
6,SA,31,26,19,26,102,39,85,45.9,7,...,11,36,22,1,0,0,[],0,Away,290304006
7,DAL,29,27,27,24,107,37,69,53.6,9,...,20,28,19,0,0,0,[],1,Home,290304006
8,ORL,27,18,21,35,101,36,70,51.4,13,...,5,34,20,0,0,0,[],1,Away,290517002
9,BOS,17,21,23,21,82,29,74,39.2,4,...,17,30,22,0,0,0,[],0,Home,290517002


In [37]:
#save the combined DataFrame for future use
all_team_stats_2009_to_2018.to_csv('all_team_stats_2009_to_2018.csv')