In [1]:
import numpy as np
import pandas as pd
import os

# Scrape Gamelogs from Basketball Reference

In [3]:
def scrape_gamelogs(team, year):
    """
    Downloads a team's regular and advanced game logs for one season from
    basketball reference and merges them into a single table

    team - 3 letter team abbreviation 
    year - season to scrape data from

    Returns the combined gamelog as a DataFrame
    """

    #build both page urls up front; the advanced page is the regular url plus a suffix
    base_url = "https://www.basketball-reference.com/teams/"
    regular_url = base_url + "/".join((team, str(year), 'gamelog'))
    advanced_url = regular_url + '-advanced'

    #take the first table on each page, using header row 1 for the column names
    regular = pd.read_html(regular_url, header = 1)[0]
    advanced = pd.read_html(advanced_url, header = 1)[0]

    #keep only the advanced columns that the regular gamelog does not already have
    already_present = advanced.columns.isin(regular.columns)
    advanced_only = advanced.loc[:, ~already_present]

    #index-aligned join of the regular and advanced gamelogs
    return regular.join(advanced_only)

In [4]:
def make_id(df, team):
    """
    Builds one id per row of the gamelog, formatted as
        yyyy-mm-dd_XXX_YYY

        where XXX is the home team abbrev and YYY is the away team abbrev

    df - gamelog DataFrame with 'home', 'date' and 'opp' columns
    team - abbreviation of the team the gamelog belongs to

    Returns the ids as an array, one per row, fully uppercased
    """
    team_code = team.upper()

    # rows flagged home == 1 put this team first, every other row puts the opponent first
    played_at_home = df['home'] == 1
    id_when_home = (df['date'] + "_" + team_code + "_" + df['opp']).str.upper()
    id_when_away = (df['date'] + "_" + df['opp'] + "_" + team_code).str.upper()

    return np.where(played_at_home, id_when_home, id_when_away)

In [5]:
def clean_gamelog(df, team, year):
    """
    Clean and format the gamelog df
    
    df - gamelog DataFrame; the caller's frame is left unmodified
    team - abbreviation of the team the gamelog belongs to (used to build game ids)
    year - season of the gamelog; written to the 'season' column
    
    Returns cleaned gamelog df
    """

    def _numeric_or_original(value):
        #convert a single cell to a number, handing it back untouched if it can't be parsed
        try:
            return pd.to_numeric(value)
        except (ValueError, TypeError):
            return value
    
    #lowercase all columns (rename returns a new frame, so the caller's labels are not changed)
    df = df.rename(columns=str.lower)
    
    #rename columns
    df = df.rename(columns={'tm':'pts', 'opp.1':'opp_pts'})
    
    #create flag for home/away game; home = 1
    df.insert(4, 'home', np.where(df['unnamed: 3'] == '@', 0, 1))
    
    # Add unique id for each game
    df.insert(0, 'id', make_id(df, team))
    
    #replace win/loss column with 1/0 flag; 1 = win
    df['w/l'] = np.where(df['w/l']=='W',1,0)
    
    #drop opponent/unnecessary/empty columns
    cols_to_drop = ['rk', 'g'] + [x for x in df.columns if 'unnamed' in x or '.1' in x]
    df = df.drop(columns=cols_to_drop)
    
    #drop the empty rows that bbref use to break up their tables
    #(index labels, so this relies on the frame still having its original 0..n-1 index)
    divider_rows = [20,21,42,43,64,65,86,87]

    #ignore errors to accommodate shortened seasons
    df = df.drop(divider_rows, errors='ignore')
    
    #convert applicable columns to int/float
    #errors='ignore' is deprecated in pandas.to_numeric, so failures are caught explicitly:
    #try the whole column first and only fall back to cell-by-cell conversion when that fails
    for col in list(df.columns):
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            df[col] = df[col].apply(_numeric_or_original)
    
    #create column for game's point differential
    df['pt_diff'] = df['pts'] - df['opp_pts']
    
    df.insert(2, 'season', year)
    
    #reset index
    df = df.reset_index(drop=True)
    
    return df

In [6]:
def get_gamelog(team, year):
    """
    Produces the cleaned (regular season) gamelog for a given team and year by
    scraping the normal and advanced gamelogs and passing the result to the cleaner

    team - 3 letter team abbreviation 
    year - season to scrape data from

    Returns cleaned gamelog as a DataFrame
    """

    #scrape first, then hand the raw table straight to the cleaner
    return clean_gamelog(scrape_gamelogs(team, year), team, year)

In [7]:
# Team abbreviations per season, keyed by season year (2021 down to 1994).
# Consecutive seasons that use exactly the same abbreviations are written once as
# (newest season, oldest season, abbreviations) and expanded below, so every season
# still gets its own independent list.
team_abbreviations = {
    season: list(abbreviations)
    for newest, oldest, abbreviations in [
        (2021, 2015, ['ATL','BOS','BRK','CHI','CHO','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NOP','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']),
        (2014, 2014, ['ATL','BOS','BRK','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NOP','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']),
        (2013, 2013, ['ATL','BOS','BRK','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NOH','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']),
        (2012, 2009, ['ATL','BOS','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOH','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']),
        (2008, 2008, ['ATL','BOS','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOH','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','WAS']),
        (2007, 2006, ['ATL','BOS','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOK','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','WAS']),
        (2005, 2005, ['ATL','BOS','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOH','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','WAS']),
        (2004, 2003, ['ATL','BOS','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOH','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','WAS']),
        (2002, 2002, ['ATL','BOS','CHH','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','WAS']),
        (2001, 1998, ['ATL','BOS','CHH','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MIA','MIL','MIN','NJN','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','VAN','WAS']),
        (1997, 1996, ['ATL','BOS','CHH','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MIA','MIL','MIN','NJN','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','VAN','WSB']),
        (1995, 1994, ['ATL','BOS','CHH','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MIA','MIL','MIN','NJN','NYK','ORL','PHI','PHO','POR','SAC','SAS','SEA','UTA','WSB']),
    ]
    # count downwards so the keys are inserted newest-first
    for season in range(newest, oldest - 1, -1)
}

In [None]:
# Root folder for the exported gamelogs; one sub-folder per season is created beneath it.
# NOTE(review): this is an absolute path on one machine - point it at your own data folder before running.
working_dir = '/Users/gregyannett/Documents/nba_game_models/data/gamelogs/'

# Scrape every team of every season and write each gamelog to <working_dir>/<year>/<team>_<year>.csv
# NOTE(review): each team costs two page requests with no pause in between - confirm this is
# acceptable under the site's rate limits before running the full range.
for year in range(1994,2022):
    #makedirs with exist_ok=True creates missing parent folders and lets the cell be re-run
    #after an earlier (possibly partial) run; os.mkdir raised FileExistsError in that case
    season_dir = os.path.join(working_dir, str(year))
    os.makedirs(season_dir, exist_ok=True)
    for team in team_abbreviations[year]: 
        fpath = os.path.join(season_dir, '{team}_{year}.csv'.format(team=team, year=year))
        get_gamelog(team, year).to_csv(fpath, index=False)
    print(year,'done')