In [2]:
import requests
from bs4 import BeautifulSoup
import datetime
from datetime import date
import time
import numpy as np
import pandas as pd
import os


# Functions to scrape spreads and moneylines from sportsbookreview.com

In [8]:
# Sportsbook ids paired with the short book name that is embedded in each
# book's column names (e.g. 'home_pinnacle_spread').
books = dict([
    (238, 'pinnacle'),
    (19, '5dimes'),
    (93, 'bookmaker'),
    (1096, 'betonline'),
    (999996, 'bovada'),
    (169, 'heritage'),
    (180, 'intertops'),
    (139, 'youwager'),
    (1275, 'justbet'),
])

In [9]:
def _split_spread_cell(cell):
    """Split one spread cell into ``[spread, odds]`` tokens.

    A cell without a posted line yields a single-element list, which is how
    the caller recognises it as missing.
    """
    # Strip first so a pick'em marker is found even with surrounding whitespace.
    text = cell.get_text().strip()

    if 'PK' in text:
        # Pick'em: the spread is 0 and the odds follow the 'PK' marker.
        return [0, text.partition('PK')[2]]

    # '\xbd' is the one-half glyph; spread and odds are separated by a non-breaking space.
    return text.replace(u'\xbd', '.5').split('\xa0')


def format_spread_and_odds(data):
    """Convert one sportsbook's pair of spread cells into numeric spreads and odds.

    Parameters
    ----------
    data : sequence
        Objects exposing ``get_text()``; ``data[0]`` is the away cell and
        ``data[1]`` the home cell. Each text is empty, ``'PK<odds>'`` or
        ``'<spread>\\xa0<odds>'``.

    Returns
    -------
    tuple
        ``(home_spread, home_odds, away_spread, away_odds)``. All four are NaN
        when neither side has a line or when fewer than two cells are given.
        When only one side has a line, the other side gets the negated spread
        and odds of -110.

    Raises
    ------
    ValueError
        If a posted spread or odds token is not numeric.
    """
    # Fewer than two cells means there is nothing to parse for this book.
    if len(data) < 2:
        return (np.nan,) * 4

    away_tokens = _split_spread_cell(data[0])
    home_tokens = _split_spread_cell(data[1])

    away_missing = len(away_tokens) == 1
    home_missing = len(home_tokens) == 1

    if home_missing and away_missing:
        return (np.nan,) * 4

    if home_missing:
        # Only the away line is posted: mirror its spread, fall back to -110 odds.
        away_spread, away_odds = float(away_tokens[0]), int(away_tokens[1])
        return -away_spread, -110, away_spread, away_odds

    if away_missing:
        # Only the home line is posted: mirror its spread, fall back to -110 odds.
        home_spread, home_odds = float(home_tokens[0]), int(home_tokens[1])
        return home_spread, home_odds, -home_spread, -110

    home_spread, home_odds = float(home_tokens[0]), int(home_tokens[1])
    away_spread, away_odds = float(away_tokens[0]), int(away_tokens[1])

    return home_spread, home_odds, away_spread, away_odds

In [10]:
def scrape_SBR(date, ml=False):
    """Scrape one day of NBA betting lines from the classic sportsbookreview.com odds grid.

    Parameters
    ----------
    date : str or pandas.Timestamp
        Day to scrape. A Timestamp is converted with ``str(date.date())``; for a
        string the dashes are removed to build the ``?date=`` query value.
    ml : bool, default False
        When False the point-spread page is scraped, when True the
        ``money-line/`` page.

    Returns
    -------
    list of dict or None
        One dict per game with ``date``, ``home_team``, ``away_team`` and, for
        every entry in ``books``, either ``home_/away_<book>_spread`` and
        ``home_/away_<book>_odds`` (``ml=False``) or ``home_/away_<book>_ML``
        (``ml=True``, raw cell text). Lines that are not available are NaN.
        None is returned, after printing a message, when the request fails or
        the page contains no odds grid.
    """
    if isinstance(date, pd.Timestamp):
        date = str(date.date())

    url_date = date.replace('-', '')
    url_flag = 'money-line/' if ml else ''

    url = 'https://classic.sportsbookreview.com/betting-odds/nba-basketball/{url_flag}?date={date}'.format(url_flag=url_flag, date=url_date)

    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}

    try:
        # Without a timeout a single stalled connection would hang the whole scraping loop.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        print("request failed for {date}".format(date=date))
        return None

    if not response.ok:
        print("request failed for {date}".format(date=date))
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    betting_table = soup.find('div', id='OddsGridModule_5')

    if betting_table is None:
        print("no data found for {date}".format(date=date))
        return None

    team_data = betting_table.find_all('div', attrs={'class': 'el-div eventLine-team'})

    # The per-book cell lists do not depend on the game, so look them up once per page.
    cells_by_book = {
        book_id: betting_table.find_all('div', attrs={'class': 'el-div eventLine-book', 'rel': book_id})
        for book_id in books
    }

    day_data = []

    for i, team_cell in enumerate(team_data):

        home_and_away_teams = team_cell.find_all('div')

        if len(home_and_away_teams) < 2:
            continue

        # The away team is listed first, the home team second.
        game_data = {
            'date': date,
            'home_team': home_and_away_teams[1].get_text().strip().lower(),
            'away_team': home_and_away_teams[0].get_text().strip().lower(),
        }

        for book_id, book_name in books.items():

            book_rows = cells_by_book[book_id]

            # A book can be absent from the page or list fewer rows than there are games;
            # either way there is no line for this game.
            betting_data = book_rows[i].find_all('div') if i < len(book_rows) else []
            has_line = len(betting_data) >= 2

            if ml:
                if has_line:
                    home_ML, away_ML = betting_data[1].get_text(), betting_data[0].get_text()
                else:
                    # Two targets, two values (the original unpacked four NaNs into two names).
                    home_ML, away_ML = np.nan, np.nan

                # Empty cell text means the book posted no moneyline.
                if home_ML == '':
                    home_ML = np.nan
                if away_ML == '':
                    away_ML = np.nan

                game_data['home_'+book_name+'_ML'] = home_ML
                game_data['away_'+book_name+'_ML'] = away_ML

            else:
                if has_line:
                    home_spread, home_odds, away_spread, away_odds = format_spread_and_odds(betting_data)
                else:
                    home_spread, home_odds, away_spread, away_odds = [np.nan] * 4

                game_data['home_'+book_name+'_spread'] = home_spread
                game_data['home_'+book_name+'_odds'] = home_odds

                game_data['away_'+book_name+'_spread'] = away_spread
                game_data['away_'+book_name+'_odds'] = away_odds

        day_data.append(game_data)

    return day_data

        
    
    
    

In [38]:
# Lower-cased team name -> three-letter team code, one mapping per era.
# Each later era is the previous mapping with only the codes that changed overridden.
spread_teams_07 = {
    'atlanta': 'ATL', 'boston': 'BOS', 'charlotte': 'CHA', 'chicago': 'CHI', 'cleveland': 'CLE',
    'dallas': 'DAL', 'denver': 'DEN', 'detroit': 'DET', 'golden state': 'GSW', 'houston': 'HOU',
    'indiana': 'IND', 'l.a. clippers': 'LAC', 'l.a. lakers': 'LAL', 'memphis': 'MEM', 'miami': 'MIA',
    'milwaukee': 'MIL', 'minnesota': 'MIN', 'brooklyn': 'NJN', 'new orleans': 'NOK', 'new york': 'NYK',
    'orlando': 'ORL', 'philadelphia': 'PHI', 'phoenix': 'PHO', 'portland': 'POR', 'sacramento': 'SAC',
    'san antonio': 'SAS', 'toronto': 'TOR', 'utah': 'UTA', 'washington': 'WAS', 'la': 'LAC',
}

# 2008: new orleans changes from NOK to NOH.
spread_teams_08 = {**spread_teams_07, 'new orleans': 'NOH'}

# 2009-2012: oklahoma city is added.
spread_teams_09_12 = {**spread_teams_08, 'oklahoma city': 'OKC'}

# 2013: brooklyn changes from NJN to BRK.
spread_teams_13 = {**spread_teams_09_12, 'brooklyn': 'BRK'}

# 2014: new orleans changes from NOH to NOP.
spread_teams_14 = {**spread_teams_13, 'new orleans': 'NOP'}

# 2015-2021: charlotte changes from CHA to CHO.
spread_teams_15_21 = {**spread_teams_14, 'charlotte': 'CHO'}


In [3]:
# Load the game log and keep only rows whose season is 2007 or later.
working_dir = '/Users/gregyannett/Documents/nba_game_models/data/datasets/'
all_games = pd.read_csv(os.path.join(working_dir, 'all_games.csv'))
seasons_with_spreads = all_games.loc[all_games['season'].ge(2007)]

In [39]:
# Game dates grouped by era. dates_by_season and teams_by_season are parallel
# lists: the index of the era a date falls in selects the matching team mapping.
dates2007 = seasons_with_spreads.loc[seasons_with_spreads['season'].eq(2007), 'date'].values
dates2008 = seasons_with_spreads.loc[seasons_with_spreads['season'].eq(2008), 'date'].values
dates2009_2012 = seasons_with_spreads.loc[seasons_with_spreads['season'].isin(list(range(2009, 2013))), 'date'].values
dates2013 = seasons_with_spreads.loc[seasons_with_spreads['season'].eq(2013), 'date'].values
dates2014 = seasons_with_spreads.loc[seasons_with_spreads['season'].eq(2014), 'date'].values
dates2015_2021 = seasons_with_spreads.loc[seasons_with_spreads['season'].isin(list(range(2015, 2022))), 'date'].values

dates_by_season = [
    dates2007,
    dates2008,
    dates2009_2012,
    dates2013,
    dates2014,
    dates2015_2021,
]
teams_by_season = [
    spread_teams_07,
    spread_teams_08,
    spread_teams_09_12,
    spread_teams_13,
    spread_teams_14,
    spread_teams_15_21,
]

In [40]:
def find_season(date):
    """Return the index of the first entry of ``dates_by_season`` that contains ``date``.

    Parameters
    ----------
    date : str
        Game date, compared against the values stored in ``dates_by_season``.

    Returns
    -------
    int or None
        Position of the matching group, or None when no group contains the date.
    """
    # Iterate over the list itself instead of a hard-coded count, so the lookup
    # stays correct if eras are added to or removed from dates_by_season.
    for season_index, season_dates in enumerate(dates_by_season):

        if date in season_dates:

            return season_index

    return None
        
def make_id(row):
    """Build the game identifier ``<date>_<HOME>_<AWAY>`` for one scraped row.

    Parameters
    ----------
    row : mapping
        Must provide ``'home_team'``, ``'away_team'`` and ``'date'``.

    Returns
    -------
    str or int
        The identifier, or 0 (after printing a diagnostic) when the date belongs
        to no known era or either team name has no code for that era.
    """
    home = row['home_team']
    away = row['away_team']
    date = row['date']

    season_index = find_season(date)

    # find_season yields None for a date outside every era; indexing
    # teams_by_season with None would raise a TypeError.
    if season_index is None:
        print("no season found for " + str(date))
        return 0

    teams = teams_by_season[season_index]

    home_code = teams.get(home)
    away_code = teams.get(away)

    if not home_code or not away_code:
        print(row['date'] + row['home_team'])
        return 0

    return date + '_' + home_code + '_' + away_code
    

In [28]:
# Distinct game dates to scrape (one request per date).
dates = [*set(seasons_with_spreads['date'].values)]

# Spreads

In [None]:
#Call the scraping function for each date

all_spreads = []

counter = 0

# The loop variables are named game_date / day_spreads so they do not clobber the
# imported `datetime.date` or the `spreads` name used for the DataFrame later on.
for counter, game_date in enumerate(dates, start=1):

    day_spreads = scrape_SBR(game_date)

    # scrape_SBR returns None on failure and may return an empty list.
    if day_spreads:

        all_spreads.extend(day_spreads)

    # Report progress as the fraction of dates processed, every 100 dates.
    if counter % 100 == 0:
        print(counter/len(dates))
        
        
        

In [22]:
# One row per scraped game, with a fresh 0..n-1 index.
spreads = pd.DataFrame(all_spreads).reset_index(drop=True)

In [29]:
#Add unique identifier for each game YYYY-MM-DD_HOME_AWAY

# make_id is evaluated row by row; the ids become the first column of the frame.
ids = [make_id(game_row) for _, game_row in spreads.iterrows()]

spreads.insert(0, 'id', ids)

In [71]:
#Ensure all spreads and odds are floats
# Assign by column label rather than `.iloc[:, 4:] = ...`: on recent pandas an iloc
# assignment writes into the existing columns in place and can leave their dtype unchanged.
betting_cols = spreads.columns[4:]
spreads[betting_cols] = spreads[betting_cols].astype(float)

#Drop rows where all betting data is missing (695 games have missing data)
spreads = spreads[~spreads[betting_cols].isna().all(axis=1)].reset_index(drop=True).copy()

In [73]:
# Directory for the data files (same path as the one all_games.csv is read from).
working_dir = '/Users/gregyannett/Documents/nba_game_models/data/datasets/'
# Export is left commented out. NOTE(review): a later cell reads all_spreads.csv from
# this directory, so the file presumably has to be written once first; confirm.
# spreads.to_csv(working_dir + 'all_spreads.csv', index=False)

In [134]:
# Display the spreads table.
# NOTE(review): the saved output below already contains the average_* columns, which are
# only created in the imputation cells further down, so this cell was last run after them.
spreads

Unnamed: 0,id,date,home_team,away_team,home_pinnacle_spread,home_pinnacle_odds,away_pinnacle_spread,away_pinnacle_odds,home_5dimes_spread,home_5dimes_odds,...,away_youwager_spread,away_youwager_odds,home_justbet_spread,home_justbet_odds,away_justbet_spread,away_justbet_odds,average_home_spread,average_home_odds,average_away_spread,average_away_odds
0,2017-01-24_TOR_SAS,2017-01-24,toronto,san antonio,-2.5,-113.0,2.5,102.0,-2.5,-109.0,...,2.500000,-110.0,-2.500000,-115.0,2.500000,-105.0,-2.500000,-110.0,2.500000,-84.0
1,2017-01-24_ORL_CHI,2017-01-24,orlando,chicago,3.0,-109.0,-3.0,-101.0,3.0,-110.0,...,-2.500000,-110.0,3.000000,-105.0,-3.000000,-115.0,2.937500,-108.0,-2.937500,-110.0
2,2017-01-24_PHI_LAC,2017-01-24,philadelphia,l.a. clippers,4.5,-110.0,-4.5,-100.0,4.5,-109.0,...,-4.500000,-110.0,4.500000,-110.0,-4.500000,-110.0,4.555556,-109.0,-4.555556,-108.0
3,2017-01-24_WAS_BOS,2017-01-24,washington,boston,-1.0,-107.0,1.0,-103.0,-1.0,-107.0,...,1.000000,-110.0,0.000000,-110.0,0.000000,-110.0,-0.777778,-108.0,0.777778,-108.0
4,2017-01-24_DEN_UTA,2017-01-24,denver,utah,-2.0,-105.0,2.0,-105.0,-2.0,-110.0,...,2.000000,-110.0,-2.000000,-115.0,2.000000,-105.0,-1.944444,-111.0,1.944444,-107.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15628,2021-02-04_DAL_GSW,2021-02-04,dallas,golden state,-4.5,-107.0,4.5,-103.0,-4.0,-109.0,...,4.000000,-110.0,-4.187500,-110.0,4.187500,-106.0,-4.187500,-110.0,4.187500,-106.0
15629,2021-02-04_ATL_UTA,2021-02-04,atlanta,utah,10.0,-109.0,-10.0,-102.0,10.0,-107.0,...,-10.083333,-106.0,10.000000,-110.0,-10.000000,-110.0,10.083333,-111.0,-10.083333,-106.0
15630,2021-02-04_PHI_POR,2021-02-04,philadelphia,portland,-10.0,-104.0,10.0,-106.0,-10.0,-107.0,...,10.062500,-108.0,-10.000000,-110.0,10.000000,-110.0,-10.062500,-109.0,10.062500,-108.0
15631,2021-02-04_MEM_HOU,2021-02-04,memphis,houston,-3.5,-111.0,3.5,100.0,-3.5,-108.0,...,3.714286,-78.0,-3.714286,-110.0,3.714286,-78.0,-3.714286,-110.0,3.714286,-78.0


### Impute missing data with average spread/average odds

In [41]:
# Reload the saved spreads table from disk.
spreads = pd.read_csv(os.path.join(working_dir, 'all_spreads.csv'))

In [43]:
# Per-game consensus line: the row-wise mean over the individual sportsbook columns.
# Columns already named 'average_*' are excluded so that re-running this cell, or
# loading a CSV saved after this step, does not fold a previous average into the mean.
is_book_col = ~spreads.columns.str.startswith('average_')
is_home_col = spreads.columns.str.contains('home')
is_away_col = spreads.columns.str.contains('away')
is_spread_col = spreads.columns.str.contains('spread')
is_odds_col = spreads.columns.str.contains('odds')

average_home_spreads = spreads.loc[:, is_book_col & is_home_col & is_spread_col].mean(axis=1)
# Average odds are rounded to whole numbers.
average_home_odds = spreads.loc[:, is_book_col & is_home_col & is_odds_col].mean(axis=1).round()

average_away_spreads = spreads.loc[:, is_book_col & is_away_col & is_spread_col].mean(axis=1)
average_away_odds = spreads.loc[:, is_book_col & is_away_col & is_odds_col].mean(axis=1).round()


spreads['average_home_spread'] = average_home_spreads
spreads['average_home_odds'] = average_home_odds

spreads['average_away_spread'] = average_away_spreads
spreads['average_away_odds'] = average_away_odds

In [44]:
# Fill each missing value with the game's average for the same side (home/away)
# and the same kind of value (spread/odds).
for col in spreads.columns[4:]:
    side = 'home' if 'home' in col else 'away'
    kind = 'spread' if 'spread' in col else 'odds'
    spreads[col] = spreads[col].fillna(spreads['average_' + side + '_' + kind])

In [48]:
# spreads.to_csv(working_dir + 'all_spreads.csv', index=False)

# Moneylines

In [54]:
#Call the scraping function for each date

all_MLs = []

counter = 0

# The loop variable is named game_date so it does not clobber the imported `datetime.date`.
for counter, game_date in enumerate(dates, start=1):

    day_MLs = scrape_SBR(game_date, ml=True)

    # scrape_SBR returns None on failure and may return an empty list.
    if day_MLs:

        all_MLs.extend(day_MLs)

    # Report progress as the fraction of dates processed, every 100 dates.
    if counter % 100 == 0:
        print(counter/len(dates))
        
        
        

0.9900990099009901


In [113]:
# One row per scraped game, with a fresh 0..n-1 index.
moneylines = pd.DataFrame(all_MLs).reset_index(drop=True)

In [114]:
#Add unique identifier for each game YYYY-MM-DD_HOME_AWAY

# make_id is evaluated row by row; the ids become the first column of the frame.
ids = [make_id(game_row) for _, game_row in moneylines.iterrows()]

moneylines.insert(0, 'id', ids)

In [118]:
#Ensure all moneylines are floats
# Assign by column label rather than `.iloc[:, 4:] = ...`: on recent pandas an iloc
# assignment writes into the existing columns in place, which would leave the scraped
# text columns as object dtype instead of float.
ml_cols = moneylines.columns[4:]
moneylines[ml_cols] = moneylines[ml_cols].astype(float)

#Drop rows where all betting data is missing
# NOTE(review): this comment originally quoted "695 games", the same figure as the
# spreads cell; confirm whether that count applies to moneylines.
moneylines = moneylines[~moneylines[ml_cols].isna().all(axis=1)].reset_index(drop=True).copy()

In [119]:
# Per-game consensus moneyline: the row-wise mean over the individual sportsbook columns.
# Columns already named 'average_*' are excluded so that re-running this cell does not
# fold a previously computed average into the new mean.
ml_is_book_col = ~moneylines.columns.str.startswith('average_')
ml_is_ml_col = moneylines.columns.str.contains('ML')

average_home_ML = moneylines.loc[:, ml_is_book_col & moneylines.columns.str.contains('home') & ml_is_ml_col].mean(axis=1)
average_away_ML = moneylines.loc[:, ml_is_book_col & moneylines.columns.str.contains('away') & ml_is_ml_col].mean(axis=1)

moneylines['average_home_ML'] = average_home_ML
moneylines['average_away_ML'] = average_away_ML

In [120]:
# Fill each missing moneyline with the game's average for the same side;
# any column without 'home' in its name is treated as an away column.
for col in moneylines.columns[4:]:
    fallback_col = 'average_home_ML' if 'home' in col else 'average_away_ML'
    moneylines[col] = moneylines[col].fillna(moneylines[fallback_col])

In [131]:
# Keep only games for which both average moneylines could be computed.
moneylines = moneylines.dropna(subset=['average_away_ML', 'average_home_ML'])

In [133]:
# moneylines.to_csv(working_dir + 'all_moneylines.csv', index=False)