In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

from time import sleep
import re

In [241]:
class OddsScraper:
    def __init__(self, date):
        self.url = f'https://www.scoresandodds.com/nba?date={date}'
        self.driver = None
    
    def init_page(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')
        self.driver = webdriver.Chrome('/opt/homebrew/bin/chromedriver', options = options)
        self.driver.get(self.url)
    
    def navigate_to_menu(self):
        '''
        navigate to betting odds information menu
        '''
        try:
            game_detail_buttons = self.driver.find_elements_by_xpath("//*[contains(@aria-label, 'Game Details')]")
            for game_detail_button in game_detail_buttons:
                self.driver.execute_script("arguments[0].click();", game_detail_button) 
            
            # a small amount of pages have a game that won't load betting odds, bypass after 10 seconds
            start = datetime.now()
            while len(self.driver.find_elements_by_xpath("//*[contains(@id, 'compare-odds-tab')]")
                                                     ) != len(game_detail_buttons):
                if (datetime.now() - start).seconds > 10:
                    break
                pass
        
            compare_odds = self.driver.find_elements_by_xpath("//*[contains(@id, 'compare-odds-tab')]")[0]
            self.driver.execute_script("arguments[0].click();", compare_odds) 

            return True
        except Exception as e: # exception if the day doesnt have any games
            if 'no games scheduled' in self.driver.find_element_by_tag_name("h3").text:
                print('no games scheduled')
                return False
            raise e
    
    def isolate_rows(self):
        '''
        isolate the rows with information on odds and points
        '''
        odds_rows = []
        point_rows = []
        page = BeautifulSoup(self.driver.page_source, 'html.parser')
        
        for row in page.find_all('tr'):
            if (row.td is not None) and (row.span is not None):
                if 'class' in list(row.td.attrs.keys()):
                    if (('game-team' in row.td['class']) or ('game-time' in row.td['class'])
                               ) and ('team-rotation' in row.span['class'][0]):
                        odds_rows.append(row)
                    
            if 'class' in row.attrs:
                try:
                    if row['class'][0] == 'event-card-row':
                        point_rows.append(row)
                except IndexError:
                    pass
        
        return odds_rows, point_rows
    
    def extract_odds(self, row, row_index):
        rows_with_data = [a for a in row.find_all('a') if 'rel' in a.attrs.keys()]
        odds = []
        for odds_index in range(len(rows_with_data)):
            sportsbook = re.findall('(?<=k\.)\w+(?=\.)|(?<=s\/)\w+', rows_with_data[odds_index]['href'])[0]
            line = re.findall('(?<=\s)\S+(?=\s)', rows_with_data[odds_index].span.text)[0]
            if (row_index+1) % 3 == 0:  # moneyline bets have the same odds as the line
                odds.append([sportsbook, line, line])
            else:  # spread and total bets
                odd = re.findall('(?<=\s)\S+(?=\s)', rows_with_data[odds_index].small.text)[0]
                odds.append([sportsbook, line, odd])
        return odds
    
    def parse_odds(self, rows):
        '''
        extract odds from html
        '''
        data = []
        
        for row_index in range(int(len(rows)/2)):
            first_row = rows[row_index*2]  # away team
            second_row = rows[row_index*2 + 1]  # mome team
            
            bet_type = BeautifulSoup(first_row.find_all('td')[1].get('data-content'), 'html.parser').find(
                'div', {'data-role': 'chassis'}).get('data-market')
            
            long_date = first_row.find_all('a')[0]['data-value']
            game_name = first_row.find_all('a')[0]['href']
            short_date = first_row.find_all('a')[0].text
            away_team = first_row.find_all('a')[1]['aria-label']
            away_odds = self.extract_odds(first_row, row_index)
            
            home_team = second_row.find_all('a')[0]['aria-label']
            home_odds = self.extract_odds(second_row, row_index)
            
            data.append([bet_type, long_date, away_team, 'away', home_team] + sum(away_odds, []))
            data.append([bet_type, long_date, home_team, 'home', away_team] + sum(home_odds, []))
        
        return data
    
    def parse_points(self, rows):
        '''
        extract points and calculate spread, total, and winner values
        '''
        data = []
        bet_types = ['spread', 'total', 'moneyline']
        
        for i in range(int(len(rows)/2)):
            first_team = rows[i*2]  # Away team
            second_team = rows[i*2 + 1]  # Home team
            
            first_team_name = first_team.find_all('a')[0]['aria-label']
            second_team_name = second_team.find_all('a')[0]['aria-label']
            first_team_points = int(first_team.select('.event-card-score.loss, .event-card-score.win'
                                                     )[0].text.strip())
            second_team_points = int(second_team.select('.event-card-score.loss, .event-card-score.win'
                                                       )[0].text.strip())
            
            
            for bet_type in bet_types:
                if bet_type == 'spread':
                    spread = first_team_points - second_team_points
                    data.append(['spread', first_team_name, spread])
                    data.append(['spread', second_team_name, -spread])
                elif bet_type == 'total':
                    total = first_team_points + second_team_points
                    data.append(['total', first_team_name, total])
                    data.append(['total', second_team_name, total])
                else:
                    if first_team_points > second_team_points:
                        data.append(['moneyline', first_team_name, 1])
                        data.append(['moneyline', second_team_name, 0])
                    else:
                        data.append(['moneyline', first_team_name, 0])
                        data.append(['moneyline', second_team_name, 1])
        
        return data
    
    def filter_event_card_scores(self, tags):
        '''
        filter out games that have been cancelled or postponed according to website
        such games are missing 'event-card-score' class or have a point value of 0
        '''
        filtered_tags = []
        for tag in tags:
            elements = tag.find_all(class_=lambda value: value and 'event-card-score' in value.split())
            
            for element in elements:
                if element.text.strip() != '0':
                    filtered_tags.append(tag)

        return filtered_tags

    
    def scrape_odds(self):
        '''
        driver function
        '''
        self.init_page()
        
        if self.navigate_to_menu():
            odds_rows, point_rows = self.isolate_rows()
            odds_data = self.parse_odds(odds_rows)
            point_data = self.parse_points(self.filter_event_card_scores(point_rows))
            
            base_columns = ['type', 'datetime', 'team', 'team_home_away', 'opponent']
            side_columns = [item for sublist in [['bookmaker' + str(i), 'line' + str(i), 'odds' + str(i)] for i 
                                   in np.arange(max((len(odd)-5)/3 for odd in odds_data))] for item in sublist]
            odds_df_prelim = pd.DataFrame(odds_data, columns = base_columns + side_columns)
            

            data = []
            for i, row in odds_df_prelim.iterrows():
                base_data = list(row[base_columns])
                book_data = list(row[side_columns])
                sub_book_data = [book_data[i:i + 3] for i in range(0, len(book_data), 3)]
                for triplet in sub_book_data:
                    data.append(base_data + triplet)
            odds_df = pd.DataFrame(data, columns = base_columns + ['bookmaker', 'line', 'odds'])
            points_df = pd.DataFrame(data = point_data, columns = ['type', 'team', 'calculated_val'])

            results = odds_df.merge(points_df, on=['type', 'team'])
            results['line'] = ['+0' if i == 'PK' else i for i in results['line']]
            results = results[~results['bookmaker'].isnull()]
            
            hits = []
            for _, row in results.iterrows():
                if row['type'] == 'spread':
                    if float(row['line']) + row['calculated_val'] > 0:
                        hits.append(1)
                    elif float(row['line']) + row['calculated_val'] < 0:
                        hits.append(0)
                    else:
                        hits.append('')
                elif row['type'] == 'total':
                    if row['line'][0] == 'o':
                        if row['calculated_val'] > float(row['line'][1:]):
                            hits.append(1)
                        elif row['calculated_val'] < float(row['line'][1:]):
                            hits.append(0)
                        else:
                            hits.append('')
                    if row['line'][0] == 'u':
                        if row['calculated_val'] < float(row['line'][1:]):
                            hits.append(1)
                        elif row['calculated_val'] > float(row['line'][1:]):
                            hits.append(0)
                        else:
                            hits.append('')
                elif row['type'] == 'moneyline':
                    hits.append(row['calculated_val'])
            
            results['hit'] = [0 if i == '' else i for i in hits]
            results['line'] = ['-100' if i == 'even' else i for i in results['line']]
            results['odds'] = [-100 if i == 'even' else int(i) for i in results['odds']]
            results['over_under'] = [i[0] if (i[0] == 'o' or i[0] == 'u') else None for i in results['line']]
            results['line_float'] = [float(i[1:]) if (i[0] == 'o' or i[0] == 'u') else float(i) for i in 
                                             results['line']]
            results['payout_multiplier'] = [100 if i == '' else i for i in (results['odds'] * results['hit'])]
            results['payout_multiplier'] = [
                (abs(i) + 100) / abs(i) if i < 0 else i / 100 + 1 if i > 0 else 0 
                for i in results['payout_multiplier']
            ]
            results['payout_multiplier'] = [1 if hits[i] == '' else list(results['payout_multiplier'])[i] 
                                        for i in range(len(results['payout_multiplier']))]
            results['favorite'] = [1 if i[0] == '-' else 0 if i[0] == '+' else None for i in results['line']]
            self.driver.quit()
            return results
        
        return None
    
def printProgressBar(iteration, total, prefix = '', suffix = '', 
                     decimals = 1, length = 40, fill = '█', printEnd = "\r"):
    """
    loop to create terminal progress bar
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    if iteration == total: 
        print()
        

def create_dates(season, reg_szn_start, reg_szn_end, 
                 playoff_szn_start, playoff_szn_end, 
                 allstar_date, other_remove_dates = []):
    reg_szn_generated = pd.date_range(datetime.strptime(reg_szn_start, "%Y-%m-%d"), 
                                         datetime.strptime(reg_szn_end, "%Y-%m-%d"))
    reg_szn_dates = list(reg_szn_generated.strftime("%Y-%m-%d"))
    reg_szn_dates.remove(allstar_date)
    
    playoff_szn_generated = pd.date_range(datetime.strptime(playoff_szn_start, "%Y-%m-%d"), 
                                         datetime.strptime(playoff_szn_end, "%Y-%m-%d"))
    playoff_szn_dates = list(playoff_szn_generated.strftime("%Y-%m-%d"))
    dates = reg_szn_dates + playoff_szn_dates
    for remove_date in other_remove_dates:
        dates.remove(remove_date)
    return season, dates, playoff_szn_start
    
    
_21_22_szn = create_dates("2021-2022", "2021-10-19", "2022-04-10",
                          "2022-04-16", "2022-06-16", "2022-02-20")

_22_23_szn = create_dates("2022-2023", "2022-10-18", "2023-04-09",
                          "2023-04-15", "2023-06-12", "2023-02-19",
                          ["2023-05-24", "2023-05-26", "2023-05-28"])

test = create_dates("2022-2023", "2023-02-17", "2023-02-20",
                          "2023-05-24", "2023-05-28", "2023-02-19",
                          ["2023-05-24", "2023-05-26", "2023-05-28"])

season_lsts = [_21_22_szn, _22_23_szn]

# combine all dfs
def full_output(season_lsts = season_lsts):
    output_dfs = []
    idx, total_dates = 0, sum([len(i[1]) for i in season_lsts])
    for season_lst in season_lsts:
        season_year = season_lst[0]
        season_dates = season_lst[1]
        playoff_starts = season_lst[2]
        
        for date in season_dates:
            print(f'{idx + 1}) {date} / {total_dates - idx - 1} left')
            idx += 1
            scraper = OddsScraper(date)
            odds = scraper.scrape_odds()
            if odds is not None:
                odds['season'] = season_year
                if datetime.strptime(date, "%Y-%m-%d") >= datetime.strptime(playoff_starts, "%Y-%m-%d"):
                    odds['reg_or_playoff'] = 'playoff'
                else:
                    odds['reg_or_playoff'] = 'reg'
            output_dfs.append(odds)
    output = pd.concat(output_dfs, ignore_index = True)
    return output

In [244]:
%%time
output = full_output()

1) 2021-10-19 / 463 left
2) 2021-10-20 / 462 left
3) 2021-10-21 / 461 left
4) 2021-10-22 / 460 left
5) 2021-10-23 / 459 left
6) 2021-10-24 / 458 left
7) 2021-10-25 / 457 left
8) 2021-10-26 / 456 left
9) 2021-10-27 / 455 left
10) 2021-10-28 / 454 left
11) 2021-10-29 / 453 left
12) 2021-10-30 / 452 left
13) 2021-10-31 / 451 left
14) 2021-11-01 / 450 left
15) 2021-11-02 / 449 left
16) 2021-11-03 / 448 left
17) 2021-11-04 / 447 left
18) 2021-11-05 / 446 left
19) 2021-11-06 / 445 left
20) 2021-11-07 / 444 left
21) 2021-11-08 / 443 left
22) 2021-11-09 / 442 left
23) 2021-11-10 / 441 left
24) 2021-11-11 / 440 left
25) 2021-11-12 / 439 left
26) 2021-11-13 / 438 left
27) 2021-11-14 / 437 left
28) 2021-11-15 / 436 left
29) 2021-11-16 / 435 left
30) 2021-11-17 / 434 left
31) 2021-11-18 / 433 left
32) 2021-11-19 / 432 left
33) 2021-11-20 / 431 left
34) 2021-11-21 / 430 left
35) 2021-11-22 / 429 left
36) 2021-11-23 / 428 left
37) 2021-11-24 / 427 left
38) 2021-11-25 / 426 left
no games scheduled
39

290) 2022-12-11 / 174 left
291) 2022-12-12 / 173 left
292) 2022-12-13 / 172 left
293) 2022-12-14 / 171 left
294) 2022-12-15 / 170 left
295) 2022-12-16 / 169 left
296) 2022-12-17 / 168 left
297) 2022-12-18 / 167 left
298) 2022-12-19 / 166 left
299) 2022-12-20 / 165 left
300) 2022-12-21 / 164 left
301) 2022-12-22 / 163 left
302) 2022-12-23 / 162 left
303) 2022-12-24 / 161 left
no games scheduled
304) 2022-12-25 / 160 left
305) 2022-12-26 / 159 left
306) 2022-12-27 / 158 left
307) 2022-12-28 / 157 left
308) 2022-12-29 / 156 left
309) 2022-12-30 / 155 left
310) 2022-12-31 / 154 left
311) 2023-01-01 / 153 left
312) 2023-01-02 / 152 left
313) 2023-01-03 / 151 left
314) 2023-01-04 / 150 left
315) 2023-01-05 / 149 left
316) 2023-01-06 / 148 left
317) 2023-01-07 / 147 left
318) 2023-01-08 / 146 left
319) 2023-01-09 / 145 left
320) 2023-01-10 / 144 left
321) 2023-01-11 / 143 left
322) 2023-01-12 / 142 left
323) 2023-01-13 / 141 left
324) 2023-01-14 / 140 left
325) 2023-01-15 / 139 left
326) 2023

In [245]:
output.to_csv('nbaodds.csv')

Unnamed: 0,type,datetime,team,team_home_away,opponent,bookmaker,line,odds,calculated_val,hit,over_under,line_float,payout_multiplier,favorite,season,reg_or_playoff
0,spread,2021-10-20T00:30:00Z,Nets,away,Bucks,betmgm,+1.5,-110.0,-23,0.0,,1.5,0.000000,0.0,2021-2022,reg
1,spread,2021-10-20T00:30:00Z,Nets,away,Bucks,draftkings,+1.5,-105.0,-23,0.0,,1.5,0.000000,0.0,2021-2022,reg
2,spread,2021-10-20T00:30:00Z,Nets,away,Bucks,fanduel,+1.5,-106.0,-23,0.0,,1.5,0.000000,0.0,2021-2022,reg
3,spread,2021-10-20T00:30:00Z,Nets,away,Bucks,caesars,+1.5,-110.0,-23,0.0,,1.5,0.000000,0.0,2021-2022,reg
4,spread,2021-10-20T00:30:00Z,Nets,away,Bucks,pointsbet,+1.5,-110.0,-23,0.0,,1.5,0.000000,0.0,2021-2022,reg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123137,moneyline,2023-06-13T00:30:00Z,Nuggets,home,Heat,draftkings,-340,-340.0,1,1.0,,-340.0,1.294118,1.0,2022-2023,playoff
123138,moneyline,2023-06-13T00:30:00Z,Nuggets,home,Heat,riverscasino,-345,-345.0,1,1.0,,-345.0,1.289855,1.0,2022-2023,playoff
123139,moneyline,2023-06-13T00:30:00Z,Nuggets,home,Heat,sportsillustrated,-350,-350.0,1,1.0,,-350.0,1.285714,1.0,2022-2023,playoff
123140,moneyline,2023-06-13T00:30:00Z,Nuggets,home,Heat,wynnbet,-355,-355.0,1,1.0,,-355.0,1.281690,1.0,2022-2023,playoff
