In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

from time import sleep
import re

In [87]:
class OddsScraper:
    def __init__(self, date):
        self.url = f'https://www.scoresandodds.com/nba?date={date}'
        self.driver = None
    
    def init_page(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')
        self.driver = webdriver.Chrome('/opt/homebrew/bin/chromedriver', options = options)
        self.driver.get(self.url)
    
    def navigate_to_menu(self):
        '''
        navigate to betting odds information menu
        '''
        try:
            game_detail_buttons = self.driver.find_elements_by_xpath("//*[contains(@aria-label, 'Game Details')]")
            for game_detail_button in game_detail_buttons:
                self.driver.execute_script("arguments[0].click();", game_detail_button) 
            
            # a small amount of pages have a game that won't load betting odds, bypass after 10 seconds
            start = datetime.now()
            while len(self.driver.find_elements_by_xpath("//*[contains(@id, 'compare-odds-tab')]")
                                                     ) != len(game_detail_buttons):
                if (datetime.now() - start).seconds > 10:
                    break
                pass
        
            compare_odds = self.driver.find_elements_by_xpath("//*[contains(@id, 'compare-odds-tab')]")[0]
            self.driver.execute_script("arguments[0].click();", compare_odds) 

            return True
        except Exception as e: # exception if the day doesnt have any games
            if 'no games scheduled' in self.driver.find_element_by_tag_name("h3").text:
                print('no games scheduled')
                return False
            raise e
    
    def isolate_rows(self):
        '''
        isolate the rows with information on odds and points
        '''
        odds_rows = []
        point_rows = []
        page = BeautifulSoup(self.driver.page_source, 'html.parser')
        
        for row in page.find_all('tr'):
            if (row.td is not None) and (row.span is not None):
                if 'class' in list(row.td.attrs.keys()):
                    if (('game-team' in row.td['class']) or ('game-time' in row.td['class'])
                               ) and ('team-rotation' in row.span['class'][0]):
                        odds_rows.append(row)
                    
            if 'class' in row.attrs:
                try:
                    if row['class'][0] == 'event-card-row':
                        point_rows.append(row)
                except IndexError:
                    pass
        
        return odds_rows, point_rows
    
    def parse_odds(self, rows):
        '''
        extract odds from html
        '''
        data = []
        
        for i in range(int(len(rows)/2)):
            first_row = rows[i*2]  # away team
            second_row = rows[i*2 + 1]  # mome team
            
            first_rows_with_data = [a for a in first_row.find_all('a') if 'rel' in a.attrs.keys()]
            second_rows_with_data = [a for a in second_row.find_all('a') if 'rel' in a.attrs.keys()]
            
            bet_type = BeautifulSoup(first_row.find_all('td')[1].get('data-content'), 'html.parser').find(
                'div', {'data-role': 'chassis'}).get('data-market')
            
            long_date = first_row.find_all('a')[0]['data-value']
            game_name = first_row.find_all('a')[0]['href']
            short_date = first_row.find_all('a')[0].text
            away_team = first_row.find_all('a')[1]['aria-label']
            away_odds = []
            
            for odds_index in range(len(first_rows_with_data)):
                sportsbook = re.findall('(?<=k\.)\w+(?=\.)|(?<=s\/)\w+', 
                                        first_rows_with_data[odds_index]['href'])[0]
                line = re.findall('(?<=\s)\S+(?=\s)', first_rows_with_data[odds_index].span.text)[0]
                
                if (i+1) % 3 == 0:  # moneyline bets have the same odds as the line
                    away_odds.append([sportsbook, line, line])
                else:  # spread and total bets
                    odds = re.findall('(?<=\s)\S+(?=\s)', first_rows_with_data[odds_index].small.text)[0]
                    away_odds.append([sportsbook, line, odds])
            
            home_team = second_row.find_all('a')[0]['aria-label']
            home_odds = []
            
            for odds_index in range(len(second_rows_with_data)):
                sportsbook = re.findall('(?<=k\.)\w+(?=\.)|(?<=s\/)\w+', 
                                        second_rows_with_data[odds_index]['href'])[0]
                line = re.findall('(?<=\s)\S+(?=\s)', second_rows_with_data[odds_index].span.text)[0]
                
                if (i+1) % 3 == 0:
                    home_odds.append([sportsbook, line, line])
                else:
                    odds = re.findall('(?<=\s)\S+(?=\s)', second_rows_with_data[odds_index].small.text)[0]
                    home_odds.append([sportsbook, line, odds])
            
            data.append([bet_type, long_date, away_team, 'away', home_team] + sum(away_odds, []))
            data.append([bet_type, long_date, home_team, 'home', away_team] + sum(home_odds, []))
        
        return data
    
    def parse_points(self, rows):
        '''
        extract points and calculate spread, total, and winner values
        '''
        data = []
        bet_types = ['spread', 'total', 'moneyline']
        
        for i in range(int(len(rows)/2)):
            first_team = rows[i*2]  # Away team
            second_team = rows[i*2 + 1]  # Home team
            
            first_team_name = first_team.find_all('a')[0]['aria-label']
            second_team_name = second_team.find_all('a')[0]['aria-label']
            first_team_points = int(first_team.select('.event-card-score.loss, .event-card-score.win'
                                                     )[0].text.strip())
            second_team_points = int(second_team.select('.event-card-score.loss, .event-card-score.win'
                                                       )[0].text.strip())
            
            
            for bet_type in bet_types:
                if bet_type == 'spread':
                    spread = first_team_points - second_team_points
                    data.append(['spread', first_team_name, spread])
                    data.append(['spread', second_team_name, -spread])
                elif bet_type == 'total':
                    total = first_team_points + second_team_points
                    data.append(['total', first_team_name, total])
                    data.append(['total', second_team_name, total])
                else:
                    if first_team_points > second_team_points:
                        data.append(['moneyline', first_team_name, 1])
                        data.append(['moneyline', second_team_name, 0])
                    else:
                        data.append(['moneyline', first_team_name, 0])
                        data.append(['moneyline', second_team_name, 1])
        
        return data
    
    def filter_event_card_scores(self, tags):
        '''
        filter out games that have been cancelled or postponed according to website
        such games are missing 'event-card-score' class or have a point value of 0
        '''
        filtered_tags = []
        for tag in tags:
            elements = tag.find_all(class_=lambda value: value and 'event-card-score' in value.split())
            
            for element in elements:
                if element.text.strip() != '0':
                    filtered_tags.append(tag)

        return filtered_tags

    
    def scrape_odds(self):
        '''
        driver function
        '''
        self.init_page()
        
        if self.navigate_to_menu():
            odds_rows, point_rows = self.isolate_rows()
            odds_data = self.parse_odds(odds_rows)
            point_data = self.parse_points(self.filter_event_card_scores(point_rows))
            
            base_columns = ['type', 'datetime', 'team', 'team_home_away', 'opponent']
            side_columns = [item for sublist in [['bookmaker' + str(i), 'line' + str(i), 'odds' + str(i)] for i 
                                   in np.arange(max((len(odd)-5)/3 for odd in odds_data))] for item in sublist]
            odds_df_prelim = pd.DataFrame(odds_data, columns = base_columns + side_columns)
            

            data = []
            for i, row in odds_df_prelim.iterrows():
                base_data = list(row[base_columns])
                book_data = list(row[side_columns])
                sub_book_data = [book_data[i:i + 3] for i in range(0, len(book_data), 3)]
                for triplet in sub_book_data:
                    data.append(base_data + triplet)
            odds_df = pd.DataFrame(data, columns = base_columns + ['bookmaker', 'line', 'odds'])
            points_df = pd.DataFrame(data = point_data, columns = ['type', 'team', 'calculated_val'])

            results = odds_df.merge(points_df, on=['type', 'team'])
            results['line'] = ['+0' if i == 'PK' else i for i in results['line']]
            results = results[~results['bookmaker'].isnull()]
            
            hits = []
            for _, row in results.iterrows():
                if row['type'] == 'spread':
                    if float(row['line']) + row['calculated_val'] > 0:
                        hits.append(1)
                    elif float(row['line']) + row['calculated_val'] < 0:
                        hits.append(0)
                    else:
                        hits.append('')
                elif row['type'] == 'total':
                    if row['line'][0] == 'o':
                        if row['calculated_val'] > float(row['line'][1:]):
                            hits.append(1)
                        elif row['calculated_val'] < float(row['line'][1:]):
                            hits.append(0)
                        else:
                            hits.append('')
                    if row['line'][0] == 'u':
                        if row['calculated_val'] < float(row['line'][1:]):
                            hits.append(1)
                        elif row['calculated_val'] > float(row['line'][1:]):
                            hits.append(0)
                        else:
                            hits.append('')
                elif row['type'] == 'moneyline':
                    hits.append(row['calculated_val'])
            
            results['hit'] = [0 if i == '' else i for i in hits]
            results['line'] = ['-100' if i == 'even' else i for i in results['line']]
            results['odds'] = [-100 if i == 'even' else int(i) for i in results['odds']]
            results['over_under'] = [i[0] if (i[0] == 'o' or i[0] == 'u') else None for i in results['line']]
            results['line_float'] = [float(i[1:]) if (i[0] == 'o' or i[0] == 'u') else float(i) for i in 
                                             results['line']]
            results['payout_multiplier'] = [100 if i == '' else i for i in (results['odds'] * results['hit'])]
            results['payout_multiplier'] = [
                (abs(i) + 100) / abs(i) if i < 0 else i / 100 + 1 if i > 0 else 0 
                for i in results['payout_multiplier']
            ]
            results['payout_multiplier'] = [1 if hits[i] == '' else list(results['payout_multiplier'])[i] 
                                        for i in range(len(results['payout_multiplier']))]
            results['favorite'] = [1 if i[0] == '-' else 0 if i[0] == '+' else None for i in results['line']]
            self.driver.quit()
            return results
        
        return None

# 2021-2022 regular season: every calendar day from the start date through the end date
reg_season_21_22_starts = datetime(2021, 10, 19)
reg_season_21_22_ends = datetime(2022, 4, 10)
reg_season_21_22_gen = pd.date_range(start=reg_season_21_22_starts, end=reg_season_21_22_ends)
reg_season_21_22_dates = [
    day for day in reg_season_21_22_gen.strftime("%Y-%m-%d")
    if day != '2022-02-20'  # all-star game day is left out
]

# 2021-2022 playoffs
playoff_21_22_begins = datetime(2022, 4, 16)
playoff_21_22_ends = datetime(2022, 6, 16)
playoff_21_22_gen = pd.date_range(start=playoff_21_22_begins, end=playoff_21_22_ends)
playoff_21_22_dates = playoff_21_22_gen.strftime("%Y-%m-%d").tolist()

# 2022-2023 regular season
reg_season_22_23_starts = datetime(2022, 10, 18)
reg_season_22_23_ends = datetime(2023, 4, 9)
reg_season_22_23_gen = pd.date_range(start=reg_season_22_23_starts, end=reg_season_22_23_ends)
reg_season_22_23_dates = [
    day for day in reg_season_22_23_gen.strftime("%Y-%m-%d")
    if day != '2023-02-19'  # all-star game day is left out
]

# 2022-2023 playoffs
playoff_22_23_begins = datetime(2023, 4, 15)
playoff_22_23_ends = datetime(2023, 6, 12)
playoff_22_23_gen = pd.date_range(start=playoff_22_23_begins, end=playoff_22_23_ends)
playoff_22_23_dates = playoff_22_23_gen.strftime("%Y-%m-%d").tolist()
# these three dates raise errors that could not be caught, so they are dropped from the sweep
for date in ('2023-05-24', '2023-05-26', '2023-05-28'):
    playoff_22_23_dates.remove(date)

# combine all dfs
def full_output(dates = None):
    '''
    scrape every requested date and stack the per-date results into one DataFrame

    dates: iterable of date strings to scrape; when omitted, both regular seasons and
           both playoff runs defined in this notebook are used
    returns the concatenated results with a fresh 0..n-1 index, or an empty DataFrame
    when no date produced any data
    errors raised while scraping a date are not caught here
    '''
    if dates is None:
        dates = (reg_season_21_22_dates + playoff_21_22_dates
                 + reg_season_22_23_dates + playoff_22_23_dates)
    
    output_dfs = []
    for date in dates:
        odds = OddsScraper(date).scrape_odds()
        if odds is not None:  # scrape_odds returns None when no games are scheduled
            output_dfs.append(odds)
    
    # pd.concat raises ValueError on an empty list, so handle the nothing-scraped case here
    if not output_dfs:
        return pd.DataFrame()
    return pd.concat(output_dfs, ignore_index = True)

In [83]:
# 2021-2022 regular season: every calendar day from the start date through the end date
reg_season_21_22_starts = datetime(2021, 10, 19)
reg_season_21_22_ends = datetime(2022, 4, 10)
reg_season_21_22_gen = pd.date_range(start=reg_season_21_22_starts, end=reg_season_21_22_ends)
reg_season_21_22_dates = [
    day for day in reg_season_21_22_gen.strftime("%Y-%m-%d")
    if day != '2022-02-20'  # all-star game day is left out
]

# 2021-2022 playoffs
playoff_21_22_begins = datetime(2022, 4, 16)
playoff_21_22_ends = datetime(2022, 6, 16)
playoff_21_22_gen = pd.date_range(start=playoff_21_22_begins, end=playoff_21_22_ends)
playoff_21_22_dates = playoff_21_22_gen.strftime("%Y-%m-%d").tolist()

In [85]:
# smoke test: scrape every 2021-2022 playoff date and report which ones fail
for n, game_date in enumerate(playoff_21_22_dates, start=1):
    try:
        scraper = OddsScraper(game_date)
        odds = scraper.scrape_odds()
    except Exception as exc:
        # keep sweeping the remaining dates, but show the cause and let
        # KeyboardInterrupt / SystemExit stop the loop (a bare except would swallow them)
        print(f'{n}) {game_date} failed: {exc!r}')
    else:
        print(f'{n}) {game_date} succeeded.')

1) 2022-04-16 succeeded.
2) 2022-04-17 succeeded.
3) 2022-04-18 succeeded.
4) 2022-04-19 succeeded.
5) 2022-04-20 succeeded.
6) 2022-04-21 succeeded.
7) 2022-04-22 succeeded.
8) 2022-04-23 succeeded.
9) 2022-04-24 succeeded.
10) 2022-04-25 succeeded.
11) 2022-04-26 succeeded.
12) 2022-04-27 succeeded.
13) 2022-04-28 succeeded.
14) 2022-04-29 succeeded.
no games scheduled
15) 2022-04-30 succeeded.
16) 2022-05-01 succeeded.
17) 2022-05-02 succeeded.
18) 2022-05-03 succeeded.
19) 2022-05-04 succeeded.
no games scheduled
20) 2022-05-05 succeeded.
21) 2022-05-06 succeeded.
22) 2022-05-07 succeeded.
23) 2022-05-08 succeeded.
24) 2022-05-09 succeeded.
25) 2022-05-10 succeeded.
26) 2022-05-11 succeeded.
27) 2022-05-12 succeeded.
28) 2022-05-13 succeeded.
no games scheduled
29) 2022-05-14 succeeded.
30) 2022-05-15 succeeded.
no games scheduled
31) 2022-05-16 succeeded.
32) 2022-05-17 succeeded.
33) 2022-05-18 succeeded.
34) 2022-05-19 succeeded.
35) 2022-05-20 succeeded.
36) 2022-05-21 succeede

In [89]:
# spot-check a single date: 2022-06-16 is the end date of the 2021-2022 playoff range
scraper = OddsScraper('2022-06-16')
# scrape_odds returns the results DataFrame, or None when no games are scheduled
odds = scraper.scrape_odds()

In [90]:
# display the frame returned by scrape_odds for inspection
odds

Unnamed: 0,type,datetime,team,team_home_away,opponent,bookmaker,line,odds,calculated_val,hit,over_under,line_float,payout_multiplier,favorite
0,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,betmgm,+3.5,-105,13,1,,3.5,1.952381,0.0
1,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,draftkings,+4,-110,13,1,,4.0,1.909091,0.0
2,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,fanduel,+3.5,-106,13,1,,3.5,1.943396,0.0
3,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,caesars,+4,-110,13,1,,4.0,1.909091,0.0
4,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,pointsbet,+3.5,-110,13,1,,3.5,1.909091,0.0
5,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,sugarhouse,+4,-114,13,1,,4.0,1.877193,0.0
6,spread,2022-06-17T01:00:00Z,Warriors,away,Celtics,unibet,+4,-114,13,1,,4.0,1.877193,0.0
7,spread,2022-06-17T01:00:00Z,Celtics,home,Warriors,betmgm,-3.5,-115,-13,0,,-3.5,0.0,1.0
8,spread,2022-06-17T01:00:00Z,Celtics,home,Warriors,draftkings,-4,-110,-13,0,,-4.0,0.0,1.0
9,spread,2022-06-17T01:00:00Z,Celtics,home,Warriors,fanduel,-3.5,-114,-13,0,,-3.5,0.0,1.0
