In [857]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

from time import sleep
import re

In [None]:
class OddsScraper:
    """Scrape one scoresandodds.com NBA page and grade every bet listed on it.

    Typical use is ``OddsScraper(url).scrape_odds()``: it opens the page in a
    headless Chrome session, pulls the odds-comparison and score rows out of
    the rendered HTML with BeautifulSoup, and returns one DataFrame row per
    (bet type, team, bookmaker).

    ``init_page``, ``navigate_to_menu`` and ``isolate_rows`` stay public so a
    page can be inspected step by step; when used that way, call ``close()``
    once finished so the browser process is released.
    """

    # Leading columns shared by every odds record produced by parse_odds.
    BASE_COLUMNS = ['type', 'datetime', 'team', 'team_home_away', 'opponent']
    # Score cells carry a win/loss class once a result is posted.
    SCORE_SELECTOR = '.event-card-score.loss, .event-card-score.win'
    # Upper bound on how long navigate_to_menu waits for the game drawers.
    MENU_TIMEOUT_SECONDS = 10
    # Bookmaker name taken from the outbound link (same patterns as before,
    # now raw strings so the backslashes are not invalid string escapes).
    _SPORTSBOOK_RE = re.compile(r'(?<=k\.)\w+(?=\.)|(?<=s\/)\w+')
    # First whitespace-delimited token inside a cell's text.
    _TOKEN_RE = re.compile(r'(?<=\s)\S+(?=\s)')

    def __init__(self, url, driver_path='/opt/homebrew/bin/chromedriver'):
        """Remember the page to scrape; no browser is started yet.

        Args:
            url: Full scoresandodds.com URL for one day.
            driver_path: Location of the chromedriver executable. Defaults to
                the path that used to be hard-coded in init_page.
        """
        self.url = url
        self.driver_path = driver_path
        self.driver = None

    def init_page(self):
        """Start a headless, incognito Chrome session and load ``self.url``."""
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')
        # NOTE(review): the positional driver path is kept as in the original;
        # newer Selenium releases may require a Service object — confirm
        # against the installed version.
        self.driver = webdriver.Chrome(self.driver_path, options=options)
        self.driver.get(self.url)

    def close(self):
        """Quit the browser if one is running. Safe to call repeatedly."""
        if self.driver is not None:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def navigate_to_menu(self):
        """Open every game drawer and switch to the compare-odds tab.

        Returns:
            True when the compare-odds tab was clicked, False when the page
            reports that no games are scheduled.

        Raises:
            Exception: whatever went wrong, re-raised unchanged, unless the
                page shows the "no games scheduled" heading.
        """
        detail_xpath = "//*[contains(@aria-label, 'Game Details')]"
        tab_xpath = "//*[contains(@id, 'compare-odds-tab')]"
        try:
            game_detail_buttons = self.driver.find_elements(By.XPATH, detail_xpath)
            for game_detail_button in game_detail_buttons:
                # Click through JavaScript so overlapping elements cannot
                # intercept the click.
                self.driver.execute_script("arguments[0].click();", game_detail_button)

            # Wait until each opened drawer exposes its compare-odds tab, but
            # give up after the timeout and carry on with what is there.
            start = datetime.now()
            while len(self.driver.find_elements(By.XPATH, tab_xpath)) != len(game_detail_buttons):
                if (datetime.now() - start).total_seconds() > self.MENU_TIMEOUT_SECONDS:
                    break
                sleep(0.1)  # poll instead of spinning a CPU core

            compare_odds = self.driver.find_elements(By.XPATH, tab_xpath)[0]
            self.driver.execute_script("arguments[0].click();", compare_odds)

            return True
        except Exception:
            # find_elements returns [] rather than raising, so a page without
            # an <h3> cannot mask the original error.
            headings = self.driver.find_elements(By.TAG_NAME, 'h3')
            if headings and 'no games scheduled' in headings[0].text:
                print('no games scheduled')
                return False
            raise

    def isolate_rows(self):
        """Split the rendered page's ``<tr>`` elements into odds and score rows.

        Returns:
            ``(odds_rows, point_rows)``. Odds rows are those whose first cell
            has class ``game-team`` or ``game-time`` and whose first span's
            first class contains ``team-rotation``. Point rows are those whose
            first class is ``event-card-row``.
        """
        odds_rows = []
        point_rows = []
        page = BeautifulSoup(self.driver.page_source, 'html.parser')

        for row in page.find_all('tr'):
            if row.td is not None and row.span is not None:
                cell_classes = row.td.get('class') or []
                # A span without a class attribute simply does not match.
                span_classes = row.span.get('class') or ['']
                if (('game-team' in cell_classes or 'game-time' in cell_classes)
                        and 'team-rotation' in span_classes[0]):
                    odds_rows.append(row)

            row_classes = row.get('class') or [None]
            if row_classes[0] == 'event-card-row':
                point_rows.append(row)

        return odds_rows, point_rows

    def _parse_book_odds(self, row, is_moneyline):
        """Return a flat ``[bookmaker, line, odds, ...]`` list for one team row."""
        triplets = []
        for anchor in row.find_all('a'):
            # Only bookmaker links carry a rel attribute.
            if 'rel' not in anchor.attrs:
                continue
            sportsbook = self._SPORTSBOOK_RE.findall(anchor['href'])[0]
            line = self._TOKEN_RE.findall(anchor.span.text)[0]
            if is_moneyline:
                # Moneyline cells hold a single number that is both line and odds.
                odds = line
            else:
                odds = self._TOKEN_RE.findall(anchor.small.text)[0]
            triplets.extend([sportsbook, line, odds])
        return triplets

    def parse_odds(self, rows):
        """Turn consecutive (away, home) odds rows into flat records.

        Args:
            rows: Odds rows from isolate_rows, two per table.

        Returns:
            A list with two records per table, each
            ``[type, datetime, team, 'away'|'home', opponent]`` followed by a
            ``bookmaker, line, odds`` triplet per bookmaker.
        """
        data = []

        for pair_index in range(len(rows) // 2):
            away_row = rows[pair_index * 2]
            home_row = rows[pair_index * 2 + 1]

            market_html = away_row.find_all('td')[1].get('data-content')
            bet_type = BeautifulSoup(market_html, 'html.parser').find(
                'div', {'data-role': 'chassis'}).get('data-market')
            # Decide from the table's own market label, not from its position:
            # counting every third table breaks as soon as one game is missing
            # a table. 'moneyline' is the label the grading step matches on.
            is_moneyline = bet_type == 'moneyline'

            away_links = away_row.find_all('a')
            long_date = away_links[0]['data-value']
            away_team = away_links[1]['aria-label']
            home_team = home_row.find_all('a')[0]['aria-label']

            data.append([bet_type, long_date, away_team, 'away', home_team]
                        + self._parse_book_odds(away_row, is_moneyline))
            data.append([bet_type, long_date, home_team, 'home', away_team]
                        + self._parse_book_odds(home_row, is_moneyline))

        return data

    def _final_score(self, row):
        """Return the row's score as an int, or None if no win/loss cell exists."""
        cells = row.select(self.SCORE_SELECTOR)
        if not cells:
            return None
        return int(cells[0].text.strip())

    def parse_points(self, rows):
        """Compute the graded value of each bet type from the scores.

        Args:
            rows: Score rows from isolate_rows, consecutive (away, home) pairs.

        Returns:
            ``[type, team, value]`` records: the team's margin for 'spread',
            combined points for 'total', and 1/0 for 'moneyline'. Pairs where
            either team has no win/loss score cell are skipped rather than
            aborting the whole day.
        """
        data = []

        for pair_index in range(len(rows) // 2):
            away_row = rows[pair_index * 2]
            home_row = rows[pair_index * 2 + 1]

            away_name = away_row.find_all('a')[0]['aria-label']
            home_name = home_row.find_all('a')[0]['aria-label']
            away_points = self._final_score(away_row)
            home_points = self._final_score(home_row)
            if away_points is None or home_points is None:
                continue

            margin = away_points - home_points
            total = away_points + home_points
            # A tie is credited to the second (home) row, as before.
            away_won = 1 if away_points > home_points else 0

            data.append(['spread', away_name, margin])
            data.append(['spread', home_name, -margin])
            data.append(['total', away_name, total])
            data.append(['total', home_name, total])
            data.append(['moneyline', away_name, away_won])
            data.append(['moneyline', home_name, 1 - away_won])

        return data

    @staticmethod
    def _grade_bet(bet_type, line, value):
        """Return 1 for a winning bet, 0 for a losing one, None for a push."""
        if bet_type == 'moneyline':
            return int(value)
        if bet_type == 'spread':
            edge = float(line) + value
        elif bet_type == 'total':
            side = line[0]
            if side == 'o':
                edge = value - float(line[1:])
            elif side == 'u':
                edge = float(line[1:]) - value
            else:
                # Previously nothing was recorded here, which shifted every
                # later grade onto the wrong row.
                raise ValueError(f'unrecognised total line: {line!r}')
        else:
            raise ValueError(f'unrecognised bet type: {bet_type!r}')
        if edge > 0:
            return 1
        if edge < 0:
            return 0
        return None

    @staticmethod
    def _payout_multiplier(odds):
        """Convert American odds into the total return per unit staked."""
        if odds < 0:
            return (abs(odds) + 100) / abs(odds)
        if odds > 0:
            return odds / 100 + 1
        return 0.0

    def _build_results(self, odds_data, point_data):
        """Join odds records with graded values and derive the result columns."""
        # One row per bookmaker triplet; rows may list different numbers of
        # bookmakers, so no padding columns are needed.
        long_records = [
            list(record[:len(self.BASE_COLUMNS)]) + list(record[start:start + 3])
            for record in odds_data
            for start in range(len(self.BASE_COLUMNS), len(record), 3)
        ]
        odds_df = pd.DataFrame(long_records,
                               columns=self.BASE_COLUMNS + ['bookmaker', 'line', 'odds'])
        points_df = pd.DataFrame(point_data, columns=['type', 'team', 'calculated_val'])

        results = odds_df.merge(points_df, on=['type', 'team'])
        # .copy() so the assignments below write to an independent frame.
        results = results[results['bookmaker'].notnull()].copy()
        results['line'] = ['+0' if line == 'PK' else line for line in results['line']]

        grades = [
            self._grade_bet(bet_type, line, value)
            for bet_type, line, value in zip(
                results['type'], results['line'], results['calculated_val'])
        ]
        # A push is recorded as 0, matching the previous behaviour.
        results['hit'] = [0 if grade is None else grade for grade in grades]
        results['line'] = ['-100' if line == 'even' else line for line in results['line']]
        results['odds'] = [-100 if odds == 'even' else int(odds) for odds in results['odds']]
        results['over_under'] = [
            line[0] if line[0] in ('o', 'u') else None for line in results['line']]
        results['line_float'] = [
            float(line[1:]) if line[0] in ('o', 'u') else float(line)
            for line in results['line']]
        results['payout_multiplier'] = [
            self._payout_multiplier(odds) if hit else 0.0
            for odds, hit in zip(results['odds'], results['hit'])]
        results['favorite'] = [
            1 if line[0] == '-' else 0 if line[0] == '+' else None
            for line in results['line']]

        return results

    def scrape_odds(self):
        """Run the whole pipeline for ``self.url``.

        Returns:
            A DataFrame with one row per (bet type, team, bookmaker), or None
            when the page reports that no games are scheduled. The browser is
            always closed before returning or raising.
        """
        self.init_page()
        try:
            if not self.navigate_to_menu():
                return None
            odds_rows, point_rows = self.isolate_rows()
        finally:
            # The parsed rows no longer need the live page.
            self.close()

        odds_data = self.parse_odds(odds_rows)
        point_data = self.parse_points(point_rows)
        return self._build_results(odds_data, point_data)


In [382]:
# Pick the day to inspect by leaving exactly one `url` assignment uncommented.
#url = 'https://www.scoresandodds.com/nba?date=2023-04-26'
url = 'https://www.scoresandodds.com/nba?date=2023-06-02'
# TODO: 2023-03-07 still fails (a game is missing from the page) - fix later.
#url = 'https://www.scoresandodds.com/nba?date=2023-03-07'
#url = 'https://www.scoresandodds.com/nba?date=2023-03-27'
# Fetch the page's static HTML with requests and parse it.
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')

In [764]:
%%time
def init_page(url):
    '''
    open url, navigate to menu for BeautifulSoup to access odds

    Returns the live webdriver on success. When the compare-odds tab cannot be
    found, the browser is closed and a string is returned instead:
    'no games today' if the page says no games are scheduled, otherwise
    'some other error'. Any other exception closes the browser and propagates.
    '''
    detail_xpath = "//*[contains(@aria-label, 'Game Details')]"
    tab_xpath = "//*[contains(@id, 'compare-odds-tab')]"

    # initiate webdriver
    d = webdriver.Chrome('/opt/homebrew/bin/chromedriver')

    try:
        d.get(url)

        # isolate and click game_detail button, switch tabs
        game_detail_buttons = d.find_elements(By.XPATH, detail_xpath)
        for game_detail_button in game_detail_buttons:
            d.execute_script("arguments[0].click();", game_detail_button)

        # wait (at most ~10s) for each opened drawer to expose its tab
        start = datetime.now()
        while len(d.find_elements(By.XPATH, tab_xpath)) != len(game_detail_buttons):
            if (datetime.now() - start).total_seconds() > 10:
                break
            sleep(0.1)  # poll instead of spinning a CPU core

        # click compare odds tab; IndexError here means no tab was rendered
        compare_odds = d.find_elements(By.XPATH, tab_xpath)[0]
        d.execute_script("arguments[0].click();", compare_odds)

        return d
    except IndexError:
        try:
            # find_elements returns [] when the page has no <h3>
            headings = d.find_elements(By.TAG_NAME, 'h3')
            no_games = bool(headings) and 'no games scheduled' in headings[0].text
        finally:
            # the caller only gets a string back, so nobody else can close it
            d.quit()
        return 'no games today' if no_games else 'some other error'
    except Exception:
        d.quit()
        raise
        
        
    # change bet type
    # bet_type_menu = d.find_elements_by_xpath("//*[contains(@data-content, 'bet-type')]")[0]
    # d.execute_script("arguments[0].click();", bet_type_menu) 

    # while len(d.find_elements_by_xpath("//*[contains(@data-content, '.odds-table-moneyline')]")) == 0:
    #     pass
    # moneyline = d.find_elements_by_xpath("//*[contains(@data-content, '.odds-table-moneyline')]")[0]
    # d.execute_script("arguments[0].click();", moneyline) 

    #.odds-table-total
    
def isolate_rows(page):
    '''
    split the rendered page's table rows into odds rows and score rows

    page is a webdriver whose page_source is parsed with BeautifulSoup.
    returns (odds_rows, point_rows)
    '''
    soup = BeautifulSoup(page.page_source, 'html.parser')
    odds_rows, point_rows = [], []

    for tr in soup.find_all('tr'):
        first_cell, first_span = tr.td, tr.span

        # odds row: first cell is a game-team/game-time cell and the first
        # span's leading class mentions team-rotation
        if first_cell is not None and first_span is not None and 'class' in first_cell.attrs:
            cell_classes = first_cell['class']
            is_game_cell = 'game-team' in cell_classes or 'game-time' in cell_classes
            if is_game_cell and 'team-rotation' in first_span['class'][0]:
                odds_rows.append(tr)

        # score row: the row's leading class is event-card-row
        if 'class' in tr.attrs:
            tr_classes = tr['class']
            if len(tr_classes) > 0 and tr_classes[0] == 'event-card-row':
                point_rows.append(tr)

    return odds_rows, point_rows

def parse_odds(rows):
    '''
    parse rows for all odds data

    rows come in consecutive (away, home) pairs, one pair per odds table.
    returns two records per table, each
    [type, datetime, team, 'away'|'home', opponent] followed by a
    bookmaker, line, odds triplet for every bookmaker in the row
    '''
    # raw strings: same patterns as before, without invalid string escapes
    sportsbook_re = re.compile(r'(?<=k\.)\w+(?=\.)|(?<=s\/)\w+')
    token_re = re.compile(r'(?<=\s)\S+(?=\s)')

    def book_triplets(row, is_moneyline):
        # flat [bookmaker, line, odds, ...] list for one team row
        flat = []
        for anchor in row.find_all('a'):
            if 'rel' not in anchor.attrs:  # only bookmaker links carry rel
                continue
            sportsbook = sportsbook_re.findall(anchor['href'])[0]
            line = token_re.findall(anchor.span.text)[0]
            if is_moneyline:
                odds = line  # moneyline: one number is both line and odds
            else:
                odds = token_re.findall(anchor.small.text)[0]
            flat.extend([sportsbook, line, odds])
        return flat

    data = []
    for pair_index in range(len(rows) // 2):
        away_row = rows[pair_index * 2]
        home_row = rows[pair_index * 2 + 1]

        market_html = away_row.find_all('td')[1].get('data-content')
        bet_type = BeautifulSoup(market_html, 'html.parser').find(
            'div', {'data-role': 'chassis'}).get('data-market')  # get bet type
        # use the table's own label instead of assuming every third table is
        # the moneyline one; that assumption breaks when a game lacks a table
        is_moneyline = bet_type == 'moneyline'

        away_links = away_row.find_all('a')
        long_date = away_links[0]['data-value']  # date
        away_team = away_links[1]['aria-label']  # team name
        home_team = home_row.find_all('a')[0]['aria-label']

        data.append([bet_type, long_date, away_team, 'away', home_team]
                    + book_triplets(away_row, is_moneyline))
        data.append([bet_type, long_date, home_team, 'home', away_team]
                    + book_triplets(home_row, is_moneyline))

    return data

def parse_points(rows):
    '''
    parse rows for points data
    calculate spread, total, and which team won

    rows come in consecutive (away, home) pairs, one pair per game.
    returns [type, team, value] records: the team's margin for 'spread',
    combined points for 'total' and 1/0 for 'moneyline'.
    pairs where either team has no win/loss score cell are skipped so one
    unscored game does not abort the whole day
    '''
    score_selector = '.event-card-score.loss, .event-card-score.win'

    def final_score(row):
        # int score, or None when the row has no win/loss score cell
        cells = row.select(score_selector)
        if not cells:
            return None
        return int(cells[0].text.strip())

    data = []
    for pair_index in range(len(rows) // 2):
        first_team = rows[pair_index * 2]  # away team
        second_team = rows[pair_index * 2 + 1]  # home team

        first_team_name = first_team.find_all('a')[0]['aria-label']
        second_team_name = second_team.find_all('a')[0]['aria-label']
        first_team_points = final_score(first_team)
        second_team_points = final_score(second_team)
        if first_team_points is None or second_team_points is None:
            continue

        margin = first_team_points - second_team_points
        total = first_team_points + second_team_points
        # a tie is credited to the second (home) team, as before
        first_team_won = 1 if first_team_points > second_team_points else 0

        data.append(['spread', first_team_name, margin])
        data.append(['spread', second_team_name, -margin])
        data.append(['total', first_team_name, total])
        data.append(['total', second_team_name, total])
        data.append(['moneyline', first_team_name, first_team_won])
        data.append(['moneyline', second_team_name, 1 - first_team_won])

    return data

def _grade_bet(bet_type, line, value):
    '''
    return 1 for a winning bet, 0 for a losing one, None for a push
    '''
    if bet_type == 'moneyline':
        return int(value)
    if bet_type == 'spread':
        edge = float(line) + value
    elif bet_type == 'total':
        side = line[0]
        if side == 'o':  # indicates over
            edge = value - float(line[1:])
        elif side == 'u':  # indicates under
            edge = float(line[1:]) - value
        else:
            # previously nothing was recorded here, which shifted every later
            # grade onto the wrong row
            raise ValueError(f'unrecognised total line: {line!r}')
    else:
        raise ValueError(f'unrecognised bet type: {bet_type!r}')
    if edge > 0:
        return 1
    if edge < 0:
        return 0
    return None


def _payout_multiplier(odds):
    '''
    convert American odds into the total return per unit staked
    '''
    if odds < 0:
        return (abs(odds) + 100) / abs(odds)
    if odds > 0:
        return odds / 100 + 1
    return 0.0


def create_results(odds, points):
    '''
    convert odds and points into dataframes
    combine these dataframes to obtain data on if bets hit or not and payout
    add additional columns for info

    odds: records of [type, datetime, team, team_home_away, opponent]
          followed by any number of bookmaker, line, odds triplets
    points: records of [type, team, calculated_val]
    returns one row per (type, team, bookmaker)
    '''
    base_columns = ['type', 'datetime', 'team', 'team_home_away', 'opponent']

    # one row per bookmaker triplet; rows may list different numbers of
    # bookmakers, so nothing depends on a fixed count of nine any more
    long_records = [
        list(record[:len(base_columns)]) + list(record[start:start + 3])
        for record in odds
        for start in range(len(base_columns), len(record), 3)
    ]
    odds_df = pd.DataFrame(long_records, columns=base_columns + ['bookmaker', 'line', 'odds'])
    points_df = pd.DataFrame(data=points, columns=['type', 'team', 'calculated_val'])

    results = odds_df.merge(points_df, on=['type', 'team'])
    # .copy() so the assignments below write to an independent frame
    results = results[results['bookmaker'].notnull()].copy()
    results['line'] = ['+0' if line == 'PK' else line for line in results['line']]

    grades = [
        _grade_bet(bet_type, line, value)
        for bet_type, line, value in zip(
            results['type'], results['line'], results['calculated_val'])
    ]
    # a push is recorded as 0, matching the previous behaviour
    results['hit'] = [0 if grade is None else grade for grade in grades]
    results['line'] = ['-100' if line == 'even' else line for line in results['line']]
    results['odds'] = [-100 if value == 'even' else int(value) for value in results['odds']]
    results['over_under'] = [
        line[0] if line[0] in ('o', 'u') else None for line in results['line']]
    results['line_float'] = [
        float(line[1:]) if line[0] in ('o', 'u') else float(line)
        for line in results['line']]
    results['payout_multiplier'] = [
        _payout_multiplier(value) if hit else 0.0
        for value, hit in zip(results['odds'], results['hit'])]
    results['favorite'] = [
        1 if line[0] == '-' else 0 if line[0] == '+' else None
        for line in results['line']]
    return results


# finished_page = init_page(url)
# odds_rows, point_rows = isolate_rows(finished_page)
# odds = parse_odds(odds_rows)
# points = parse_points(point_rows)
# results_df = create_results(odds, points)
#finished_page.close()

CPU times: user 27 µs, sys: 323 µs, total: 350 µs
Wall time: 1.12 ms


In [765]:
def extract_day(date):
    '''
    scrape and grade every bet for one day

    date: 'YYYY-MM-DD' string inserted into the scoresandodds.com URL.
    returns the results dataframe, or the status string from init_page when
    the page could not be prepared
    '''
    url = f'https://www.scoresandodds.com/nba?date={date}'
    finished_page = init_page(url)
    if isinstance(finished_page, str):
        return finished_page
    try:
        odds_rows, point_rows = isolate_rows(finished_page)
    finally:
        # the parsed rows no longer need the live page; quit() also ends the
        # chromedriver process, and runs even if row extraction fails
        finished_page.quit()
    odds = parse_odds(odds_rows)
    points = parse_points(point_rows)
    return create_results(odds, points)

In [856]:
season_starts = datetime.strptime("2022-10-18", "%Y-%m-%d")
season_ends = datetime.strptime("2023-04-09", "%Y-%m-%d")
playoff_begins = datetime.strptime("2023-04-11", "%Y-%m-%d")
playoff_ends = datetime.strptime("2023-06-12", "%Y-%m-%d")

# `begin` and `end` were never assigned in the notebook, so this cell raised
# NameError on a fresh kernel. The start is pinned by the recorded run
# (dates[50] == '2022-12-07'); the end is assumed to be the last playoff day.
# NOTE(review): confirm the intended end date.
begin, end = season_starts, playoff_ends
all_star_day = '2023-02-19'  # allstar game day

date_generated = pd.date_range(begin, end)
# Filter instead of list.remove so a range without that day does not raise.
dates = [day for day in date_generated.strftime("%Y-%m-%d") if day != all_star_day]

In [863]:
# NOTE(review): `still_fails` is not assigned in any cell visible here; define it
# before this cell so the notebook survives Restart & Run All.
still_fails

['2022-12-25', '2023-02-01', '2023-02-19', '2023-03-06', '2023-03-07']

In [873]:
# Debug one of the failing days step by step instead of calling scrape_odds().
scraper = OddsScraper('https://www.scoresandodds.com/nba?date=2023-03-06')
try:
    scraper.init_page()
    scraper.navigate_to_menu()
    test_odds, test_points = scraper.isolate_rows()
finally:
    # The rows are already parsed, so the headless browser can be released
    # even when one of the steps above raised.
    if scraper.driver is not None:
        scraper.driver.quit()

In [850]:
%%time
# Smoke-test a slice of days and report which ones scrape cleanly.
run_this = dates[50:55]
for position, day in enumerate(run_this, start=1):
    url = f'https://www.scoresandodds.com/nba?date={day}'
    scraper = OddsScraper(url)
    try:
        scraper.scrape_odds()
        print(f'{position}) {day} succeeded.')
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still stops the
        # run, and show why the day failed instead of hiding it.
        print(f'{position}) {day} failed: {type(exc).__name__}: {exc}')
    finally:
        # Release the browser for this day if it is still open.
        if scraper.driver is not None:
            scraper.driver.quit()

1) 2022-12-07 succeeded.
2) 2022-12-08 succeeded.
3) 2022-12-09 succeeded.
4) 2022-12-10 succeeded.
5) 2022-12-11 succeeded.


In [852]:
# NOTE(review): `still_fails` is not assigned in any cell visible here; define it
# before this cell so the notebook survives Restart & Run All.
still_fails

['2022-12-25', '2023-02-01', '2023-02-19', '2023-03-06', '2023-03-07']

In [874]:
# Show the score rows captured by the step-by-step debugging cell.
# NOTE(review): this prints every row's full HTML; a slice would be easier to read.
test_points

[<tr class="event-card-row" data-side="away"> <td> <div> <span class="team-rotation"> 521 </span> <span class="team-nameplate"> <span class="team-logo"> <img data-role="imagable" data-src="https://rical-images.s3.amazonaws.com/team-logos/nba/BOS.png" src="https://rical-images.s3.amazonaws.com/team-logos/nba/BOS.png"/> </span> <span class="team-name"> <a aria-label="Celtics" data-abbr="Celtics" href="/nba/teams/celtics"> <span>Celtics</span> </a> <span class="team-record" data-abbr="521"> <span>45-21</span> </span> </span> </span> </div> </td> <td class="event-card-score loss"> 114 </td> <td class="event-card-movements tablet"> <div data-action="toggle" data-content="#game-detail--19140" data-group="game-drawer--19140" data-open="" data-role="openable" data-tab="#line-movements-tab--19140"> <span class="data-value"> u218 </span> <small class="data-odds"> -110 </small> </div> <div data-action="toggle" data-content="#game-detail--19140" data-group="game-drawer--19140" data-open="" data-ro