### Modules

In [78]:
import re
from datetime import datetime, timedelta

import pandas as pd

##  Selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Selenium Driver Setup
Selenium drives a real browser through a WebDriver executable, so download the WebDriver that matches your browser (and its version) before running this notebook.

In this notebook, the Microsoft Edge WebDriver is used. The repo also contains this driver.

In [79]:
##  Change to WebDriver path
webdriver_path = 'webdriver/msedgedriver.exe'

##  The Edge driver must be started through Edge's own Service class;
##  the Service imported from selenium.webdriver.chrome.service is meant for chromedriver.
driver = webdriver.Edge(service = EdgeService(webdriver_path))

### Functions

#### Helper functions

In [80]:
##  Determine if string contains numbers
def num_there(s):
    """Return True when at least one character of ``s`` is a digit, else False."""
    for char in s:
        if char.isdigit():
            return True
    return False

## Determine if string contains text
def text_there(s):
    """Return True when at least one character of ``s`` is alphabetic, else False."""
    for char in s:
        if char.isalpha():
            return True
    return False

#### Function for webscraping

In [81]:
##  Settings shared by get_odds and its private helpers
_PAGE_WAIT_SECONDS = 60        ## Explicit-wait timeout for page elements
_IMPLICIT_WAIT_SECONDS = 10    ## Implicit wait applied before each game page is opened
_MAX_GAME_MARKETS = 5          ## Only the first few market groups of a game page are parsed
_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M'
_WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
_WIN_ONLY_COLUMNS = ['Market', 'Event/Person/Team', 'Odds']
_GAME_COLUMNS = ['Game', 'Datetime', 'Market', 'Event/Person/Team', 'Odds']


def _wait_for_class(class_name):
    """Wait until an element with ``class_name`` is present; raises TimeoutException otherwise."""
    WebDriverWait(driver, _PAGE_WAIT_SECONDS).until(
        EC.presence_of_element_located((By.CLASS_NAME, class_name)))


def _update_timestamp():
    """Return the current Manila time as a 'YYYY-MM-DD HH:MM' string.

    The time is requested directly in the Asia/Manila zone, so the result does not
    depend on whether ``pd.to_datetime('now')`` yields UTC or local time in the
    installed pandas version.
    """
    return pd.Timestamp.now(tz = 'Asia/Manila').strftime(_TIMESTAMP_FORMAT)


def _parse_win_only_market(market_text):
    """Turn the text of one win-only market into (market, event, odds) rows.

    After removing the 'Order' / 'View Less' labels and blank lines, line 0 is used
    as the market title, line 1 is skipped, and the remaining lines are read as
    alternating event / odds entries. An event without a matching odds line is dropped.
    """
    cleaned = re.sub('Order|View Less', '', market_text)
    lines = [line for line in cleaned.split('\n') if line]
    events = lines[2::2]
    odds = lines[3::2]
    markets = [lines[0]] * len(events) if events else []
    return list(zip(markets, events, odds))


def _parse_game_start(start_text, today = None):
    """Convert a relative start description into a 'YYYY-MM-DD HH:MM' string.

    ``start_text`` is presumably of the form 'Starts - <day> - HH:MM' (TODO confirm
    against the site). <day> may be 'Today', 'Tomorrow' or an English weekday name;
    a weekday resolves to its next occurrence, counting today itself.

    Raises:
        ValueError: if the day label or the time cannot be interpreted. The original
        weekday search looped forever on an unknown label.
    """
    if today is None:
        today = datetime.today()

    parts = re.sub('Starts - | -', '', start_text).split(' ')
    if len(parts) < 2:
        raise ValueError(f'Unrecognised start time: {start_text!r}')
    day_label, clock_text = parts[0], parts[1]

    if day_label == 'Today':
        day_offset = 0
    elif day_label == 'Tomorrow':
        day_offset = 1
    elif day_label in _WEEKDAY_NAMES:
        day_offset = (_WEEKDAY_NAMES.index(day_label) - today.weekday()) % 7
    else:
        raise ValueError(f'Unrecognised day in start time: {start_text!r}')

    game_date = (today + timedelta(days = day_offset)).date()
    game_clock = datetime.strptime(clock_text, '%H:%M').time()
    return datetime.combine(game_date, game_clock).strftime(_TIMESTAMP_FORMAT)


def _parse_game_market(market_text):
    """Split the text of one game market into parallel title / event / odds lists.

    Line 0 (after dropping 'HOME AWAY' and blank lines) is the market title. The
    remaining lines are classified with text_there / num_there exactly as before.
    """
    lines = [line for line in re.sub('HOME AWAY', '', market_text).splitlines() if line]
    titles, events, odds = [], [], []
    if not lines:
        return titles, events, odds

    market_title = lines[0]
    last_index = len(lines) - 1

    for i in range(1, len(lines)):
        line = lines[i]
        has_text = text_there(line)

        ## For Points-Match market, code is slightly different
        if market_title == 'Points - Match':
            if has_text:
                ## NOTE(review): the i-1 / i-3 offsets assume a fixed line layout and wrap
                ## around to the end of the list when i < 3 - confirm against the live page.
                if 'Over' in line:
                    events.append(lines[i - 1] + ' ' + line)
                    titles.append(market_title)
                elif 'Under' in line:
                    events.append(lines[i - 3] + ' ' + line)
                    titles.append(market_title)

            ## Every purely numeric line is recorded as odds
            ## NOTE(review): this may also pick up the points line itself - verify alignment.
            if num_there(line) and not has_text:
                odds.append(line)

        ## Other markets follow roughly the same format
        else:
            if has_text:
                ## A text line followed two lines later by a non-text line is joined
                ## with the line right after it to form the event label
                if i + 2 < len(lines) and not text_there(lines[i + 2]):
                    events.append(line + ' ' + lines[i + 1])
                else:
                    events.append(line)
                titles.append(market_title)

            ## A line with digits counts as odds when it is last or is followed by text
            if num_there(line) and (i == last_index or text_there(lines[i + 1])):
                odds.append(line)

    return titles, events, odds


def _scrape_game_page(link):
    """Open one game page and return its markets as a dataframe with _GAME_COLUMNS."""
    driver.implicitly_wait(_IMPLICIT_WAIT_SECONDS)
    driver.get(link)

    ## Wait for elements to load
    game_market_class = 'markets-group-component'  ## Contains all market data
    _wait_for_class(game_market_class)

    ## Get game info, if game is live
    ## Note: Info about actual start time is not in website; so current time is used as proxy
    if 'live' in link:
        _wait_for_class('live-event')
        game_title = driver.find_element(By.CLASS_NAME, 'live-event').text
        ## Same format as scheduled games so the 'Datetime' column stays uniform
        game_time = datetime.now().strftime(_TIMESTAMP_FORMAT)

    ## Get game info if game is NOT live
    else:
        info_lines = driver.find_element(By.CLASS_NAME, 'event-header-description').text.splitlines()
        game_title = info_lines[0] if info_lines else ''
        start_text = info_lines[1] if len(info_lines) > 1 else ''
        try:
            game_time = _parse_game_start(start_text)
        except ValueError:
            ## Best effort: keep the raw text instead of aborting the whole scrape
            game_time = start_text

    ## Scrape Match markets in the page
    market_titles, events, odds = [], [], []
    for game_market in driver.find_elements(By.CLASS_NAME, game_market_class)[:_MAX_GAME_MARKETS]:
        titles_part, events_part, odds_part = _parse_game_market(game_market.text)
        market_titles += titles_part
        events += events_part
        odds += odds_part

    ## zip stops at the shortest list, so unmatched trailing entries are dropped
    rows = [(game_title, game_time, market, event, price)
            for market, event, price in zip(market_titles, events, odds)]
    return pd.DataFrame(rows, columns = _GAME_COLUMNS)


def _scrape_win_only():
    """Scrape win-only markets from the current page; returns a dataframe or a status string."""
    win_only_class = 'rank-event-section.event_path-content.event-container'
    view_all_class = 'show-all-rank-events.rank-event-market.event.flex-space-between'
    try:
        ##  Wait for elements to load, if any
        _wait_for_class(win_only_class)

        ## Click all 'View all' buttons
        for button in driver.find_elements(By.CLASS_NAME, view_all_class):
            button.click()

        ##  Scrape and clean data
        rows = []
        for market in driver.find_elements(By.CLASS_NAME, win_only_class):
            rows.extend(_parse_win_only_market(market.text))
    except (TimeoutException, NoSuchElementException):
        return 'No win-only events found'

    win_only_df = pd.DataFrame(rows, columns = _WIN_ONLY_COLUMNS)
    win_only_df['Update timestamp'] = _update_timestamp()
    return win_only_df


def _scrape_games():
    """Scrape every game linked from the current page; returns a dataframe or a status string."""
    game_class = 'event_path-content.asian-event-path-component'
    try:
        _wait_for_class(game_class)

        ## Collect links to the separate game pages
        ## Note: WSM website does not accept Ctrl+Click or Shift+Click to open new tabs
        game_links = [elem.get_attribute('href')
                      for elem in driver.find_elements(By.CLASS_NAME, 'opponent-name.bold')]
        if not game_links:
            game_links = [elem.get_attribute('href')
                          for elem in driver.find_elements(By.CLASS_NAME, 'opponent-name')]

        ## Collect per-game frames and concatenate once instead of growing a frame in the loop
        game_frames = [_scrape_game_page(link) for link in game_links]
    except (TimeoutException, NoSuchElementException):
        return 'No games found'

    if game_frames:
        all_game_df = pd.concat(game_frames, ignore_index = True)
    else:
        all_game_df = pd.DataFrame(columns = _GAME_COLUMNS)
    all_game_df['Update timestamp'] = _update_timestamp()
    return all_game_df


def get_odds(url, game_bool, win_only_bool):
    """
    Scrape betting odds from one page using the notebook-level Selenium ``driver``.

    Inputs:
    ----------
    url: String containing URL to be webscraped
    game_bool: Boolean indicating if specific games should be webscraped
    (True if yes, False if not)
    win_only_bool: Boolean indicating if win-only odds should be webscraped
    (True if yes, False if not)

    Returns:
    ----------
    (win_only_df, all_game_df): each item is a pandas DataFrame with an
    'Update timestamp' column (Manila time), or a string when there is no data:
    'Data not scraped' if the matching flag is False, and
    'No win-only events found' / 'No games found' if a Selenium
    TimeoutException or NoSuchElementException occurred while scraping.

    Notes:
    ----------
    A failure in the game section is now reported in all_game_df. It was
    previously written to win_only_df, which overwrote the win-only result and
    could leave all_game_df undefined.
    Live games now use the same 'YYYY-MM-DD HH:MM' format as scheduled games.
    """

    driver.get(url)

    ####################################################
    ##  Get win-only bets
    ####################################################
    if win_only_bool:
        win_only_df = _scrape_win_only()
    else:
        win_only_df = 'Data not scraped'

    ####################################################
    ##  Get specific game bets
    ####################################################
    if game_bool:
        all_game_df = _scrape_games()
    else:
        all_game_df = 'Data not scraped'

    return win_only_df, all_game_df

### Webscraping

In [82]:
## Pages to scrape, one URL per entry
msw_url_lst = [
    'https://sports.msw.ph/en/sports/227-basketball/75477-philippines',
]

In [83]:
## Switches for the two kinds of markets
scrape_win_only = True   ## Win-only: markets that do not have separate pages
scrape_game = True       ## Game: markets with separate pages

## Start every enabled collection from an empty dataframe
if scrape_game:
    game_df = pd.DataFrame()
if scrape_win_only:
    win_only_df = pd.DataFrame()

In [84]:
## NOTE(review): the results of this loop are overwritten on every iteration and never stored;
## it looks like a trial run of the scraping cell that follows - confirm whether it is still needed.
for url in msw_url_lst:
    ## Flags are passed by keyword because get_odds takes game_bool before win_only_bool
    win, game = get_odds(url, game_bool = scrape_game, win_only_bool = scrape_win_only)

In [85]:
## Webscrape URLs in list
for url in msw_url_lst:
    ## Flags are passed by keyword because get_odds takes game_bool before win_only_bool;
    ## passing them positionally in the other order would swap the two switches
    win, game = get_odds(url, game_bool = scrape_game, win_only_bool = scrape_win_only)

    ## get_odds returns a status string instead of a dataframe when nothing was scraped
    if scrape_win_only and isinstance(win, pd.DataFrame):
        win_only_df = pd.concat([win_only_df, win])

    if scrape_game and isinstance(game, pd.DataFrame):
        game_df = pd.concat([game_df, game])

### Saving data
Dataframes can either be saved as new files, or be appended to an already existing file.  

Make sure to comment out the lines you are not using, especially when running this notebook at regular intervals.

#### Save as new file/s

In [86]:
## Change paths as needed
#win_only_path = 'data/pba_wo_mid2022.csv'
#game_path = 'data/pba_game_mid2022.csv'

## Save as CSVs
#win_only_df.to_csv(f'{win_only_path}', index = False)
#game_df.to_csv(f'{game_path}', index = False)

#### Append to existing files

In [88]:
## Change paths as needed
# win_only_hist_path = 'data/pba_wo_mid2022.csv'
game_hist_path = 'data/pba_game_mid2022.csv'

## Win-only history: read the stored CSV, add the new rows, write it back
# win_only_hist = pd.concat([pd.read_csv(win_only_hist_path, index_col = None), win_only_df])
# win_only_hist.to_csv(win_only_hist_path, index = False)

## Game history: read the stored CSV, add the new rows, write it back
game_hist = pd.concat([pd.read_csv(game_hist_path, index_col = None), game_df])
game_hist.to_csv(game_hist_path, index = False)

In [92]:
## Scratch check: display the current local time formatted as YYYY/MM/DD HH:MM
datetime.now().strftime('%Y/%m/%d %H:%M')

'2022/06/19 18:52'