# Necessary Import Statements

In [6]:
import numpy as np

from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

import os

from time import sleep

# Scrape Game Results Data

## Define function to get data

In [2]:
# Instantiate the webdriver: this starts a Chrome session that is controlled through the chromedriver
# binary at the given path. The path is specific to this machine and must be adjusted elsewhere.
driver = webdriver.Chrome('/Users/sebas12/Downloads/chromedriver') # path of the Chrome webdriver we are using.

In [3]:
import re
def sorted_nicely(l):
    """Return the strings in ``l`` in natural ("human") order.

    Runs of ASCII digits inside each string are compared as integers rather
    than character by character, so 'file2' sorts before 'file10'.

    Required arguments:
    l -- The iterable of strings to be sorted.

    Returns a new list; ``l`` itself is left untouched.
    """
    def natural_key(item):
        # Splitting on a *captured* digit group keeps the digit runs in the
        # result, so the pieces alternate: text, digits, text, ...
        pieces = re.split('([0-9]+)', item)
        return [int(piece) if piece.isdigit() else piece for piece in pieces]

    return sorted(l, key=natural_key)

In [4]:
def _download_current_month_sheet(driver_object):
    """Click the spreadsheet-export button on the schedule page currently loaded in ``driver_object``."""
    def hover_and_click():
        # The export entry lives in a hover menu: hover over the last 'hasmore' element to reveal it,
        # then click the third 'tooltip' button.
        # NOTE(review): index 2 is position-dependent; it breaks if the page layout changes.
        menus = driver_object.find_elements_by_class_name('hasmore')
        ActionChains(driver_object).move_to_element(menus[-1]).perform()
        driver_object.find_elements_by_class_name('tooltip')[2].click()

    try:
        hover_and_click()
    except Exception:
        # implicitly_wait() does not pause by itself; it only makes later element lookups poll for up
        # to the given time. The elements are therefore looked up again before the single retry.
        driver_object.implicitly_wait(10) # seconds
        hover_and_click()


def game_results_downloader(driver_object, current_year , all_months = True , *args , **kwargs):
    """
    Purpose: This function will download all of the spreadsheets associated with each month (either all or 
             certain ones specified by the user) of a given NBA season. After this, it will move each file to
             the correct folder location allowing for clear organization that will help when data is used in 
             the future.
    
    Details: This function can only handle calls for a single season at a time.
            
             The final destination of the downloaded files is /Users/sebas12/Documents/Python/Sports_betting/
             Data/Game_results/current_year, where each file is named {month}_{current_year}.xls. The folder
             is created if it does not exist yet.
             
             Downloads are picked up from /Users/sebas12/Downloads. Only files named sportsref_download*.xls
             are considered, and they are matched to months in download order, so leftovers from an earlier
             run must be removed from that folder first.
    
    Arguments:
        driver_object - Selenium driver class/object which is instantiated before the function is called via 
                        a line of code such as: driver = webdriver.Chrome('/{path to driver}/'). This is what 
                        will be used to navigate to the necessary webpages where the data to be scraped lives.
        current_year - str object that is of the form i.e., '2018-19' which describes the year that you wish 
                       the function to get data from. Only the last two digits are used to build the URL 
                       ('NBA_20{}'), so seasons ending before 2000 are not supported.
        all_months - boolean object whose default value is set to True. When set to True, the function will 
                     download ALL of the spreadsheets for each month of the season (October through April). 
                     If not, it turns to the keyword arguments that you pass into the function and only 
                     downloads the spreadsheets specified by the months given in the keyword arguments.
        *args - any positional arguments that may be needed in future implementations of this function. 
                Current version does NOT support the use of them.
        **kwargs - arguments passed in to the function only when all_months is set to False. The function is
                   set up so that the only form that it will accept is something like: month_one = 'October',
                   month_two = 'November', month_three = 'March'. The keyword names themselves (i.e., 
                   month_one) do not matter, only their values and their order: each value has to be the 
                   month spelled out entirely and correctly. The value is lower-cased when building the URL 
                   and used exactly as given in the output file name.
    
    Returns:
        str - the message 'Process complete for {current_year} NBA season.'
    
    Raises:
        ValueError - if all_months is False and no months were passed as keyword arguments.
        RuntimeError - if the number of downloaded spreadsheets found does not match the number of months 
                       requested (moving the files anyway would label them with the wrong month).
    
    Useful Resources:
        1. https://www.basketball-reference.com/
        2. Selenium Documentation.
    """
    downloads_dir = '/Users/sebas12/Downloads'
    season_suffix = current_year[5:]   # e.g. '19' for '2018-19'
    
    if not all_months and not kwargs:
        raise ValueError('all_months is False but no months were given as keyword arguments.')
    
    ### First, we download the spreadsheets
    # navigate to the season's schedule page.
    initial_html_link = 'https://www.basketball-reference.com/leagues/NBA_20{}_games.html'.format(
                                                        season_suffix   )
    driver_object.get(initial_html_link)
    
    if all_months:
        # instantiate a list of the months:
        months_list = ['October' , 'November' , 'December' , 'January' , 'February' , 'March' , 'April']
        
        # the page we just loaded is treated as the first month, so download it straight away.
        _download_current_month_sheet(driver_object)
        
        # do the same for the rest of the months iteratively, following each month's link.
        for month in months_list[1:]:
            driver_object.find_element_by_link_text('{}'.format(month)).click()
            _download_current_month_sheet(driver_object)
    
    else:
        # instantiate a list of the months:
        months_list = list(kwargs.values())
        
        # get data
        for month in months_list:
            # navigate to this month's webpage.
            # NOTE(review): the recorded 2011-12 call passes lowercase month names, so the URL slug is
            # presumably lowercase; lower-casing keeps that call unchanged and lets capitalized names
            # (as this docstring used to require) resolve to the same URL.
            html_link = 'https://www.basketball-reference.com/leagues/NBA_20{}_games-{}.html'.format(
                                                        season_suffix ,  month.lower() )
            driver_object.get(html_link)
            _download_current_month_sheet(driver_object)
    
    
    ### Second, we rename and move the files for easier use in the future.
    # make script wait for a few seconds while all of the downloaded files load in.
    sleep(10) # the script will wait for 10 seconds before doing anything.
    
    # for sorting purposes, rename the first download file so that it carries a number like the others.
    os.rename(os.path.join(downloads_dir , 'sportsref_download.xls') , 
              os.path.join(downloads_dir , 'sportsref_download (0).xls'))
    
    # get the downloaded files in download order; unrelated .xls files in the folder are ignored.
    xls_files = sorted_nicely(
        name for name in os.listdir(downloads_dir)
        if name.startswith('sportsref_download') and name.endswith('.xls')
    )
    
    # zip() would silently truncate on a mismatch and attach the wrong month to a file.
    if len(xls_files) != len(months_list):
        raise RuntimeError(
            'Expected {} downloaded spreadsheets but found {} in {}; remove leftover '
            'sportsref_download*.xls files (or wait for pending downloads) and retry.'.format(
                len(months_list) , len(xls_files) , downloads_dir))
    
    # rename and move the files.
    data_path = '/Users/sebas12/Documents/Python/Sports_betting/Data/Game_results/{}'.format(current_year)
    os.makedirs(data_path , exist_ok = True)   # a re-run for the same season must not fail here.
    for file , month in zip(xls_files , months_list):
        os.rename(os.path.join(downloads_dir , file) , 
                  '{}/{}_{}.xls'.format(data_path , month , current_year))
        
    return 'Process complete for {} NBA season.'.format(current_year)

## Execute Function

In [6]:
game_results_downloader(driver , '2018-19')

'Process complete for 2018-19 NBA season.'

In [7]:
game_results_downloader(driver , '2017-18')

'Process complete for 2017-18 NBA season.'

In [14]:
game_results_downloader(driver , '2016-17')

'Process complete for 2016-17 NBA season.'

In [15]:
game_results_downloader(driver , '2015-16')

'Process complete for 2015-16 NBA season.'

In [16]:
game_results_downloader(driver , '2014-15')

'Process complete for 2014-15 NBA season.'

In [19]:
game_results_downloader(driver , '2013-14')

'Process complete for 2013-14 NBA season.'

In [20]:
game_results_downloader(driver , '2012-13')

'Process complete for 2012-13 NBA season.'

In [42]:
game_results_downloader(driver , '2011-12' , all_months = False , month_one = 'december' ,
                                                                 month_two = 'january' , 
                                                                 month_three = 'february' , 
                                                                 month_four = 'march' , 
                                                                 month_five = 'april')
    # The 2011-12 season was shortened by a lockout and did not start until late December, so the
    # website has no October or November data for it.

'Process complete for 2011-12 NBA season.'

In [26]:
game_results_downloader(driver , '2010-11')

'Process complete for 2010-11 NBA season.'

In [28]:
game_results_downloader(driver , '2009-10')

'Process complete for 2009-10 NBA season.'

In [31]:
game_results_downloader(driver , '2008-09')

'Process complete for 2008-09 NBA season.'

In [32]:
game_results_downloader(driver , '2007-08')

'Process complete for 2007-08 NBA season.'

In [33]:
game_results_downloader(driver , '2006-07')

'Process complete for 2006-07 NBA season.'

In [35]:
game_results_downloader(driver , '2005-06')

'Process complete for 2005-06 NBA season.'

In [36]:
game_results_downloader(driver , '2004-05')

'Process complete for 2004-05 NBA season.'

# Get Team Data

## Define Function

In [66]:
def team_stats_downloader(driver_object, current_year , *args , **kwargs):
    """
    Purpose: Downloads the excel spreadsheet for the specified year which contains team statistics PER 100
             POSSESSIONS for all 30 teams in the NBA. After this, the file is moved to the correct folder 
             location for easy use and access later on.
    
    Details: See the README file in the folder where these files are stored for information about each column
             header.
             
             The download is picked up from /Users/sebas12/Downloads (the most recently modified .xls file
             there) and moved to /Users/sebas12/Documents/Python/Sports_betting/Data/team_data/ under the
             name team_data_{current_year}.xls. The destination folder is created if it does not exist.
    
    Arguments:
        driver_object - Selenium driver class/object which is instantiated before the function is called via 
                        a line of code such as: driver = webdriver.Chrome('/{path to driver}/'). This is what 
                        will be used to navigate to the necessary webpages where the data to be scraped 
                        lives.
        current_year - str object that is of the form i.e., '2018-19' which describes the year that you wish 
                       the function to get data from. Only the last two digits are used to build the URL 
                       ('NBA_20{}').
        *args - any positional arguments that may be needed in future implementations of this function. 
                Current version does NOT support the use of them.
        **kwargs - any additional keyword arguments that may be needed in future implementations of this 
                   function. Current version does NOT support the use of them.
    
    Returns:
        str - the message 'Process complete for {current_year} NBA season.'
    
    Raises:
        FileNotFoundError - if no .xls file is present in the downloads folder after waiting.
    
    Useful Resources:
        1. https://www.basketball-reference.com/
        2. Selenium Documentation.
    """
    downloads_dir = '/Users/sebas12/Downloads'
    data_path = '/Users/sebas12/Documents/Python/Sports_betting/Data/team_data/'
    
    ### Get the data
    # navigate to page of data.
    initial_link = 'https://www.basketball-reference.com/leagues/NBA_20{}.html'.format(current_year[5:])
    driver_object.get(initial_link)
    hundo = driver_object.find_element_by_link_text('Team Per 100 Poss Stats')
    # click through JavaScript rather than with a native WebElement.click().
    driver_object.execute_script("arguments[0].click();", hundo)
    
    # download the spreadsheet
    driver_object.implicitly_wait(7)   # let the following lookup poll for up to 7 seconds.
    buttons = driver_object.find_elements_by_class_name('tooltip')
    # NOTE(review): 53 is a position-dependent magic index, presumably the export button of the
    # per-100-possessions table; confirm it whenever the page layout changes.
    driver_object.execute_script("arguments[0].click();", buttons[53])
    
    
    ### Redirect file
    # wait for file to completely download
    sleep(10)
    
    # get the file. os.listdir() order is arbitrary, so choose the newest .xls by modification time
    # instead of whichever one happens to be listed last.
    xls_paths = [os.path.join(downloads_dir , name)
                 for name in os.listdir(downloads_dir) if name.endswith('.xls')]
    if not xls_paths:
        raise FileNotFoundError('No .xls file found in {} after the download.'.format(downloads_dir))
    newest_xls = max(xls_paths , key = os.path.getmtime)
    
    # rename and move it.
    os.makedirs(data_path , exist_ok = True)
    os.rename(newest_xls ,
              os.path.join(data_path , 'team_data_{}.xls'.format(current_year))  )
    
    return 'Process complete for {} NBA season.'.format(current_year)

## Execute Function

In [62]:
# Instantiate the webdriver: this starts a new Chrome session controlled through the chromedriver
# binary at the given (machine-specific) path and rebinds the module-level name `driver` to it.
driver = webdriver.Chrome('/Users/sebas12/Downloads/chromedriver')

In [20]:
team_stats_downloader(driver , '2018-19')

'Process complete for 2018-19 NBA season.'

In [64]:
team_stats_downloader(driver , '2017-18')

'Process complete for 2017-18 NBA season.'

In [67]:
team_stats_downloader(driver , '2016-17')

'Process complete for 2016-17 NBA season.'

In [68]:
team_stats_downloader(driver , '2015-16')

'Process complete for 2015-16 NBA season.'

In [69]:
team_stats_downloader(driver , '2014-15')

'Process complete for 2014-15 NBA season.'

In [70]:
team_stats_downloader(driver , '2013-14')

'Process complete for 2013-14 NBA season.'

In [72]:
team_stats_downloader(driver , '2012-13')

'Process complete for 2012-13 NBA season.'

In [74]:
team_stats_downloader(driver , '2011-12')

'Process complete for 2011-12 NBA season.'

In [75]:
team_stats_downloader(driver , '2010-11')

'Process complete for 2010-11 NBA season.'

In [76]:
team_stats_downloader(driver , '2009-10')

'Process complete for 2009-10 NBA season.'

In [77]:
team_stats_downloader(driver , '2008-09')

'Process complete for 2008-09 NBA season.'

In [78]:
team_stats_downloader(driver , '2007-08')

'Process complete for 2007-08 NBA season.'

In [79]:
team_stats_downloader(driver , '2006-07')

'Process complete for 2006-07 NBA season.'

In [80]:
team_stats_downloader(driver , '2005-06')

'Process complete for 2005-06 NBA season.'

In [81]:
team_stats_downloader(driver , '2004-05')

'Process complete for 2004-05 NBA season.'

# Get Player Data

In [2]:
import requests as req

In [3]:
# Example command to get the player data: fetch one game's box score from the XMLstats API.
# Despite its name, `xml_object` holds the HTTP response for a .json URL; its body is JSON.
# NOTE(review): the bearer token below is a hard-coded credential. Anyone with access to this file can
# use it; consider loading it from an environment variable or a git-ignored config file, and rotating it.
xml_object = req.get('https://erikberg.com/nba/boxscore/20120621-oklahoma-city-thunder-at-miami-heat.json' ,  
                     headers = {'Authorization' : 'Bearer 49d3aa8d-e03b-4c08-a254-b425761894a4'})
    # Notice the format of the URL to get the boxscore from the XMLstats API: 
    #    https://erikberg.com/nba/boxscore/yearmonthdate-awayteam-hometeam.json
    # This is how we will get the different boxscores for the different NBA teams in a given season.

In [4]:
# Decode the JSON body once and pull out each team's list of per-player stat dictionaries.
boxscore = xml_object.json()
home_stats = boxscore['home_stats']
away_stats = boxscore['away_stats']

In [5]:
home_stats
    # as we can see here, the data comes in the form of a list of dictionaries (one per player)!

[{'last_name': 'James',
  'first_name': 'LeBron',
  'display_name': 'LeBron James',
  'position': 'SF',
  'minutes': 44,
  'points': 26,
  'assists': 13,
  'turnovers': 6,
  'steals': 1,
  'blocks': 2,
  'rebounds': 11,
  'field_goals_attempted': 19,
  'field_goals_made': 9,
  'three_point_field_goals_attempted': 3,
  'three_point_field_goals_made': 0,
  'free_throws_attempted': 9,
  'free_throws_made': 8,
  'defensive_rebounds': 7,
  'offensive_rebounds': 4,
  'personal_fouls': 2,
  'team_abbreviation': 'MIA',
  'is_starter': True,
  'three_point_percentage': 0.0,
  'free_throw_percentage': 0.889,
  'field_goal_percentage_string': '47.4',
  'three_point_field_goal_percentage_string': '0.0',
  'free_throw_percentage_string': '88.9',
  'field_goal_percentage': 0.474},
 {'last_name': 'Wade',
  'first_name': 'Dwyane',
  'display_name': 'Dwyane Wade',
  'position': 'SG',
  'minutes': 35,
  'points': 20,
  'assists': 3,
  'turnovers': 1,
  'steals': 2,
  'blocks': 3,
  'rebounds': 8,
  'fie

In [12]:
# Some example code to get the box score into plain nested lists (turned into a numpy array in a later
# cell). Here we are throwing away some information that we don't need for the neural network that we
# will be training later on!

key_list = list(home_stats[0].keys())  # the keys are the same for all of the dictionaries!

# Throw away the information we don't need: keep key positions 4-19, 22-23 and the last key.
# For the payload shown above that is 'minutes' through 'personal_fouls', 'three_point_percentage',
# 'free_throw_percentage' and 'field_goal_percentage' (19 values per player).
# NOTE(review): this selection is positional, so it relies on the API returning the keys in this order.
iter_key_list = key_list[4:20] + key_list[22:24] + key_list[-1:]

# One row per player, in the order the API lists them; each row holds the selected stats in
# iter_key_list order. Every player gets a row regardless of which key happens to be last.
big_list_away = [[player[key] for key in iter_key_list] for player in away_stats]
big_list_home = [[player[key] for key in iter_key_list] for player in home_stats]

In [13]:
big_list_away

[[43, 32, 3, 7, 2, 1, 11, 24, 13, 6, 3, 3, 3, 9, 2, 5, 0.5, 1.0, 0.542],
 [43, 19, 6, 2, 2, 0, 4, 20, 4, 5, 0, 13, 11, 4, 0, 3, 0.0, 0.846, 0.2],
 [26, 9, 0, 0, 0, 2, 4, 9, 3, 0, 0, 4, 3, 0, 4, 3, 0.0, 0.75, 0.333],
 [20, 2, 0, 1, 1, 0, 4, 4, 1, 0, 0, 0, 0, 4, 0, 5, 0.0, 0.0, 0.25],
 [9, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 2, 0.0, 0.0, 0.0],
 [36, 19, 5, 3, 2, 0, 4, 11, 5, 8, 3, 6, 6, 3, 1, 2, 0.375, 1.0, 0.455],
 [29, 11, 3, 0, 0, 0, 4, 7, 4, 6, 3, 0, 0, 3, 1, 4, 0.5, 0.0, 0.571],
 [17, 2, 1, 0, 0, 0, 4, 3, 1, 0, 0, 0, 0, 2, 2, 3, 0.0, 0.0, 0.333],
 [5, 2, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0.0, 0.0, 0.5],
 [5, 2, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0.0, 0.0, 1.0],
 [5, 2, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 2, 0, 1, 0.0, 0.0, 0.5],
 [3, 6, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1.0, 0.0, 1.0]]

In [14]:
# Combine both teams' stat rows into a single array indexed as (team, player, stat).
# NOTE(review): this only yields a clean 3-D array when both teams list the same number of players;
# with different roster sizes the nested lists are ragged. Pad or handle that case before relying on it.
boxscore_array = np.array([big_list_home , big_list_away])
    # note that we are making a 3D numpy array with the convention that the first 2D array is the home team
    # and the second one is the away team.
boxscore_array.shape
    # just to make sure everything is okay.

(2, 12, 19)

In [15]:
# The numeric rows drop the player identity, so keep a lookup from player to row index for each team.
# That way an index can always be traced back to the player it belongs to.
# indicies_list ends up as a list of two dictionaries; again, the convention is that the first
# dictionary is the home team and the second one is the away team.
indicies_list = []

for team_stats in (home_stats , away_stats):
    # Key format is '<first initial>.<last name>'; the value is the player's position in the team list.
    player_indices = {
        '{}.{}'.format( player['first_name'][0] , player['last_name'] ): row
        for row , player in enumerate(team_stats)
    }
    indicies_list.append(player_indices)

print(indicies_list)

[{'L.James': 0, 'D.Wade': 1, 'C.Bosh': 2, 'M.Chalmers': 3, 'S.Battier': 4, 'M.Miller': 5, 'N.Cole': 6, 'U.Haslem': 7, 'J.Jones': 8, 'J.Howard': 9, 'R.Turiaf': 10, 'T.Harris': 11}, {'K.Durant': 0, 'R.Westbrook': 1, 'S.Ibaka': 2, 'K.Perkins': 3, 'T.Sefolosha': 4, 'J.Harden': 5, 'D.Fisher': 6, 'N.Collison': 7, 'D.Cook': 8, 'C.Aldrich': 9, 'L.Hayward': 10, 'R.Ivey': 11}]


# Get Odds Data