In [173]:
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import requests
import numpy as np

class Player:
    """Container pairing a player's display name with accumulated stats.

    Attributes are plain and public:
      name -- the name string stored for the player (extract_player_data
              passes the 'csk' attribute of the player's header cell).
      data -- the player's stat totals (extract_player_data passes a numpy
              array and replaces it with a new summed array per game).
    """

    def __init__(self, name, data):
        self.name, self.data = name, data

In [211]:
# data operations on collected data

def run(date_end_str, boxscores, data_players, simulation=False):
    """Run the prediction step over data that has already been collected.

    The boxscores and player data are expected to be built beforehand
    (with get_boxscores and get_player_data) and passed in, so this
    function performs no collection of its own.

    Args:
        date_end_str: end date of the sample as a string. Currently unused
            here; kept so existing callers keep working.
        boxscores: the collected boxscores, forwarded unchanged to
            predict_spreads_games.
        data_players: the per-player data, forwarded unchanged to
            predict_spreads_games.
        simulation: flag forwarded unchanged to predict_spreads_games.

    Returns:
        None.
    """
    # predict spread for all boxscores
    predict_spreads_games(boxscores, data_players, simulation)
    
    

def get_player_data(boxscores):
    """Build the per-player data container from a list of boxscores.

    Each element of `boxscores` is handed to extract_player_data together
    with one shared container (passed as `dataset`).

    Returns:
        The defaultdict(Player) created here. Note that Player requires
        `name` and `data` arguments, so looking up a missing key through
        the default factory raises TypeError rather than creating an entry.
    """
    # one shared container for every game in the sample
    players = defaultdict(Player)

    for game_boxes in boxscores:
        extract_player_data(game_boxes, dataset=players)

    print ("player calculations complete")
    return players


# queries for basketball-reference


def get_boxscores(date_end_str, days_samplesize, year_season='2017'):
    """Fetch the boxscores of every game played inside a date window.

    Every month page of the season is downloaded to list its games, and a
    boxscore is then fetched only for games whose date falls in the window.

    Args:
        date_end_str: end date of the sample, in a form np.datetime64
            accepts (e.g. '2017-04-01').
        days_samplesize: number of days in the window.
        year_season: season identifier used in the schedule URL by
            get_list_months. Defaults to '2017', the value that used to be
            hard-coded.

    Returns:
        A list with one get_boxscore() result per game in the window.

    Raises:
        ValueError: if characters 11-18 of a game link are not a
            YYYYMMDD date.
    """
    # date_end needs to be one less than the inputted date
    date_end = np.datetime64(date_end_str) - np.timedelta64(1, 'D')
    date_start = date_end - np.timedelta64(days_samplesize, 'D')

    # NOTE(review): np.arange excludes its stop value, so the last day in the
    # window is two days before date_end_str -- confirm this is intended.
    data_days = np.arange(date_start, date_end, dtype='datetime64[D]')
    # a set gives constant-time membership tests in the per-game loop below
    wanted_days = {str(day) for day in data_days}

    games_all = []
    for month in get_list_months(year_season):
        print (month)
        games_all.extend(get_list_games(month))

    boxscores = []
    for game in games_all:
        # assumes links look like '/boxscores/YYYYMMDD...' so that the date
        # occupies characters 11-18 -- TODO confirm against the site
        date_boxscore_raw = game[11:19]
        date_boxscore = datetime.strptime(date_boxscore_raw, '%Y%m%d').strftime('%Y-%m-%d')

        # only fetch the game if it's in the sample window we want
        if date_boxscore in wanted_days:
            boxscores.append(get_boxscore(game))

    return boxscores

def get_list_months(year):
    """Return the month-page links of a season's schedule.

    Downloads the basketball-reference schedule page for `year`, takes the
    first <div class="filter"> and collects the href of every <a> inside
    it.

    Returns:
        A list of href strings in the reverse of page order (the intent
        being that the newest months come first).

    Raises:
        IndexError: if the page has no <div class="filter">.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_%s_games.html" % year
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    filter_div = soup.find_all("div", class_="filter")[0]
    hrefs = [anchor.get('href') for anchor in filter_div.find_all('a')]

    # flip to get the last-listed month first
    hrefs.reverse()
    return hrefs


def get_list_games(link_month):
    """Return the boxscore links found on one month page.

    Args:
        link_month: site-relative link of the month page (appended to the
            basketball-reference host).

    Returns:
        A reverse iterator (from reversed()) over the href of the first
        <a> in every element carrying data-stat="box_score_text";
        elements without any <a> are skipped. It is a one-shot iterator,
        not a list.
    """
    url = "https://www.basketball-reference.com%s" % link_month
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    links = []
    for cell in soup.find_all(attrs={"data-stat":"box_score_text"}):
        anchors = cell.find_all('a')
        if len(anchors) > 0:
            links.append(anchors[0].get('href'))

    return reversed(links)
        
    
def get_boxscore(link_game):
    """Download one game page and return its two basic boxscore tables.

    The team codes are read from the hrefs of the <a itemprop="name"> links
    inside <div class="scorebox">: the third '/'-separated piece of the
    first link is treated as the away team and that of the second link as
    the home team. Each table is looked up by id 'box_<team>_basic' with
    the team code lower-cased.

    Args:
        link_game: site-relative link of the game page.

    Returns:
        [box_away, box_home]; an entry is None when no table with the
        expected id exists on the page.
    """
    url = "https://www.basketball-reference.com%s" % link_game
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    scorebox = soup.find('div', {'class':'scorebox'})
    team_hrefs = []
    for anchor in scorebox.find_all('a', {'itemprop':'name'}):
        team_hrefs.append(anchor.get('href'))

    # away team first, then home team
    tables = []
    for href in (team_hrefs[0], team_hrefs[1]):
        team = href.split('/')[2]
        tables.append(soup.find('table', {'id':'box_%s_basic' % team.lower()}))

    return tables
    
    
def extract_player_data(boxscores, dataset=DATA_PLAYERS):
    """Add every player's stat line from the given tables into `dataset`.

    For each table, every row except the last (team totals) is examined.
    Rows that have a minutes-played ('mp') cell and more than zero seconds
    played contribute a 15-element integer array:

        [seconds, fg, fga, fg3, fg3a, ft, fta, orb, drb,
         ast, stl, blk, tov, pf, pts]

    The array is summed into the existing Player entry for that player id,
    or a new Player is created if the id is not in `dataset` yet.

    Args:
        boxscores: iterable of boxscore tables (objects offering
            find_all/find like BeautifulSoup tags).
        dataset: mapping of player id -> Player that is updated in place.
            The default is the module-level DATA_PLAYERS; being a default
            argument it is bound when this function is defined, so
            DATA_PLAYERS must already exist at that point.

    Returns:
        None; `dataset` is mutated.
    """
    stat_names = ('fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb',
                  'ast', 'stl', 'blk', 'tov', 'pf', 'pts')

    for box in boxscores:
        rows_players = box.find_all("tr")

        # we take the last entry off the array b/c it's team totals
        for player in rows_players[:-1]:
            # a row only counts as a player line if it has a minutes cell
            cell_minutes = player.find('td', {'data-stat': 'mp'})
            if cell_minutes is None:
                continue

            player_secondsplayed = convert_time_to_seconds(cell_minutes.getText())
            if player_secondsplayed <= 0:
                continue

            player_header = player.find('th', {'data-stat': 'player'})
            player_id = player_header.get('data-append-csv')
            player_name = player_header.get('csk')

            stats = [int(player.find('td', {'data-stat': stat}).getText())
                     for stat in stat_names]
            player_data_arr = np.array([player_secondsplayed] + stats)

            # write to the container the caller supplied, not the global
            if player_id in dataset:
                dataset[player_id].data = dataset[player_id].data + player_data_arr
            else:
                dataset[player_id] = Player(player_name, player_data_arr)


def predict_spreads_games(boxscores, data_players, simulation):
    """Print the away and home roster sizes for every game.

    No prediction is computed yet: `data_players` and `simulation` are
    accepted but not used, and nothing is returned.

    Args:
        boxscores: iterable of games, each indexable with the away table
            at position 0 and the home table at position 1.
        data_players: currently unused.
        simulation: currently unused.
    """
    for game in boxscores:
        count_away = len(get_roster(game[0]))
        count_home = len(get_roster(game[1]))

        print ("%s, %s" % (count_away, count_home))
        
        
# returns the list of player ids found in ONE team's boxscore table
def get_roster(box):
    """Collect the player ids listed in a single boxscore table.

    Every row except the last (team totals) is examined; a row counts as a
    player only if it contains a minutes-played ('mp') cell. For those rows
    the 'data-append-csv' attribute of the player header cell is taken as
    the player id.

    Args:
        box: one team's boxscore table (an object offering find_all/find
            like a BeautifulSoup tag).

    Returns:
        A list of player ids in table order.
    """
    roster = []
    rows_players = box.find_all("tr")

    # we take the last entry off the array b/c it's team totals
    for player in rows_players[:-1]:
        # ensures we have valid player data by checking for the existence of one stat
        if player.find('td', {'data-stat': 'mp'}) is None:
            continue

        player_header = player.find('th', {'data-stat': 'player'})
        roster.append(player_header.get('data-append-csv'))

    return roster
                        

def convert_time_to_seconds(str_time):
    """Convert a 'minutes:seconds' string to a number of seconds.

    The string is split on ':'; the first piece is read as minutes and the
    second as seconds (any further pieces are ignored). A string without a
    ':' yields 0.

    Raises:
        ValueError: if either of the first two pieces is not an integer.
    """
    parts = str_time.split(':')
    if len(parts) >= 2:
        minutes, seconds = parts[0], parts[1]
        return int(minutes) * 60 + int(seconds)
    return 0

In [207]:
#  TESTING CELL
# Inputs for a manual test run: end date of the sample and its length in days.
date_end_str = '2017-04-01'
days_samplesize = 30

# Fetch the boxscores for the sample window (this performs network requests).
boxscores = get_boxscores(date_end_str, days_samplesize)

# Build the per-player data from the boxscores.
data_players = get_player_data(boxscores)

/leagues/NBA_2017_games-june.html
/leagues/NBA_2017_games-may.html
/leagues/NBA_2017_games-april.html
/leagues/NBA_2017_games-march.html
/leagues/NBA_2017_games-february.html
/leagues/NBA_2017_games-january.html
/leagues/NBA_2017_games-december.html
/leagues/NBA_2017_games-november.html
/leagues/NBA_2017_games-october.html
player calculations complete


In [213]:
# Earlier experiment, kept for reference (not executed):
# get_player_data('2017-04-01')
# print('done')

# Debug snippet (not executed): print the name and data of every player
# stored in DATA_PLAYERS.
# for key, value in DATA_PLAYERS.items():
#     print ("%s: %s" % (value.name, value.data))

# Run the prediction step on the boxscores and player data built in the
# testing cell; the output below is one "away, home" roster-size pair per game.
run('2017-04-01', boxscores, data_players, simulation=True)
print ('done')

8, 9
10, 9
10, 9
11, 10
10, 10
10, 10
12, 13
11, 10
10, 8
12, 9
10, 12
11, 8
12, 11
10, 11
10, 12
9, 9
10, 9
9, 9
12, 8
10, 9
10, 10
9, 12
9, 10
8, 11
13, 12
12, 13
9, 10
12, 13
11, 9
12, 12
12, 9
10, 11
10, 13
12, 8
12, 11
11, 13
9, 9
9, 11
11, 11
11, 13
11, 12
12, 10
9, 11
12, 13
13, 13
13, 13
10, 10
9, 10
9, 10
8, 9
10, 13
9, 9
10, 10
8, 11
10, 10
10, 12
10, 10
10, 12
9, 13
10, 10
11, 11
12, 10
9, 10
11, 13
13, 10
13, 11
11, 9
10, 11
10, 10
10, 10
11, 10
9, 10
12, 11
12, 13
9, 10
9, 11
13, 12
13, 12
12, 11
9, 9
12, 12
11, 10
13, 12
10, 13
10, 10
9, 11
9, 9
9, 9
8, 12
10, 11
10, 11
13, 12
10, 12
12, 12
9, 8
10, 9
9, 10
12, 11
10, 9
10, 9
13, 11
11, 9
10, 11
10, 11
11, 9
11, 10
12, 13
10, 9
13, 13
12, 8
10, 10
11, 12
10, 10
9, 11
10, 10
10, 11
10, 10
11, 12
13, 12
11, 11
10, 10
10, 10
10, 9
11, 11
11, 12
12, 12
11, 13
10, 10
13, 12
10, 12
10, 9
12, 9
13, 12
11, 11
10, 10
9, 9
10, 10
10, 10
9, 9
10, 10
13, 10
10, 11
11, 12
11, 9
11, 13
12, 11
9, 10
11, 10
12, 12
9, 10
13, 11
9, 10
9, 9