In [1]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import patsy
import datetime as dt
import time
import warnings
import datetime
warnings.filterwarnings('ignore')

In [2]:
def get_training_data(player, extension):
    base_url = "http://www.basketball-reference.com"
    letter = extension[:1]
    player_url = base_url+"/players/"+letter+"/"+extension+".html"

    #Follow the URL and extract the additional data needed
    r = requests.get(player_url)

    b = BeautifulSoup(r.text, "html.parser")
    
    uls = b.find_all("ul")

    game_logs = {}

    for ul in uls:
        a = ul.find_all("a", href=True)
        for link in a:
            regex = ".*/gamelog/(2017|2018)"
                
            if re.search(regex, link['href']):
                if re.search(".*/2016", link['href']):
                    season = 2016
                elif re.search("./2017", link['href']):
                    season = 2017
                elif re.search("./2018", link['href']):
                    season = 2018
                value = link['href']
                key = season
                game_logs[key]=value

        player_log = pd.DataFrame()
        buffer_df = pd.DataFrame()

    for key in game_logs:
        url = base_url + game_logs[key]
        try:
            buffer_df = get_log_data(url, key, player)
            player_log = player_log.append(buffer_df)
        except:
            #We have found a year log that does not exist for this player and it is likely that the preceeding years
            #also do not exist...
            print (getDateTime() + " - ERROR:  This is not an active player.")
            buffer_df = pd.DataFrame()
            break

    #player_log.head()    
    return player_log

"""
DraftKings Daily Fantasy Basketball Scoring
Point	1
Assist	1.5
Steal	2
Block	2
3 Point Shot Made	0.50
Rebound	1.25
Turnover	-0.50
Double Double	1.5
Triple Double	3
"""
def calculate_draftkings_score(row):
    double_double = 0.0
    triple_double = 0.0
    
    if (row.PTS >= 10.0 and row.TRB >= 10.0) or (row.PTS >= 10.0 and row.AST >= 10.0) or \
            (row.PTS >= 10.0 and row.STL >= 10.0) or (row.PTS >= 10.0 and row.BLK >= 10.0) or \
            (row.AST >= 10.0 and row.TRB >= 10.0) or (row.AST >= 10.0 and row.BLK >= 10.0) or \
            (row.AST >= 10.0 and row.STL >= 10.0) or (row.TRB >= 10.0 and row.BLK >= 10.0) or \
            (row.TRB >= 10.0 and row.STL >= 10.0) or (row.STL >= 10.0 and row.BLK >= 10.0):
        double_double = 1.5
    
    if (row.PTS >= 10.0 and row.AST >= 10.0 and row.TRB >= 10.0) or \
            (row.PTS >= 10.0 and row.AST >= 10.0 and row.BLK >= 10.0) or \
            (row.PTS >= 10.0 and row.AST >= 10.0 and row.STL >= 10.0) or \
            (row.PTS >= 10.0 and row.AST >= 10.0 and row.BLK >= 10.0) or \
            (row.AST >= 10.0 and row.TRB >= 10.0 and row.BLK >= 10.0) or \
            (row.AST >= 10.0 and row.TRB >= 10.0 and row.STL >= 10.0) or \
            (row.TRB >= 10.0 and row.BLK >= 10.0 and row.STL >= 10.0):
        triple_double = 3.0
    
    return (row.PTS * 1.0) + \
               (row.AST * 1.5) + \
               (row.STL * 2.0) + \
               (row.THREE * 0.5) + \
               (row.TRB * 1.25) + \
               (row.TOV * -0.5) + \
               (row.BLK * 2.0) + \
               double_double + \
               triple_double
                
def convert_minutes_played(row):
    t = row.MP
    m, s = t.split(':')
    
    return float(int(m) * 60 + int(s))

def get_log_data(gamelogs_url, year, player):
    url = gamelogs_url
    r = requests.get(url)
    b = BeautifulSoup(r.text, "html.parser")
    #Find tables tagged with id=pgl_basic
    table = b.find_all('table', {'id': 'pgl_basic'})
    rows = []
    for tbl in table:
        for tr in tbl.find_all('tr'):
            row = []
            for td in tr.find_all('td'):
                row.append(td.text.replace('\n',''))
            if len(row) == 29 and row[0] != "":
                rows.append(row)
            elif len(row) == 28 and row[0] != "":
                row.append(-99)
                rows.append(row)
        
    #print(rows)
    log_data = pd.DataFrame(rows, columns=['G', 'DATE', 'AGE', 'TM', 'HOA',
                                          'OPP', 'RSLT', 'GS', 'MP', 'FG',
                                          'FGA', 'FG_PCT', 'THREE', 'THREE_A', 'THREE_PCT',
                                          'FT', 'FTA', 'FT_PCT', 'ORB', 'DRB', 
                                          'TRB', 'AST', 'STL', 'BLK', 'TOV',
                                          'PF', 'PTS', 'GMSC', 'PM'])

    log_data['season'] = year
    log_data['name'] = player
    log_data.head()

    return log_data

def turn_to_float(val):
    fl = float(val)
    return fl

def turn_to_int(val):
    try:
        i = int(val)
    except:
        return val
    return i

def format_numeric_rows(df):
    h = df.columns
    
    for i in np.arange(1, len(h), 1):
        try:
            df.loc[:, h[i]] = df[h[i]].map(turn_to_float)
        except:
            df.loc[:, h[i]] = df[h[i]]
    return df

def calculate_rolling_means(glog):
    rol_means = pd.DataFrame()
    
    rolling_window = 3
    
    glog.GMSC = glog.GMSC.map(turn_to_float)
    GMSC = glog['GMSC']
    GMSC = (GMSC - GMSC.mean()) / GMSC.std()
    GMSC_rollmean = GMSC.rolling(window=rolling_window).mean()
    rol_means['GMSC'] = GMSC_rollmean
    
    glog.TRB = glog.TRB.map(turn_to_float)
    TRB = glog['TRB']
    TRB = (TRB - TRB.mean()) / TRB.std()
    TRB_rollmean = TRB.rolling(window=rolling_window).mean()
    rol_means['TRB'] = TRB_rollmean

    glog.AST = glog.AST.map(turn_to_float)
    AST = glog['AST']
    AST = (AST - AST.mean()) / AST.std()
    AST_rollmean = AST.rolling(window=rolling_window).mean()
    rol_means['AST'] = AST_rollmean

    glog.STL = glog.STL.map(turn_to_float)
    STL = glog['STL']
    STL = (STL - STL.mean()) / STL.std()
    STL_rollmean = STL.rolling(window=rolling_window).mean()
    rol_means['STL'] = STL_rollmean

    glog.BLK = glog.BLK.map(turn_to_float)
    BLK = glog['BLK']
    BLK = (BLK - BLK.mean()) / BLK.std()
    BLK_rollmean = BLK.rolling(window=rolling_window).mean()
    rol_means['BLK'] = BLK_rollmean

    rol_means["player"] = glog.name[0]
        
    return rol_means

def get_rolling_mean(df, player):
    #print(player)
    #Counter to create and append dataframe
    j = 0
    
    #Get the unique years included in the dataframe per player.
    df_unique_season = df[df['name']==player]
    seasons = list(df_unique_season.season.unique())
    del df_unique_season
    
    for season in seasons:
        if int(season) >= 2017:
            months = []
            days = []

            tmp_game_log = df[df['season'] == float(season)]
            tmp_game_log = tmp_game_log[tmp_game_log['name'] == player]
            tmp_game_log = tmp_game_log[~tmp_game_log['name'].isin([0])]
            tmp_game_log = tmp_game_log.reset_index()

            for i in np.arange(0, tmp_game_log.shape[0], 1):
                gameday = dt.datetime.strptime(tmp_game_log.DATE[i],'%Y-%m-%d')
                month = str(gameday.month)
                day = str(gameday.day)
                months.append(month)
                days.append(day)

            tmp_game_log["toSeason"] = tmp_game_log["season"].astype(int).astype(str) + "-" + months + "-" + days
            tmp_game_log.toSeason = pd.to_datetime(tmp_game_log.toSeason)

            if j == 0:
                game_log = tmp_game_log
                j = j + 1
            else:
                game_log = game_log.append(tmp_game_log, ignore_index=True)

            #Garbage collect
            del tmp_game_log
    
    game_log.set_index('toSeason', inplace=True)
    
    my_rolling_mean = calculate_rolling_means(game_log)
    my_rolling_mean = my_rolling_mean.iloc[-1]
    
    return my_rolling_mean

# Get player list with IDs
def get_player_list(path):
    df_id = pd.read_excel(path)
    df_id['id'] = df_id['CORRECTED_NAME'].str.strip()
    return df_id

def get_player_games(path):
    df_games = pd.read_excel(path)
    df_games['id'] = df_games['parsed_name'].str.strip()
    return df_games

def create_player_dict(df_merged):
    players = {}
    for idx, row in df_merged.iterrows():
        players[row['id']] = row['INDEX']

    return players

def getDateTime():
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

In [3]:
df_players = get_player_list("nba_player_list.xlsx")

In [4]:
tiers = [['Andre Drummond','Nikola Vucevic','Victor Oladipo','John Wall'],
['Kevin Love','Bradley Beal','DeMar DeRozan','Kyle Lowry']]

In [5]:
total_my_Draft_kings_Scores = []

for tier in tiers:
    
    df_players_games = pd.DataFrame(tier, columns=['id'])
    
    df_merged = pd.merge(df_players, df_players_games, how='inner', left_on=['id'], right_on=['id'])
    players = create_player_dict(df_merged)
    
    i = 0
    l = len(players)
    for player in players:
        print (getDateTime() + " - getting training data for [" + player + "] ...")
        if i == 0:
            df = get_training_data(player, players[player])
        else:
            df = df.append(get_training_data(player, players[player]))
        i += 1

        if i < l:
            time.sleep(3)

    print (getDateTime() + " - finished getting player data ...")
    
    master_df = df
    master_df = format_numeric_rows(df)
    master_df['DK'] = df.apply(calculate_draftkings_score, axis=1)
    master_df['MP_SEC'] = df.apply(convert_minutes_played, axis=1)
    #(Points)+(Rebounds)+(Steals)+(Assists)+(Blocked Shots)- (Turnovers)-(Missed Shots)
    master_df['EFFICIENCY'] = master_df['PTS']+master_df['TRB']+master_df['STL']+master_df['AST']+master_df['BLK']-master_df['TOV']-(master_df['FGA']-master_df['FG'])
    master_df.FT_PCT = master_df.FT_PCT.apply(lambda x: 0.0 if x == '' else x)
    master_df.THREE_PCT = master_df.THREE_PCT.apply(lambda x: 0.0 if x == '' else x)
    master_df.FG_PCT = master_df.FG_PCT.apply(lambda x: 0.0 if x == '' else x)
    master_df.FT_PCT = master_df.FT_PCT.astype(float)
    master_df.THREE_PCT = master_df.THREE_PCT.astype(float)
    master_df.FG_PCT = master_df.FG_PCT.astype(float)
    my_players = master_df.name.unique()
    
    #Create Random Forest Regressor MOdel for DK Score for each player
    useExtra = True
    rfr_buff = None
    rfr = []

    for indx, player in enumerate(my_players):
        print(getDateTime() + " - fitting randomforest model for [" + str(player) + "] ...")
        rfr_buff = RandomForestRegressor(oob_score=True, max_features="auto",
                                        min_samples_leaf=50, n_estimators=400, 
                                        random_state=50, n_jobs=-1)

        rfr.append(rfr_buff)
        player_mask = (master_df.name == player)
        player_df = master_df[player_mask]

        y, X = patsy.dmatrices('DK ~ GMSC+AST+TRB+STL+BLK', data=player_df, return_type='dataframe')

        rfr[indx].fit(X, y)

    print(getDateTime() + " - finished fitting randomforest models ...")
    
    i = 0
    todays_features = []
    for indx, player in enumerate(my_players):
        try:
            print(getDateTime() + " - getting player features for [" + player + "] ...")
            player_features = get_rolling_mean(master_df, player)
            player_features = pd.DataFrame(player_features)
            player_features = player_features.T

            if i ==0:
                todays_features = player_features
            else:
                todays_features = todays_features.append(player_features)
            i += 1

        except Exception as e: 
            print("Error: " + str(e))

    print(getDateTime() + " - finished getting player features ...")
    
    my_Draft_kings_Scores = {"player":[],
                         "score":[]}

    todays_features.fillna(0, inplace=True)

    for indx, player in enumerate(my_players):
        try:
            player_mask = (todays_features.player == player)
            X_pred = todays_features[player_mask]
            X_pred.drop("player", axis=1, inplace=True)
            X_pred['Intercept'] = 1
            player_prediction = rfr[indx].predict(X_pred)
            my_Draft_kings_Scores["player"].append(player)
            my_Draft_kings_Scores["score"].append(player_prediction)
        except Exception as e: print(e)

    my_Draft_kings_Scores = pd.DataFrame(my_Draft_kings_Scores)
    my_Draft_kings_Scores
    
    total_my_Draft_kings_Scores.append(my_Draft_kings_Scores)
    
print (getDateTime() + " - FINISHED ...")

2017-12-27 13:14:43.397872 - getting training data for [John Wall] ...
2017-12-27 13:14:48.990295 - getting training data for [Nikola Vucevic] ...
2017-12-27 13:14:55.036374 - getting training data for [Victor Oladipo] ...
2017-12-27 13:15:00.570689 - getting training data for [Andre Drummond] ...
2017-12-27 13:15:03.088821 - finished getting player data ...
2017-12-27 13:15:03.211479 - fitting randomforest model for [John Wall] ...
2017-12-27 13:15:03.574787 - fitting randomforest model for [Nikola Vucevic] ...
2017-12-27 13:15:03.936185 - fitting randomforest model for [Victor Oladipo] ...
2017-12-27 13:15:04.296948 - fitting randomforest model for [Andre Drummond] ...
2017-12-27 13:15:04.656724 - finished fitting randomforest models ...
2017-12-27 13:15:04.656860 - getting player features for [John Wall] ...
2017-12-27 13:15:04.681359 - getting player features for [Nikola Vucevic] ...
2017-12-27 13:15:04.700456 - getting player features for [Victor Oladipo] ...
2017-12-27 13:15:04.7

In [6]:
for d in total_my_Draft_kings_Scores:
    print (d)

           player            score
0       John Wall  [47.1601113861]
1  Nikola Vucevic  [36.7954587156]
2  Victor Oladipo     [33.0946625]
3  Andre Drummond  [39.3496765351]
          player            score
0     Kyle Lowry    [41.17984375]
1     Kevin Love  [39.3784206989]
2  DeMar DeRozan  [40.6878066038]
3   Bradley Beal   [35.730259009]
