In [4]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import patsy
import datetime as dt
import time
import warnings
import datetime
warnings.filterwarnings('ignore')

In [6]:
def get_training_data(player, extension):
    base_url = "http://www.hockey-reference.com"
    letter = extension[:1]
    player_url = base_url+"/players/"+letter+"/"+extension+".html"

    print(getDateTime() + " - getting training data for [" + player + "] ...")
    
    #Follow the URL and extract the additional data needed
    r = requests.get(player_url)

    b = BeautifulSoup(r.text, "html.parser")
    
    uls = b.find_all("ul")

    game_logs = {}

    for ul in uls:
        a = ul.find_all("a", href=True)
        for link in a:
            regex = ".*/gamelog/(2017|2018)"
                
            if re.search(regex, link['href']):
                if re.search(".*/2016", link['href']):
                    season = 2016
                elif re.search("./2017", link['href']):
                    season = 2017
                elif re.search("./2018", link['href']):
                    season = 2018
                value = link['href']
                key = season
                game_logs[key]=value

        player_log = pd.DataFrame()
        buffer_df = pd.DataFrame()

    for key in game_logs:
        url = base_url + game_logs[key]
        try:
            buffer_df = get_log_data(url, key, player, goalie)
            player_log = player_log.append(buffer_df)
        except:
            #We have found a year log that does not exist for this player and it is likely that the preceeding years
            #also do not exist...
            print(getDateTime() + " - ERROR: This is not an active player.")
            buffer_df = pd.DataFrame()
            break

    return player_log

"""
Point Scoring

Points will be scored in the same way for all DraftKings daily fantasy hockey contests, according to the following schedule:

Goal Scored: 3 Points
Assist: 2 Points
Shot on Goal: 0.5 Points
Blocked Shot: 0.5 Points
Short Handed Point Bonus (Goal/Assist): 1 Points
Shootout Goal: 0.2 Points
Hat Trick Bonus: 1.5 Points

In addition, goalies will score points for the following actions:

Team Win: +3 Points
Save: +0.2 Points
Goal Allowed: -1 Points
Shutout Bonus: +2 Points
"""
def calculate_draftkings_score(row, goalie):
    if not goalie:
        hat_trick_bonus = 0.0
        if row.G >= 3:
            hat_trick_bonus = 1.5
        return (row.G * 3.0) + (row.A * 2.0) + (row.S * 0.5) + (row.BLK * 0.5) + (row.SH_G * 1.0) + (row.SH_A + 1.0) + hat_trick_bonus
    else:
        w_pts = 3.0 if row.DEC == 'W' else 0.0
        sa_pts = row.SA * 0.2
        ga_pts = row.GA * -1.0
        so_bns = 2.0 if row.SO == 1 else 0.0
        return w_pts+sa_pts+ga_pts+so_bns
    
def convert_total_on_ice(row):
    t = row.TOI
    m, s = t.split(':')
    
    return float((int(m) * 1) + (int(s)/60))

def get_log_data(gamelogs_url, season, player, goalie):
    url = gamelogs_url
    r = requests.get(url)
    b = BeautifulSoup(r.text, "html.parser")
    #Find tables tagged with id=gamelog
    table = b.find_all('table', {'id': 'gamelog'})
    rows = []
    for tbl in table:
        for tr in tbl.find_all('tr'):
            row = []
            for td in tr.find_all('td'):
                row.append(td.text.replace('\n',''))
            if not goalie:
                if len(row) == 28 and row[0] != "":
                    rows.append(row)
            else:
                if len(row) == 15 and row[0] != "":
                    rows.append(row)
    if not goalie:
        log_data = pd.DataFrame(rows, columns=['DATE', 'GAME', 'AGE', 'TM', 'HOA',
                                              'OPP', 'RSLT', 'G', 'A', 'PTS',
                                              'PM', 'PIM', 'EV_G', 'PP_G',
                                              'SH_G', 'GW_G', 'EV_A', 'PP_A',
                                              'SH_A', 'S', 'S_PCT', 'SHFT',
                                              'TOI', 'HIT', 'BLK', 'FOW',
                                              'FOL', 'FO_PCT'])
    else:
        log_data = pd.DataFrame(rows, columns=['DATE', 'GAME', 'AGE', 'TM', 'HOA',
                                              'OPP', 'RSLT', 'DEC', 'GA', 'SA', 
                                               'SV', 'SV_PCT', 'SO', 'PIM', 'TOI'])

    log_data['season'] = season
    log_data['name'] = player
    log_data.head()

    return log_data

def turn_to_float(val):
    fl = float(val)
    return fl

def turn_to_int(val):
    try:
        i = int(val)
    except:
        return val
    return i

def format_numeric_rows(df):
    h = df.columns
    
    for i in np.arange(1, len(h), 1):
        try:
            df.loc[:, h[i]] = df[h[i]].map(turn_to_float)
        except:
            df.loc[:, h[i]] = df[h[i]]
    return df

def calculate_rolling_means(glog, goalie):
    rol_means = pd.DataFrame()
    
    rolling_window = 3
    
    if not goalie:
        glog.PPG = glog.PPG.map(turn_to_float)
        PPG = glog['PPG']
        PPG_rollmean = PPG.rolling(window=rolling_window).mean()
        rol_means['PPG'] = PPG_rollmean

        glog.PTS = glog.PTS.map(turn_to_float)
        PTS = glog['PTS']
        PTS_rollmean = PTS.rolling(window=rolling_window).mean()
        rol_means['PTS'] = PTS_rollmean

        glog.S = glog.S.map(turn_to_float)
        S = glog['S']
        S_rollmean = S.rolling(window=rolling_window).mean()
        rol_means['S'] = S_rollmean

        glog.BLK = glog.BLK.map(turn_to_float)
        BLK = glog['BLK']
        BLK_rollmean = BLK.rolling(window=rolling_window).mean()
        rol_means['BLK'] = BLK_rollmean

        glog.G = glog.G.map(turn_to_float)
        G = glog['G']
        G_rollmean = G.rolling(window=rolling_window).mean()
        rol_means['G'] = G_rollmean

        glog.A = glog.A.map(turn_to_float)
        A = glog['A']
        A_rollmean = A.rolling(window=rolling_window).mean()
        rol_means['A'] = A_rollmean
        
    else:
        glog.SV_PCT = glog.SV_PCT.map(turn_to_float)
        SV_PCT = glog['SV_PCT']
        SV_PCT_rollmean = SV_PCT.rolling(window=rolling_window).mean()
        rol_means['SV_PCT'] = SV_PCT_rollmean

        glog.GA = glog.GA.map(turn_to_float)
        GA = glog['GA']
        GA_rollmean = GA.rolling(window=rolling_window).mean()
        rol_means['GA'] = GA_rollmean
        
        glog.dec_L = glog.dec_L.map(turn_to_float)
        dec_L = glog['dec_L']
        dec_L_rollmean = dec_L.rolling(window=rolling_window).mean()
        rol_means['dec_L'] = dec_L_rollmean

        glog.dec_W = glog.dec_W.map(turn_to_float)
        dec_W = glog['dec_W']
        dec_W_rollmean = dec_W.rolling(window=rolling_window).mean()
        rol_means['dec_W'] = dec_W_rollmean
    
    rol_means["player"] = player
    return rol_means

def get_rolling_mean(df, player, goalie):
    #Counter to create and append dataframe
    j = 0
    
    #Get the unique years included in the dataframe per player.
    df_unique_season = df[df['name']==player]
    seasons = list(df_unique_season.season.unique())
    del df_unique_season
    for season in seasons:
        if int(season) >= 2017:
            months = []
            days = []

            tmp_game_log = df[df['season'] == float(season)]
            tmp_game_log = tmp_game_log[tmp_game_log['name'] == player]
            tmp_game_log = tmp_game_log[~tmp_game_log['name'].isin([0])]
            tmp_game_log = tmp_game_log.reset_index()

            for i in np.arange(0, tmp_game_log.shape[0], 1):
                gameday = tmp_game_log.DATE[i].split("-")
                month = gameday[1]
                day = gameday[2]
                months.append(month)
                days.append(day)

            tmp_game_log["toSeason"] = tmp_game_log["season"].astype(int).astype(str) + "-" + months + "-" + days

            if j == 0:
                game_log = tmp_game_log
                j = j + 1
            else:
                game_log = game_log.append(tmp_game_log, ignore_index=False)

            #Garbage collect
            del tmp_game_log
    
    game_log.toSeason = pd.to_datetime(game_log.toSeason)
    game_log.set_index('toSeason', inplace=True)
    
    my_rolling_mean = calculate_rolling_means(game_log, goalie)
    my_rolling_mean = my_rolling_mean.iloc[-1]
    
    return my_rolling_mean

# Get player list with IDs
def get_player_list(path):
    df_id = pd.read_excel(path)
    df_id['id'] = df_id['CORRECTED_NAME'].str.strip()
    return df_id

def get_player_games(path):
    df_games = pd.read_excel(path)
    df_games['id'] = df_games['parsed_name'].str.strip()
    return df_games

def create_player_dict(df_merged):
    players = {}
    for idx, row in df_merged.iterrows():
        players[row['id']] = row['INDEX']

    return players

def getDateTime():
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

In [8]:
df_players = get_player_list("nhl_player_list.xlsx")

In [9]:
tiers = [['Jack Eichel','Vincent Trocheck']]

In [10]:
total_my_Draft_kings_Scores = []
save_dataframe = True
data_frames = []

for tier in tiers:
    
    goalie = False
    df_players_games = pd.DataFrame(tier, columns=['id'])
    
    df_merged = pd.merge(df_players, df_players_games, how='inner', left_on=['id'], right_on=['id'])
    players = create_player_dict(df_merged)
    
    i = 0
    l = len(players)
    for player in players:
        if i == 0:
            df = get_training_data(player, players[player])
        else:
            df = df.append(get_training_data(player, players[player]))
        i += 1
        if i < l:
            time.sleep(1)

    print(getDateTime() + " - finished getting training data ...")
    
    master_df = df

    #If there are no blocks set it to Zero.
    if not goalie:
        try:
            master_df.BLK[master_df.BLK == ''] = 0
        except:
            print(getDateTime() + ' - no blocks to convert')

    #Convert data to float and calcuate DK score.        
    master_df = format_numeric_rows(df)
    master_df['DK'] = df.apply(calculate_draftkings_score, axis=1, args=(goalie,))
    master_df['TOI_CONV'] = df.apply(convert_total_on_ice, axis=1)

    #Calculate Even Strength Point, Power Play Point and Points Per Game (60 minutes)
    if not goalie:        
        master_df['ESP_PG'] = (master_df['EV_G'].astype(float) + master_df['EV_A'].astype(float)) / 60.0
        master_df['PPP_PG'] = (master_df['PP_G'].astype(float) + master_df['PP_A'].astype(float)) / 60.0
        master_df['PPG'] = (master_df.PTS/master_df.TOI_CONV) * 60
    else:
        master_df.DEC = master_df.DEC.apply(lambda x: 'N' if x == '' else x)
        dummy_dec = pd.get_dummies(master_df.DEC, prefix="dec")
        master_df = master_df.join(dummy_dec)
        master_df.drop('DEC', inplace=True)

    my_players = master_df.name.unique()
    
    if save_dataframe:
        data_frames.append(master_df)
        
    #Create Random Forest Regressor Model for DK Score for each player
    useExtra = True
    rfr_buff = None
    rfr = []

    for indx, player in enumerate(my_players):
        print(getDateTime() + " - fitting randomforest model for [" + str(player) + "] ...")

        rfr_buff = RandomForestRegressor(oob_score=True, max_features="auto",
                                        min_samples_leaf=50, n_estimators=400, 
                                        random_state=50, n_jobs=-1)

        rfr.append(rfr_buff)
        player_mask = (master_df.name == player)
        player_df = master_df[player_mask]

        #Define my target and features
        if not goalie:
            #This is the one we were using.
            y, X = patsy.dmatrices('DK ~ PPG+PTS+S+BLK+G+A', data=player_df, return_type="dataframe")
        else:
            y, X = patsy.dmatrices('DK ~ SV_PCT+GA+dec_W+dec_L', data=player_df, return_type='dataframe')
        y = np.ravel(y)
        #Fit the model with training data.
        rfr[indx].fit(X, y)

    print(getDateTime() + " - finished fitting randomforest models ...")
    
    i = 0
    todays_features = []
    for indx, player in enumerate(my_players):
        try:
            print(getDateTime() + " - getting player features for [" + player + "] ...")
            player_features = get_rolling_mean(master_df, player, goalie)
            player_features = pd.DataFrame(player_features)
            player_features = player_features.T
            if i ==0:
                todays_features = player_features
            else:
                todays_features = todays_features.append(player_features)
            i += 1
        except Exception as e: 
            print(getDateTime() + " - failed to get player features: " + str(e))

    print(getDateTime() + " - finished getting player features ...")
    
    my_Draft_kings_Scores = {"player":[],
                         "score":[]
                        }

    #Not sure if this is what we want to do if the value
    #wasn't set 
    todays_features.fillna(0, inplace=True)

    for indx, player in enumerate(my_players):
        try:
            player_mask = (todays_features.player == player)
            X_pred = todays_features[player_mask]
            X_pred.drop("player", axis=1, inplace=True)
            X_pred["Intercept"] = 1
            player_prediction = rfr[indx].predict(X_pred)
            my_Draft_kings_Scores["player"].append(player)
            my_Draft_kings_Scores["score"].append(player_prediction)
        except Exception as e: print(e)

    my_Draft_kings_Scores = pd.DataFrame(my_Draft_kings_Scores)
    
    total_my_Draft_kings_Scores.append(my_Draft_kings_Scores)
print(getDateTime() + " - FINISHED ...")

2017-12-27 13:11:35.597623 - getting training data for [Vincent Trocheck] ...
2017-12-27 13:11:40.005219 - getting training data for [Jack Eichel] ...
2017-12-27 13:11:42.473977 - finished getting training data ...
2017-12-27 13:11:42.580878 - fitting randomforest model for [Vincent Trocheck] ...
2017-12-27 13:11:42.949912 - fitting randomforest model for [Jack Eichel] ...
2017-12-27 13:11:43.314825 - finished fitting randomforest models ...
2017-12-27 13:11:43.314963 - getting player features for [Vincent Trocheck] ...
2017-12-27 13:11:43.338379 - getting player features for [Jack Eichel] ...
2017-12-27 13:11:43.357752 - finished getting player features ...
2017-12-27 13:11:43.714733 - FINISHED ...


In [11]:
make_classic = False
if make_classic:
    tmp_df = None
    for d in total_my_Draft_kings_Scores:
        tmp_df = pd.DataFrame.append(tmp_df, d)
    tmp_df
    salary = pd.read_excel("salary.xlsx")
    player_salary = pd.merge(salary, tmp_df, left_on='name', right_on='player')
    player_salary.to_excel("with_salary.xlsx")

In [12]:
for d in total_my_Draft_kings_Scores:
    print (d)

             player            score
0  Vincent Trocheck  [4.66971398305]
1       Jack Eichel  [5.39615979381]
