In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
import math

In [2]:
def calculate_strength(df):
    """Describe each event's skater count from the acting team's point of view.

    For every row the result is the string "<own skaters>v<opponent skaters>",
    where "own" is the side named in the 'Team' column. Rows whose 'Team'
    matches neither 'Home Team' nor 'Away Team' get '?'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Team', 'Home Team', 'Away Team',
        'Home Team Skaters' and 'Away Team Skaters'.

    Returns
    -------
    list of str
        One entry per row, in row order.
    """
    # Iterate the columns positionally instead of df.loc[i, ...]: the latter
    # is label-based and breaks on any frame without a default 0..n-1 index
    # (e.g. after filtering).
    rows = zip(df['Team'], df['Home Team'], df['Away Team'],
               df['Home Team Skaters'], df['Away Team Skaters'])
    Skaters = []
    for team, home_team, away_team, home_count, away_count in rows:
        if team == home_team:
            state = str(home_count) + "v" + str(away_count)
        elif team == away_team:
            state = str(away_count) + "v" + str(home_count)
        else:
            state = '?'
        Skaters.append(state)
    return Skaters

def calculate_strength_general(df):
    """Classify each event's manpower situation for the acting team.

    The acting team is the one named in 'Team'. Its skater count is compared
    with the opponent's:

    * equal   -> "even strength"
    * more    -> "power play"
    * fewer   -> "shorthanded"

    Rows whose 'Team' matches neither 'Home Team' nor 'Away Team' get '?'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Team', 'Home Team', 'Away Team',
        'Home Team Skaters' and 'Away Team Skaters'.

    Returns
    -------
    list of str
        One label per row, in row order.
    """
    # Iterate the columns positionally rather than through df.loc[i, ...],
    # which is label-based and fails on frames without a 0..n-1 index.
    rows = zip(df['Team'], df['Home Team'], df['Away Team'],
               df['Home Team Skaters'], df['Away Team Skaters'])
    states = []
    for team, home_team, away_team, home_count, away_count in rows:
        # Resolve "own" vs "opponent" once so the comparison ladder is not
        # duplicated for the home and away cases.
        if team == home_team:
            own, opponent = home_count, away_count
        elif team == away_team:
            own, opponent = away_count, home_count
        else:
            states.append('?')
            continue

        if own == opponent:
            states.append("even strength")
        elif own > opponent:
            states.append("power play")
        else:
            states.append("shorthanded")
    return states

In [3]:
def get_shot_dist_and_angle(df, goal_x=189, goal_y=42.5):
    """Add 'Shot Angle' and 'Shot Distance' columns to ``df`` in place.

    Only rows whose 'Event' is 'Shot' or 'Goal' receive values; every other
    row gets NaN in both columns.

    * Distance is the Euclidean distance from ('X Coordinate', 'Y Coordinate')
      to (goal_x, goal_y).
    * Angle, in degrees, is ``arctan(|y - goal_y| / (goal_x - x))``. A shot
      with x == goal_x therefore yields 90 (or NaN when it is also at
      y == goal_y), and a shot with x > goal_x yields a negative angle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Event', 'X Coordinate' and 'Y Coordinate'.
    goal_x, goal_y : float, optional
        Reference point the distance and angle are measured to. The defaults
        are the constants this notebook has always used.
        NOTE(review): presumably the attacking net on a 200 x 85 rink - confirm.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added or overwritten.
    """
    # Positional boolean mask, so the frame's index labels do not matter.
    is_shot = df['Event'].isin(['Shot', 'Goal']).to_numpy()

    shot_distance = np.full(len(df), np.nan)
    shot_angle = np.full(len(df), np.nan)

    # Only shot rows are converted to float, so non-shot rows may hold
    # missing or non-numeric coordinates without breaking the computation.
    dx = goal_x - df.loc[is_shot, 'X Coordinate'].to_numpy(dtype=float)
    dy = np.abs(df.loc[is_shot, 'Y Coordinate'].to_numpy(dtype=float) - goal_y)

    shot_distance[is_shot] = np.hypot(dx, dy)
    # dx == 0 is a legitimate input (shot taken level with the goal line);
    # silence the divide warnings and let arctan(inf) resolve to 90 degrees.
    with np.errstate(divide='ignore', invalid='ignore'):
        shot_angle[is_shot] = np.degrees(np.arctan(dy / dx))

    df['Shot Angle'] = shot_angle
    df['Shot Distance'] = shot_distance
    return df

In [4]:
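# xG model features, each followed by the DataFrame column that holds it
# Shot distance -> 'Shot Distance'
# Time since last event* -> 'Time Since Last'
# Shot type -> 'Detail 1'
# Shot angle -> 'Shot Angle'
# East/west (lateral) movement since last event* -> 'Lateral Since Last'
# Rebound angle* -> ? (no column yet)
# Other team's skaters* -> 'Opp Team Skaters'
# Is power play* -> 'Is Powerplay'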
#print(df.columns)
#df.loc[df.Event == 'Shot']['Detail 1']

def find_diff_bt_events(df):
    """Compare every event with the one on the row directly before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Clock' (strings of the form "MM:SS"), 'Y Coordinate'
        and 'Event'. Rows are assumed to be in chronological order.

    Returns
    -------
    tuple of three lists, each with one entry per row
        elapsed_since_last : previous row's clock minus this row's clock, in
            seconds (positive when the clock is counting down).
        east_west_since_last : absolute change in 'Y Coordinate'.
        last_event : the previous row's 'Event'.
        The first row has no predecessor, so its entry is NaN in all three.
        An empty frame yields three empty lists.

    Notes
    -----
    Only adjacent rows are compared; no period or game column is consulted,
    so a pair of rows that straddles a clock reset produces a negative
    elapsed value.
    """
    n_rows = len(df)
    if n_rows < 2:
        # Zero rows -> empty lists; one row -> a single NaN placeholder each.
        # Nothing is parsed, so a lone row with a missing clock is tolerated.
        return [np.nan] * n_rows, [np.nan] * n_rows, [np.nan] * n_rows

    def to_seconds(clock):
        # "MM:SS" -> total seconds; only the first two fields are read.
        fields = clock.split(':')
        return int(fields[0]) * 60 + int(fields[1])

    # Pull the columns out positionally so index labels are irrelevant
    # (df.loc[i, ...] would require a default 0..n-1 index).
    seconds = [to_seconds(clock) for clock in df['Clock']]
    lateral = df['Y Coordinate'].tolist()
    events = df['Event'].tolist()

    elapsed_since_last = [np.nan] + [
        before - after for before, after in zip(seconds, seconds[1:])
    ]
    east_west_since_last = [np.nan] + [
        abs(before - after) for before, after in zip(lateral, lateral[1:])
    ]
    last_event = [np.nan] + events[:-1]
    return elapsed_since_last, east_west_since_last, last_event

def find_time_bt_events(df):
    """Return the seconds elapsed between each event and the row before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Clock' (strings of the form "MM:SS"). Rows are assumed
        to be in chronological order.

    Returns
    -------
    list
        One entry per row: NaN for the first row, then the previous row's
        clock minus this row's clock in seconds (positive when the clock is
        counting down). An empty frame yields an empty list.
    """
    n_rows = len(df)
    if n_rows < 2:
        # Zero rows -> []; one row -> [NaN]. Nothing is parsed in either case.
        return [np.nan] * n_rows

    def to_seconds(clock):
        # "MM:SS" -> total seconds; only the first two fields are read.
        fields = clock.split(':')
        return int(fields[0]) * 60 + int(fields[1])

    # Iterate the column positionally; df.loc[i, 'Clock'] is label-based and
    # would fail on a frame without a default 0..n-1 index.
    seconds = [to_seconds(clock) for clock in df['Clock']]
    return [np.nan] + [before - after for before, after in zip(seconds, seconds[1:])]

def find_opp_team_skaters(df):
    """Return, for every row, the skater count of the acting team's opponent.

    When 'Team' equals 'Away Team' the opponent is the home side, so
    'Home Team Skaters' is used; in every other case (including rows whose
    'Team' matches neither side) 'Away Team Skaters' is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Team', 'Away Team', 'Home Team Skaters' and
        'Away Team Skaters'.

    Returns
    -------
    list
        One skater count per row, in row order.
    """
    # The previous implementation rebound the result list to a scalar on each
    # iteration, so it returned only the LAST row's value and the caller
    # broadcast that single number to every row. Collect one value per row.
    rows = zip(df['Team'], df['Away Team'],
               df['Home Team Skaters'], df['Away Team Skaters'])
    return [
        home_count if team == away_team else away_count
        for team, away_team, home_count, away_count in rows
    ]

In [5]:
def get_xG_inputs(df):
    """Build the xG model's feature matrix and target from an event frame.

    Side effects on ``df`` (columns added or overwritten in place):
    'Time Since Last', 'Last Event Type', 'Lateral Since Last',
    'Opp Team Skaters', 'Is Powerplay' and 'Is Goal'.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data. Besides the columns needed by ``find_diff_bt_events`` and
        ``find_opp_team_skaters`` it must contain 'Event', 'Strength',
        'Detail 1', 'X Coordinate', 'Y Coordinate', 'Shot Angle' and
        'Shot Distance'.

    Returns
    -------
    inputs : pandas.DataFrame
        One row per 'Shot'/'Goal' event, restricted to the model features.
    labels : pandas.Series
        'Is Goal' (1 for goals, 0 otherwise) for the same rows.
    indices : list
        Index labels of those rows in ``df``, so predictions can be written
        back to the original frame.

    Raises
    ------
    KeyError
        If one of the expected one-hot columns (for example 'Last Was Shot'
        or 'Slapshot') never occurs among the shots in ``df``.
    """
    # Context taken from the event immediately preceding each row.
    time_d, lat_d, lasts = find_diff_bt_events(df)
    df['Time Since Last'] = time_d
    df['Last Event Type'] = lasts
    df['Lateral Since Last'] = lat_d
    df['Opp Team Skaters'] = find_opp_team_skaters(df)

    # Binary indicators: one model feature and the target.
    df['Is Powerplay'] = (df['Strength'] == 'power play').astype(int)
    df['Is Goal'] = (df['Event'] == 'Goal').astype(int)

    # Keep only shot attempts. The explicit copy means the columns added
    # below land on an independent frame rather than a slice of ``df``
    # (avoids SettingWithCopyWarning).
    shot_df = df.loc[df.Event.isin(["Goal", "Shot"])].copy()
    indices = shot_df.index.tolist()
    shot_df = shot_df.reset_index(drop=True)

    # One-hot encode the prior event type and the shot type. Each encoder is
    # fitted exactly once; its categories become the new column names.
    for source_column, prefix in (('Last Event Type', 'Last Was '), ('Detail 1', '')):
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(shot_df[[source_column]]).toarray()
        for position, category in enumerate(encoder.categories_[0]):
            shot_df[prefix + str(category)] = encoded[:, position]

    # 'Last Was Play' and 'Wristshot' are deliberately left out of the model
    # inputs (presumably as the reference level of each one-hot group).
    feature_columns = [
        'X Coordinate', 'Y Coordinate', 'Shot Angle', 'Shot Distance',
        'Time Since Last', 'Lateral Since Last', 'Opp Team Skaters',
        'Is Powerplay',
        'Last Was Puck Recovery', 'Last Was Shot', 'Last Was Takeaway',
        'Last Was Zone Entry',
        'Deflection', 'Fan', 'Slapshot', 'Snapshot', 'Wrap Around',
    ]
    inputs = shot_df[feature_columns]
    labels = shot_df['Is Goal']
    return inputs, labels, indices

In [6]:
def train_xg_model(x,y):
    """Fit a cross-validated logistic-regression xG model and print its coefficients.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; its column names label the printed coefficients.
    y : array-like
        Binary target (1 = goal, 0 = no goal).

    Returns
    -------
    sklearn.linear_model.LogisticRegressionCV
        The fitted classifier.
    """
    # Shuffled 10-fold CV with a fixed seed so repeated runs are reproducible.
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    # The regularisation grid (Cs) and penalty are left at the estimator's
    # defaults; candidates are ranked by ROC AUC.
    # NOTE(review): tol=10 is a very loose stopping tolerance compared with
    # scikit-learn's default of 1e-4 - confirm this is intentional.
    clf = LogisticRegressionCV(scoring='roc_auc', cv=kf, random_state=0,
                               max_iter=10000, fit_intercept=True,
                               solver='newton-cg', tol=10)
    clf.fit(x, y)

    # For a binary target coef_ has shape (1, n_features); take the single
    # row so 'Coef' holds plain floats rather than 1-element arrays, which
    # keeps the sort and the printed table clean.
    coefs = pd.DataFrame({'Variable': np.array(x.columns), 'Coef': clf.coef_[0]}) \
              .sort_values(['Coef'], ascending=False)
    print(coefs)
    return clf

In [7]:
def shot_feature_adder(df):
    """Run the full xG pipeline on ``df`` and attach an 'xG' column.

    Steps, in order: add shot distance/angle, add the 'Skaters' and
    'Strength' descriptions, build the model inputs, train the model on
    those inputs, then write each shot's predicted goal probability back to
    its row. Rows that are not shots keep NaN in 'xG'.

    The frame is modified in place and also returned.
    """
    dfw = get_shot_dist_and_angle(df)
    dfw['Skaters'] = calculate_strength(dfw)
    dfw['Strength'] = calculate_strength_general(dfw)

    features, targets, shot_rows = get_xG_inputs(dfw)
    model = train_xg_model(features, targets)
    # Column 1 of predict_proba is the probability of the positive class.
    goal_probability = model.predict_proba(features)[:, 1]

    # Start from all-NaN, then fill in only the rows that were shots.
    dfw['xG'] = [np.nan] * len(dfw)
    for row_label, probability in zip(shot_rows, goal_probability):
        dfw.loc[row_label, 'xG'] = probability
    return dfw

In [9]:
def clocktime(clock):
    """Convert a "MM:SS" clock string to a whole number of seconds.

    Only the first two colon-separated fields are read; any further fields
    are ignored.
    """
    fields = clock.split(':')
    minutes = int(fields[0])
    seconds = int(fields[1])
    return minutes * 60 + seconds

def dist(a,b):
    """Return the Euclidean distance between two indexable sequences.

    The positions 0 .. len(a) - 1 are compared; ``b`` must be at least as
    long as ``a``, and any extra trailing elements of ``b`` are ignored.
    """
    squared_total = 0
    position = 0
    size = len(a)
    while position < size:
        gap = a[position] - b[position]
        squared_total += gap ** 2
        position += 1
    return squared_total ** 0.5