FIRST DRAFT: I won't think much of the markdown until I'm done.

This is my Fantasy Hockey Analyzer. The purpose of this project is to predict the fantasy hockey output of individual skaters based on stats from previous years.

In [31]:
#Import block
import os
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [32]:
#The current working directory is the main repository directory; these lines set the path to where the data is
path = os.getcwd()
data_path = path + '\\data'

#This array makes it easier to format the rotowire data
rw_labels = ["name", "Team", "Pos", "Games", "Goals", "Assists", "Pts", "+/-", "PIM", "SOG", "GWG", "PP_Goals", "PP_Assists", "SH_Goals", "SH_Assists", "Hits", "Blocked_Shots"]

#This is the breakdown of how many fantasy points a player gets for each category
points_dict = {"Goals":5, "Assists":3, "+/-":1.5, "PIM":-0.25, "PP_Goals":4, "PP_Assists":2, "SH_Goals":6, "SH_Assists":4, "Faceoffs_Won":0.25, "Faceoffs_Lost":-0.15, "Hits":0.5, "Blocked_Shots":0.75 }


In [33]:
#I have data from the 2010-2011 season through the 2021-2022 season.
#By the end of this block, there will be 12 seasons-worth of data in the "data" variable
yearly_player_data = []

for i in range(2010, 2022):
    new_data = []
    
    #Imports the rotowire and moneypuck datasets from the selected year into rdf and mdf
    rdf = pd.read_csv(data_path + '\\rotowire_data\\rotowire{}.csv'.format(str(i)))
    mdf = pd.read_csv(data_path + '\\moneypuck_data\\moneypuck{}.csv'.format(str(i)))
    
    #Formats the rotowire data
    rdf.set_axis(rw_labels, axis=1, inplace=True)
    rdf.drop(index=rdf.index[0], axis=0, inplace=True)
    
    #The Moneypuck data has information about 5-on-5, 5-on-4, 4-on-5, other, and all.
    #For this project I'm just focused on "all" since I suspect it'll give me the best results.
    mdf = mdf[mdf["situation"] == "all"]
    
    #Merges the rotowire and moneypuck dataframes
    new_data = pd.merge(rdf, mdf, on="name")
    
    #Changes the name of a few columns in the new dataframe
    new_data = new_data.rename(columns={"name":"Name","faceoffsWon":"Faceoffs_Won","faceoffsLost":"Faceoffs_Lost"})
    
    #This section calculates each player's total fantasy output for that year
    cols = new_data.columns
    fant_points = [0 for i in range(len(new_data))]
    for i in range(len(new_data)):
        for j in range(len(new_data.iloc[i])):
            mult = points_dict.get(cols[j], 0)
            if mult != 0:
                fant_points[i] += mult*int(new_data.iloc[i, j])
    
    #Adds the players' fantasy points to the new_data dataframe
    new_data["Fantasy_Points"] = fant_points
    
    #Adds new_data to the "data" array
    yearly_player_data.append(new_data)
                
    



At this point I have player data for each individual year. Now, I want to turn this data into data that I can put into a machine learning model. 

At this moment, I will only use data for players that have been in the league for 3 years, such that the model will be predicting fantasy output for their fourth year.

In [34]:
def combine_dataframes(df1, df2, df3, df4):
    df1 = df1.add_prefix('1_')
    df2 = df1.add_prefix('2_')
    df3 = df1.add_prefix('3_')
    temp = df4[["Fantasy_Points"]]
    return pd.concat([df1, df2, df3, temp], axis=1)

In [35]:
ml_data = pd.DataFrame()
for i in range(2013, 2022):
    t1 = yearly_player_data[i-2013]
    t2 = yearly_player_data[i-2012]
    t3 = yearly_player_data[i-2011]
    t4 = yearly_player_data[i-2010]
    #print(combine_dataframes(t1,t2,t3,t4))
    ml_data = pd.concat([ml_data, combine_dataframes(t1,t2,t3,t4)], ignore_index=True)
    
ml_data = ml_data.dropna(axis=0)

In [36]:
ml_data

Unnamed: 0,1_Name,1_Team,1_Pos,1_Games,1_Goals,1_Assists,1_Pts,1_+/-,1_PIM,1_SOG,...,3_1_OffIce_F_shotAttempts,3_1_OffIce_A_shotAttempts,3_1_xGoalsForAfterShifts,3_1_xGoalsAgainstAfterShifts,3_1_corsiForAfterShifts,3_1_corsiAgainstAfterShifts,3_1_fenwickForAfterShifts,3_1_fenwickAgainstAfterShifts,3_1_Fantasy_Points,Fantasy_Points
0,Corey Perry,TB,RW,82,50,48,98,9,104,290,...,2297.0,3382.0,0.0,0.0,0.0,0.0,0.0,0.0,562.55,523.35
1,Steven Stamkos,TB,C,82,45,46,91,3,74,272,...,2810.0,3186.0,0.0,0.0,0.0,0.0,0.0,0.0,558.10,477.10
2,Jarome Iginla,FA,RW,82,43,43,86,0,40,289,...,2911.0,3132.0,0.0,0.0,0.0,0.0,0.0,0.0,522.30,613.10
3,Daniel Sedin,FA,LW,82,41,63,104,30,32,266,...,2983.0,3394.0,0.0,0.0,0.0,0.0,0.0,0.0,565.55,381.40
4,Ryan Kesler,FA,C,82,41,32,73,24,66,260,...,2855.0,3038.0,0.0,0.0,0.0,0.0,0.0,0.0,673.70,389.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7871,Brandon Davidson,FA,D,10,0,1,1,-4,15,7,...,458.0,463.0,0.0,0.0,0.0,0.0,0.0,0.0,13.00,285.70
7872,Anthony Bitetto,FLA,D,36,0,3,3,-7,12,30,...,1771.0,1521.0,0.0,0.0,0.0,0.0,0.0,0.0,62.50,270.60
7873,Kevin Gravel,NSH,D,36,0,3,3,-2,4,44,...,1511.0,1543.0,0.0,0.0,0.0,0.0,0.0,0.0,60.25,265.75
7874,Stefan Elliott,FA,D,3,0,1,1,1,0,3,...,130.0,181.0,0.0,0.0,0.0,0.0,0.0,0.0,6.25,336.85


In [37]:
def separate_fantasy_points(df):
    fantasy_points = df['Fantasy_Points'].tolist()
    df = df.drop(columns=['Fantasy_Points'])
    return [df, fantasy_points]

In [38]:
def reformat_df(df):
    new_df = df.copy() # Make a copy of the input dataframe
    
    # Iterate over the columns of the dataframe
    for col in new_df.columns:
        # Check if the column name contains the substring "Name"
        if "Name" in col:
            # If it does, replace all the string values in that column with 0
            new_df[col] = new_df[col].apply(lambda x: 0 if isinstance(x, str) else x)
        else:
            # If it doesn't, replace all the string values in that column with integers
            new_df[col] = pd.Categorical(new_df[col]).codes
            
    return new_df


In [39]:
def evaluate_regr(regr, X_test, y_test):
    # Use the regr object to make predictions on the X_test data
    y_pred = regr.predict(X_test)
    
    # Calculate the mean absolute error between the predicted values and the true values
    mae = mean_absolute_error(y_test, y_pred)
    
    # Calculate the absolute errors for each prediction
    errors = [abs(y_pred[i] - y_test[i]) for i in range(len(y_pred))]
    
    # Calculate the standard deviation of the absolute errors
    mae_stddev = np.std(errors)
    
    return [mae, mae_stddev]


In [40]:
arr = separate_fantasy_points(ml_data)
X = reformat_df(arr[0])
y = arr[1]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [42]:
regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)

In [43]:
lst = evaluate_regr(regr, X_test, y_test)
print(lst)

[63.534850054397836, 57.010649362350236]


In [44]:
def combine_dataframes2(df1, df2, df3):
    df1 = df1.add_prefix('1_')
    df2 = df1.add_prefix('2_')
    df3 = df1.add_prefix('3_')
    return pd.concat([df1, df2, df3], axis=1)

In [47]:
t1 = yearly_player_data[9]
t2 = yearly_player_data[10]
t3 = yearly_player_data[11]
new_data = combine_dataframes2(t1, t2, t3)


In [48]:
new_data

Unnamed: 0,1_Name,1_Team,1_Pos,1_Games,1_Goals,1_Assists,1_Pts,1_+/-,1_PIM,1_SOG,...,3_1_OffIce_A_xGoals,3_1_OffIce_F_shotAttempts,3_1_OffIce_A_shotAttempts,3_1_xGoalsForAfterShifts,3_1_xGoalsAgainstAfterShifts,3_1_corsiForAfterShifts,3_1_corsiAgainstAfterShifts,3_1_fenwickForAfterShifts,3_1_fenwickAgainstAfterShifts,3_1_Fantasy_Points
0,Alex Ovechkin,WAS,LW,68,48,19,67,-12,30,311,...,126.99,2302.0,2620.0,0.0,0.0,0.0,0.0,0.0,0.0,448.25
1,David Pastrnak,BOS,RW,70,48,47,95,21,40,279,...,135.66,2403.0,2810.0,0.0,0.0,0.0,0.0,0.0,0.0,558.60
2,Auston Matthews,TOR,C,70,47,33,80,19,8,290,...,137.95,2510.0,2773.0,0.0,0.0,0.0,0.0,0.0,0.0,562.60
3,Leon Draisaitl,EDM,C,71,43,67,110,-7,18,218,...,119.63,2100.0,2693.0,0.0,0.0,0.0,0.0,0.0,0.0,623.25
4,Mika Zibanejad,NYR,C,57,41,34,75,9,14,208,...,103.80,1887.0,2373.0,0.0,0.0,0.0,0.0,0.0,0.0,545.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,Timothy Liljegren,TOR,D,11,0,1,1,-5,2,2,...,26.83,538.0,525.0,0.0,0.0,0.0,0.0,0.0,0.0,6.50
858,Urho Vaakanainen,ANH,D,5,0,0,0,2,0,2,...,10.97,208.0,212.0,0.0,0.0,0.0,0.0,0.0,0.0,9.25
859,Josh Norris,OTT,C,3,0,0,0,2,0,9,...,6.01,138.0,137.0,0.0,0.0,0.0,0.0,0.0,0.0,10.65
860,Gage Quinney,FA,LW,3,0,1,1,-1,0,2,...,7.70,158.0,167.0,0.0,0.0,0.0,0.0,0.0,0.0,3.15


In [49]:
def get_predictions(regr, df):
    # Make a copy of the input dataframe
    new_df = df.copy()
    
    # Save the values in the "1_Name" column before reformatting
    names = new_df["1_Name"].values
    
    # Use the reformat_df function to reformat the dataframe
    new_df = reformat_df(new_df)
    
    # Use the regr object to make predictions on the reformatted dataframe
    predictions = regr.predict(new_df)
    
    # Create a new dataframe with the names and predictions
    result_df = pd.DataFrame({"Name": names, "Prediction": predictions})
    
    return result_df


In [56]:
final_df = get_predictions(regr, new_data).sort_values(by="Prediction", ascending=False)


In [57]:
pd.options.display.max_rows = 862
display(final_df)

Unnamed: 0,Name,Prediction
262,Garnet Hathaway,192.911742
94,James van Riemsdyk,185.743622
162,Ryan Donato,183.147219
62,Conor Garland,178.652765
40,Nikolaj Ehlers,178.108655
15,William Nylander,177.238129
5,Kyle Connor,175.804789
90,Roope Hintz,171.905722
50,Kevin Fiala,169.792107
83,Denis Gurianov,169.565703
