FIRST DRAFT: I won't think much of the markdown until I'm done.

This is my Fantasy Hockey Analyzer. The purpose of this project is to predict the fantasy hockey output of individual skaters based on stats from previous years.

In [1]:
#Import block
import os
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
#The current working directory is the main repository directory; these lines set the path to where the data is
path = os.getcwd()
data_path = path + '\\data'

#This array makes it easier to format the rotowire data
rw_labels = ["name", "Team", "Pos", "Games", "Goals", "Assists", "Pts", "+/-", "PIM", "SOG", "GWG", "PP_Goals", "PP_Assists", "SH_Goals", "SH_Assists", "Hits", "Blocked_Shots"]

#This is the breakdown of how many fantasy points a player gets for each category
points_dict = {"Goals":5, "Assists":3, "+/-":1.5, "PIM":-0.25, "PP_Goals":4, "PP_Assists":2, "SH_Goals":6, "SH_Assists":4, "Faceoffs_Won":0.25, "Faceoffs_Lost":-0.15, "Hits":0.5, "Blocked_Shots":0.75 }


The following block takes all the data we have, and transforms it into a dataframe that has, as each row, an inividual player's numbers for a given year.

In [3]:
#I have data from the 2010-2011 season through the 2021-2022 season.
#By the end of this block, there will be 12 seasons-worth of data in the "data" variable
yearly_player_data = []

for i in range(2010, 2022):
    new_data = []
    
    #Imports the rotowire and moneypuck datasets from the selected year into rdf and mdf
    rdf = pd.read_csv(data_path + '\\rotowire_data\\rotowire{}.csv'.format(str(i)))
    mdf = pd.read_csv(data_path + '\\moneypuck_data\\moneypuck{}.csv'.format(str(i)))
    
    #Formats the rotowire data
    rdf.set_axis(rw_labels, axis=1, inplace=True)
    rdf.drop(index=rdf.index[0], axis=0, inplace=True)
    
    #The Moneypuck data has information about 5-on-5, 5-on-4, 4-on-5, other, and all.
    #For this project I'm just focused on "all" since I suspect it'll give me the best results.
    mdf = mdf[mdf["situation"] == "all"]
    
    #Merges the rotowire and moneypuck dataframes
    new_data = pd.merge(rdf, mdf, on="name")
    
    #Changes the name of a few columns in the new dataframe
    new_data = new_data.rename(columns={"name":"Name","faceoffsWon":"Faceoffs_Won","faceoffsLost":"Faceoffs_Lost"})
    
    #This section calculates each player's total fantasy output for that year
    cols = new_data.columns
    fant_points = [0 for i in range(len(new_data))]
    for i in range(len(new_data)):
        for j in range(len(new_data.iloc[i])):
            mult = points_dict.get(cols[j], 0)
            if mult != 0:
                fant_points[i] += mult*int(new_data.iloc[i, j])
    
    #Adds the players' fantasy points to the new_data dataframe
    new_data["Fantasy_Points"] = fant_points
    
    #Adds new_data to the "data" array
    yearly_player_data.append(new_data)
                
    



In [4]:
yearly_player_data[3]

Unnamed: 0,Name,Team,Pos,Games,Goals,Assists,Pts,+/-,PIM,SOG,...,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,Fantasy_Points
0,Alex Ovechkin,WAS,LW,78,51,28,79,-35,48,386,...,160.95,2320.0,3437.0,0.0,0.0,0.0,0.0,0.0,0.0,523.35
1,Corey Perry,TB,RW,81,43,39,82,32,65,280,...,151.72,2862.0,3397.0,0.0,0.0,0.0,0.0,0.0,0.0,477.10
2,Joe Pavelski,DAL,RW,82,41,38,79,23,32,225,...,124.68,3336.0,3142.0,0.0,0.0,0.0,0.0,0.0,0.0,613.10
3,Max Pacioretty,CAR,LW,73,39,21,60,8,35,270,...,144.02,2446.0,3357.0,0.0,0.0,0.0,0.0,0.0,0.0,381.40
4,Phil Kessel,VGK,RW,82,37,43,80,-5,27,305,...,179.60,2429.0,3767.0,0.0,0.0,0.0,0.0,0.0,0.0,389.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,Adam Cracknell,ARI,C,19,0,2,2,0,0,16,...,37.37,898.0,837.0,0.0,0.0,0.0,0.0,0.0,0.0,28.45
861,Adam Burish,FA,RW,15,0,0,0,-4,6,9,...,28.71,876.0,657.0,0.0,0.0,0.0,0.0,0.0,0.0,15.10
862,Andrew Ebbett,FA,C,9,0,1,1,-4,0,9,...,18.54,461.0,411.0,0.0,0.0,0.0,0.0,0.0,0.0,8.00
863,Pascal Pelletier,FA,LW,3,0,0,0,0,0,1,...,6.53,169.0,139.0,0.0,0.0,0.0,0.0,0.0,0.0,2.05


At this point I have player data for each individual year. Now, I want to turn this data into data that I can put into a machine learning model. 

At this moment, I will only use data for players that have been in the league for 3 years, such that the model will be predicting fantasy output for their fourth year.

I will use the next section to turn the yearly player data into data that I want to use for ML models. First I'll make each datapoint be a three-year stretch in a player's career, such that the model will predict the fourth year's fantasy output.

The following cells generate the data that takes a player's past 3 season to predict the fantasy output for the fourth season.

In [5]:
def combine_dataframes(df1, df2, df3, df4):
    df1 = df1.add_prefix('1_')
    df2 = df1.add_prefix('2_')
    df3 = df1.add_prefix('3_')
    temp = df4[["Fantasy_Points"]]
    return pd.concat([df1, df2, df3, temp], axis=1)

In [6]:
ml_data = pd.DataFrame()
for i in range(2013, 2022):
    t1 = yearly_player_data[i-2013]
    t2 = yearly_player_data[i-2012]
    t3 = yearly_player_data[i-2011]
    t4 = yearly_player_data[i-2010]
    #print(combine_dataframes(t1,t2,t3,t4))
    ml_data = pd.concat([ml_data, combine_dataframes(t1,t2,t3,t4)], ignore_index=True)
    
ml_data = ml_data.dropna(axis=0)

In [7]:
def separate_fantasy_points(df):
    fantasy_points = df['Fantasy_Points'].tolist()
    df = df.drop(columns=['Fantasy_Points'])
    return [df, fantasy_points]

In [8]:
def reformat_df(df):
    new_df = df.copy() # Make a copy of the input dataframe
    
    # Iterate over the columns of the dataframe
    for col in new_df.columns:
        # Check if the column name contains the substring "Name"
        if "Name" in col:
            # If it does, replace all the string values in that column with 0
            new_df[col] = new_df[col].apply(lambda x: 0 if isinstance(x, str) else x)
        else:
            # If it doesn't, replace all the string values in that column with integers
            new_df[col] = pd.Categorical(new_df[col]).codes
            
    return new_df


In [9]:
def evaluate_regr(regr, X_test, y_test):
    # Use the regr object to make predictions on the X_test data
    y_pred = regr.predict(X_test)
    
    # Calculate the mean absolute error between the predicted values and the true values
    mae = mean_absolute_error(y_test, y_pred)
    
    # Calculate the absolute errors for each prediction
    errors = [abs(y_pred[i] - y_test[i]) for i in range(len(y_pred))]
    
    # Calculate the standard deviation of the absolute errors
    mae_stddev = np.std(errors)
    
    return [mae, mae_stddev]


In [10]:
arr = separate_fantasy_points(ml_data)
X = reformat_df(arr[0])
y = arr[1]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
regr = MLPRegressor(max_iter=500).fit(X, y)

In [13]:
lst = evaluate_regr(regr, X_test, y_test)
print(lst)

[62.62362696646622, 52.71664804619927]


In [14]:
def combine_dataframes2(df1, df2, df3):
    df1 = df1.add_prefix('1_')
    df2 = df1.add_prefix('2_')
    df3 = df1.add_prefix('3_')
    return pd.concat([df1, df2, df3], axis=1)

In [15]:
t1 = yearly_player_data[9]
t2 = yearly_player_data[10]
t3 = yearly_player_data[11]
new_data = combine_dataframes2(t1, t2, t3)


In [16]:
def get_predictions(regr, df):
    # Make a copy of the input dataframe
    new_df = df.copy()
    
    # Save the values in the "1_Name" column before reformatting
    names = new_df["1_Name"].values
    
    # Use the reformat_df function to reformat the dataframe
    new_df = reformat_df(new_df)
    
    # Use the regr object to make predictions on the reformatted dataframe
    predictions = regr.predict(new_df)
    
    # Create a new dataframe with the names and predictions
    result_df = pd.DataFrame({"Name": names, "Prediction": predictions})
    
    return result_df


In [17]:
final_df = get_predictions(regr, new_data).sort_values(by="Prediction", ascending=False)


In [18]:
pd.options.display.max_rows = 862
display(final_df)

Unnamed: 0,Name,Prediction
1,David Pastrnak,206.198334
23,Elias Pettersson,181.836644
0,Alex Ovechkin,180.611721
10,Nikita Kucherov,177.180272
12,Max Pacioretty,176.59881
20,Mike Hoffman,175.743027
39,Evgenii Dadonov,162.649259
2,Auston Matthews,161.742909
13,Artemi Panarin,160.076872
35,Evgeni Malkin,159.981858


BELOW THIS IS THE ONE_YEAR DATA

In [19]:
def merge_dataframe(df1, df2):
    df1 = df1.merge(df2[['Name','Fantasy_Points']], on='Name')
    df1.drop(columns=["Fantasy_Points_x"], inplace=True)
    df1.rename(columns={'Fantasy_Points_y': 'Fantasy_Points'}, inplace=True)
    return df1

In [20]:
ml_data_one_year = pd.DataFrame()
for i in range(2011, 2022):
    df1 = yearly_player_data[i-2011]
    df2 = yearly_player_data[i-2010]
    temp = merge_dataframe(df1, df2)
    
    ml_data_one_year = pd.concat([ml_data_one_year, temp], ignore_index=True)
    


In [21]:
arr = separate_fantasy_points(ml_data_one_year)
X = reformat_df(arr[0])
y = arr[1]


In [22]:
regr = MLPRegressor(max_iter=500).fit(X, y)

In [23]:
def get_predictions2(regr, df):
    # Make a copy of the input dataframe
    new_df = df.copy()
    
    # Save the values in the "1_Name" column before reformatting
    names = new_df["Name"].values
    
    # Use the reformat_df function to reformat the dataframe
    new_df = reformat_df(new_df)
    
    # Use the regr object to make predictions on the reformatted dataframe
    predictions = regr.predict(new_df)
    
    # Create a new dataframe with the names and predictions
    result_df = pd.DataFrame({"Name": names, "Prediction": predictions})
    
    return result_df


In [24]:
new_data_2 = yearly_player_data[11].copy()
new_data_2.drop(columns=["Fantasy_Points"], inplace=True)
final_df = get_predictions2(regr, new_data_2)
final_df.sort_values(by="Prediction", ascending=False, inplace=True)
pd.options.display.max_rows = 978
display(final_df)

Unnamed: 0,Name,Prediction
65,Auston Matthews,711.490286
53,Nathan MacKinnon,672.406728
10,Aleksander Barkov,652.583407
100,J.T. Miller,635.304201
33,Patrice Bergeron,632.908238
122,Nazem Kadri,629.808334
1,Elias Lindholm,626.266724
241,Bo Horvat,620.294105
79,Sebastian Aho,618.456803
71,Sidney Crosby,611.769365
