FIRST DRAFT: I won't think much of the markdown until I'm done.

This is my Fantasy Hockey Analyzer. The purpose of this project is to predict the fantasy hockey output of individual skaters based on stats from previous years.

In [1]:
#Import block
import os
import numpy as np
import pandas as pd
import my_module as mx
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [2]:
#The current working directory is the main repository directory; these lines set the path to where the data is.
#os.path.join builds the path with the separator of the operating system the notebook runs on,
#instead of a hard-coded Windows backslash.
path = os.getcwd()
data_path = os.path.join(path, 'data')

#Column labels for the rotowire data; they are applied to the rotowire columns by position
rw_labels = ["name", "Team", "Pos", "Games", "Goals", "Assists", "Pts", "+/-", "PIM", "SOG", "GWG", "PP_Goals", "PP_Assists", "SH_Goals", "SH_Assists", "Hits", "Blocked_Shots"]

#This is the breakdown of how many fantasy points a player gets for each category.
#Columns that are not listed here contribute nothing to a player's total.
points_dict = {"Goals":5, "Assists":3, "+/-":1.5, "PIM":-0.25, "PP_Goals":4, "PP_Assists":2, "SH_Goals":6, "SH_Assists":4, "Faceoffs_Won":0.25, "Faceoffs_Lost":-0.15, "Hits":0.5, "Blocked_Shots":0.75 }


The following block takes all the data we have and transforms it into one dataframe per season, in which each row holds an individual player's numbers for that year.

In [3]:
#I have data from the 2010-2011 season through the 2021-2022 season.
#By the end of this block, there will be 12 seasons-worth of data in the "yearly_player_data" list
def _season_fantasy_points(season_df, scoring):
    """Return the fantasy point total of every row of season_df as a Series.

    Each column whose name is a key of scoring contributes
    multiplier * value to the row total; every other column is ignored.
    Values are truncated toward zero before being weighted, which mirrors the
    int() conversion of the per-cell loop this helper replaces. A value that
    cannot be read as a number raises ValueError; a missing value makes that
    row's total NaN.
    """
    total = pd.Series(0.0, index=season_df.index)
    #Columns are addressed by position so that a repeated column name is never ambiguous
    for position, column in enumerate(season_df.columns):
        multiplier = scoring.get(column, 0)
        if multiplier != 0:
            values = np.trunc(pd.to_numeric(season_df.iloc[:, position]))
            total = total + multiplier * values
    return total


yearly_player_data = []

for season in range(2010, 2022):
    #Imports the rotowire and moneypuck datasets from the selected year into rdf and mdf
    rdf = pd.read_csv(os.path.join(data_path, 'rotowire_data', 'rotowire{}.csv'.format(season)))
    mdf = pd.read_csv(os.path.join(data_path, 'moneypuck_data', 'moneypuck{}.csv'.format(season)))

    #Formats the rotowire data: relabel the columns, then discard the first row.
    #set_axis no longer accepts inplace=True in pandas 2.x, so the results are reassigned.
    rdf = rdf.set_axis(rw_labels, axis=1)
    rdf = rdf.drop(index=rdf.index[0])

    #The Moneypuck data has information about 5-on-5, 5-on-4, 4-on-5, other, and all.
    #For this project I'm just focused on "all" since I suspect it'll give me the best results.
    mdf = mdf[mdf["situation"] == "all"]

    #Merges the rotowire and moneypuck dataframes (only players present in both are kept)
    new_data = pd.merge(rdf, mdf, on="name")

    #Changes the name of a few columns in the new dataframe
    new_data = new_data.rename(columns={"name":"Name","faceoffsWon":"Faceoffs_Won","faceoffsLost":"Faceoffs_Lost"})

    #Adds each player's total fantasy output for that year to the new_data dataframe
    new_data["Fantasy_Points"] = _season_fantasy_points(new_data, points_dict)

    #Adds new_data to the "yearly_player_data" list
    yearly_player_data.append(new_data)
                
    



In [None]:
#Spot-check one season (index 3 holds the files labelled 2013); head() keeps the output to a few rows
yearly_player_data[3].head()

At this point I have player data for each individual year. Now, I want to turn this data into data that I can put into a machine learning model. 

At this moment, I will only use data for players that have been in the league for 3 years, such that the model will be predicting fantasy output for their fourth year.

I will use the next section to turn the yearly player data into data that I want to use for ML models. First I'll make each datapoint be a three-year stretch in a player's career, such that the model will predict the fourth year's fantasy output.

The following cells generate the dataset that uses a player's past three seasons to predict their fantasy output in the fourth season.

In [None]:
def combine_dataframes(df1, df2, df3, df4):
    """Join three prefixed seasons with the fantasy points of a fourth one.

    df1, df2 and df3 are combined by combine_dataframes2. The "Name" and
    "Fantasy_Points" columns of df4 are then merged onto that result by
    matching its "1_Name" column against df4's "Name" column.
    """
    history = combine_dataframes2(df1, df2, df3)
    target = df4[["Name", "Fantasy_Points"]]
    return history.merge(target, left_on="1_Name", right_on="Name")

def combine_dataframes2(df1, df2, df3):
    """Merge three dataframes side by side, keyed on their "Name" columns.

    Every column of df1, df2 and df3 is prefixed with "1_", "2_" and "3_"
    respectively. The frames are then merged so that a row is produced only
    where "1_Name" equals "2_Name" and "1_Name" equals "3_Name".
    """
    first, second, third = (
        frame.add_prefix("{}_".format(number))
        for number, frame in enumerate((df1, df2, df3), start=1)
    )
    first_two = first.merge(second, left_on="1_Name", right_on="2_Name")
    return first_two.merge(third, left_on="1_Name", right_on="3_Name")

def separate_fantasy_points(df):
    """Split df into its feature columns and its "Fantasy_Points" values.

    Returns a two-element list: a dataframe without the "Fantasy_Points"
    column, followed by that column's values as a plain Python list. The
    input dataframe itself is not modified.
    """
    target_column = "Fantasy_Points"
    targets = df[target_column].tolist()
    features = df.drop(columns=[target_column])
    return [features, targets]


def reformat_df(df):
    """Return a copy of df in which every column is numeric.

    - Columns whose name contains "Name": every string value becomes 0.
    - Columns that already have a numeric dtype are left untouched.
    - Other columns whose non-missing values can all be read as numbers
      (for example "50" or "-1") are converted to those numbers.
    - Any remaining column is replaced by integer category codes.

    Previously every non-name column was replaced by category codes, numeric
    ones included. That discarded the real magnitudes and encoded the same
    value differently in different dataframes (e.g. training vs. prediction).
    The input dataframe is not modified.
    """
    new_df = df.copy() # Make a copy of the input dataframe

    # Iterate over the columns of the dataframe
    for col in new_df.columns:
        # Check if the column name contains the substring "Name"
        if "Name" in col:
            # If it does, replace all the string values in that column with 0
            new_df[col] = new_df[col].apply(lambda x: 0 if isinstance(x, str) else x)
            continue

        series = new_df[col]
        if pd.api.types.is_numeric_dtype(series):
            # Already numeric: keep the actual values
            continue

        # Try to read the column as numbers; anything unreadable becomes NaN
        as_numbers = pd.to_numeric(series, errors="coerce")
        if (as_numbers.notna() == series.notna()).all():
            # No value was lost, so the column really holds numbers stored as text
            new_df[col] = as_numbers
        else:
            # Genuinely categorical text: replace the values with integer codes
            new_df[col] = pd.Categorical(series).codes

    return new_df

def evaluate_regr(regr, X_test, y_test):
    """Score a fitted regressor on a test set.

    Parameters:
        regr: object with a predict(X) method.
        X_test: features passed to regr.predict.
        y_test: true target values (list, array or pandas Series) in the same
            row order as X_test.

    Returns:
        [mae, mae_stddev]: the mean absolute error and the standard deviation
        of the individual absolute errors, both as floats.

    Raises:
        ValueError: if y_test is empty or its size differs from the number of
            predictions.
    """
    # Use the regr object to make predictions on the X_test data
    y_pred = np.asarray(regr.predict(X_test), dtype=float)

    # Convert the targets to an array so they are matched to the predictions by
    # position; indexing a pandas Series with y_test[i] would look up the label
    # i, which fails or misaligns when the index is not 0..n-1.
    y_true = np.asarray(y_test, dtype=float)

    if y_true.size == 0:
        raise ValueError("y_test is empty")
    if y_true.shape != y_pred.shape:
        if y_true.size != y_pred.size:
            raise ValueError("y_test and the predictions have different sizes")
        y_true = y_true.reshape(y_pred.shape)

    # Calculate the absolute errors for each prediction
    errors = np.abs(y_pred - y_true)

    # Mean and (population) standard deviation of the absolute errors
    mae = float(errors.mean())
    mae_stddev = float(errors.std())

    return [mae, mae_stddev]




def get_predictions(regr, df):
    """Predict a value for every row of df and label it with the player name.

    The names are read from the "1_Name" column before the frame is passed
    through reformat_df, because reformat_df overwrites name strings. The
    result is a dataframe with the columns "Name" and "Prediction"; df itself
    is left unchanged.
    """
    working = df.copy()

    # Keep the names aside; the reformatted frame no longer contains them
    player_names = working["1_Name"].values

    features = reformat_df(working)

    return pd.DataFrame({"Name": player_names, "Prediction": regr.predict(features)})


In [None]:
#Each training row combines a player's three consecutive seasons (prefixed 1_, 2_, 3_)
#with the fantasy points of the season that follows them.
#The windows are collected in a list and concatenated once, instead of growing a
#DataFrame with pd.concat on every iteration (which re-copies all earlier rows each time).
season_windows = []
for start in range(len(yearly_player_data) - 3):
    t1, t2, t3, t4 = yearly_player_data[start:start + 4]
    season_windows.append(combine_dataframes(t1, t2, t3, t4))

ml_data = pd.concat(season_windows, ignore_index=True)

#Drop rows with missing values, then the redundant "Name" column contributed by the fourth season
ml_data = ml_data.dropna(axis=0).drop("Name", axis=1)

In [None]:
#Preview the assembled three-season training table; head() keeps the output to a few rows
ml_data.head()

In [None]:
#Split the training table into features and targets, then make the features numeric:
#X is the reformatted feature frame, y the list of Fantasy_Points values
arr = separate_fantasy_points(ml_data)
X, y = reformat_df(arr[0]), arr[1]

In [None]:
#Hold out a quarter of the rows for testing (0.25 is train_test_split's default share);
#random_state=1 keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
#Fit on the training split only, so that X_test/y_test stay unseen until the evaluation
#(fitting on the full X, y would let the model see the rows it is later scored on).
#random_state makes the fitted network reproducible between runs.
regr = MLPRegressor(max_iter=500, random_state=1).fit(X_train, y_train)

In [None]:
#Score the network on the X_test/y_test split; lst is [MAE, std of the absolute errors]
lst = evaluate_regr(regr=regr, X_test=X_test, y_test=y_test)
print(lst)

In [None]:
#Seasons at indices 9, 10 and 11 are the three most recent ones loaded;
#combining them gives the input rows for predicting the season after them
t1, t2, t3 = (yearly_player_data[index] for index in (9, 10, 11))
new_data = combine_dataframes2(t1, t2, t3)


In [None]:
#Rank the players from the highest to the lowest predicted value
final_df = (
    get_predictions(regr, new_data)
    .sort_values(by="Prediction", ascending=False)
)


In [None]:
#Raise pandas' row limit to 862 so the ranking is shown without truncation
pd.set_option("display.max_rows", 862)
display(final_df)

BELOW THIS IS THE ONE-YEAR DATA: each training row uses a single season of stats to predict the following season's fantasy points.

In [4]:
#Pair every season with the one that follows it: the earlier season is passed to
#mx.merge_dataframes as a one-element list and the later season as its second argument.
#The pairs are gathered in a list and concatenated once, rather than growing a
#DataFrame with pd.concat on every iteration.
season_pairs = []
for earlier_season, later_season in zip(yearly_player_data[:-1], yearly_player_data[1:]):
    season_pairs.append(mx.merge_dataframes([earlier_season], later_season))

ml_data_one_year = pd.concat(season_pairs, ignore_index=True)

ml_data_one_year

Unnamed: 0,Name,Team,Pos,Games,Goals,Assists,Pts,+/-,PIM,SOG,...,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,Predicted_Fantasy_Points
0,Corey Perry,TB,RW,82,50,48,98,9,104,290,...,157.84,2297.0,3382.0,0.0,0.0,0.0,0.0,0.0,0.0,364.40
1,Steven Stamkos,TB,C,82,45,46,91,3,74,272,...,141.94,2810.0,3186.0,0.0,0.0,0.0,0.0,0.0,0.0,600.40
2,Jarome Iginla,FA,RW,82,43,43,86,0,40,289,...,139.34,2911.0,3132.0,0.0,0.0,0.0,0.0,0.0,0.0,409.20
3,Daniel Sedin,FA,LW,82,41,63,104,30,32,266,...,161.83,2983.0,3394.0,0.0,0.0,0.0,0.0,0.0,0.0,363.30
4,Ryan Kesler,FA,C,82,41,32,73,24,66,260,...,143.17,2855.0,3038.0,0.0,0.0,0.0,0.0,0.0,0.0,442.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7924,Cam York,PHI,D,3,0,0,0,-1,0,1,...,5.29,124.0,103.0,0.0,0.0,0.0,0.0,0.0,0.0,63.50
7925,Alex Newhook,COL,C,6,0,3,3,5,2,8,...,11.37,300.0,217.0,0.0,0.0,0.0,0.0,0.0,0.0,199.10
7926,Peyton Krebs,BUF,C,4,0,1,1,1,0,1,...,8.85,213.0,177.0,0.0,0.0,0.0,0.0,0.0,0.0,72.45
7927,Ville Heinola,WPG,D,5,0,0,0,-2,2,9,...,10.71,215.0,201.0,0.0,0.0,0.0,0.0,0.0,0.0,19.25


In [5]:
#Split the one-year table with the my_module helpers: X is the reformatted first
#element of the returned pair, y is its second element.
#Note that this rebinds X and y, replacing the three-season versions.
arr = mx.separate_fantasy_points(ml_data_one_year)
X, y = mx.reformat_df(arr[0]), arr[1]


In [13]:
#Fit the one-year model on every available row (no rows are held out in this section).
#random_state makes the fitted network, and therefore the ranking below, reproducible.
#Note that this rebinds regr, replacing the three-season model.
regr = MLPRegressor(max_iter=500, random_state=1).fit(X, y)

In [14]:
def get_predictions2(regr, df):
    """Predict a value for every row of a single-season dataframe.

    Works like get_predictions, except that the player names are read from
    the "Name" column and the frame is reformatted with mx.reformat_df.
    Returns a dataframe with the columns "Name" and "Prediction"; df itself
    is left unchanged.
    """
    working = df.copy()

    # Keep the "Name" values aside before the frame is reformatted
    player_names = working["Name"].values

    features = mx.reformat_df(working)

    return pd.DataFrame({"Name": player_names, "Prediction": regr.predict(features)})


In [15]:
#Predict from the most recent season (index 11) without its own Fantasy_Points column,
#then show the players ranked from the highest to the lowest prediction
new_data_2 = yearly_player_data[11].drop(columns=["Fantasy_Points"])
final_df = get_predictions2(regr, new_data_2).sort_values(by="Prediction", ascending=False)
pd.set_option("display.max_rows", 978)
display(final_df)

Unnamed: 0,Name,Prediction
100,J.T. Miller,441.091717
571,Timo Meier,422.270036
772,Tomas Hertl,421.698868
65,Auston Matthews,415.806901
26,Connor McDavid,414.452372
4,Cale Makar,413.365307
727,Brady Tkachuk,413.318824
86,Leon Draisaitl,413.009406
784,Moritz Seider,405.525584
92,Joel Eriksson Ek,399.022359


In [17]:
#Scratch cell: DataFrame.get(1) returns the column labelled 1, or None when there is no such column.
#Iterating over the list directly avoids any positional indexing into yearly_player_data.
lalala = [season_df.get(1) for season_df in yearly_player_data]

IndexError: list index out of range