FIRST DRAFT: I won't think much of the markdown until I'm done.

This is my Fantasy Hockey Analyzer. The purpose of this project is to predict the fantasy hockey output of individual skaters based on stats from previous years.

In [1]:
#Import block
import os
import numpy as np
import pandas as pd

In [2]:
#The current working directory is the main repository directory; these lines set the path to where the data is
path = os.getcwd()
data_path = path + '\\data'

#This array makes it easier to format the rotowire data
rw_labels = ["name", "Team", "Pos", "Games", "Goals", "Assists", "Pts", "+/-", "PIM", "SOG", "GWG", "PP_Goals", "PP_Assists", "SH_Goals", "SH_Assists", "Hits", "Blocked_Shots"]

#This is the breakdown of how many fantasy points a player gets for each category
points_dict = {"Goals":5, "Assists":3, "+/-":1.5, "PIM":-0.25, "PP_Goals":4, "PP_Assists":2, "SH_Goals":6, "SH_Assists":4, "Faceoffs_Won":0.25, "Faceoffs_Lost":-0.15, "Hits":0.5, "Blocked_Shots":0.75 }


In [3]:
#I have data from the 2010-2011 season through the 2021-2022 season.
#By the end of this block, there will be 12 seasons-worth of data in the "data" variable
yearly_player_data = []

for i in range(2010, 2022):
    new_data = []
    
    #Imports the rotowire and moneypuck datasets from the selected year into rdf and mdf
    rdf = pd.read_csv(data_path + '\\rotowire_data\\rotowire{}.csv'.format(str(i)))
    mdf = pd.read_csv(data_path + '\\moneypuck_data\\moneypuck{}.csv'.format(str(i)))
    
    #Formats the rotowire data
    rdf.set_axis(rw_labels, axis=1, inplace=True)
    rdf.drop(index=rdf.index[0], axis=0, inplace=True)
    
    #The Moneypuck data has information about 5-on-5, 5-on-4, 4-on-5, other, and all.
    #For this project I'm just focused on "all" since I suspect it'll give me the best results.
    mdf = mdf[mdf["situation"] == "all"]
    
    #Merges the rotowire and moneypuck dataframes
    new_data = pd.merge(rdf, mdf, on="name")
    
    #Changes the name of a few columns in the new dataframe
    new_data = new_data.rename(columns={"name":"Name","faceoffsWon":"Faceoffs_Won","faceoffsLost":"Faceoffs_Lost"})
    
    #This section calculates each player's total fantasy output for that year
    cols = new_data.columns
    fant_points = [0 for i in range(len(new_data))]
    for i in range(len(new_data)):
        for j in range(len(new_data.iloc[i])):
            mult = points_dict.get(cols[j], 0)
            if mult != 0:
                fant_points[i] += mult*int(new_data.iloc[i, j])
    
    #Adds the players' fantasy points to the new_data dataframe
    new_data["Fantasy_Points"] = fant_points
    
    #Adds new_data to the "data" array
    yearly_player_data.append(new_data)
                
    



At this point I have player data for each individual year. Now, I want to turn this data into data that I can put into a machine learning model. 

At this moment, I will only use data for players that have been in the league for 3 years, such that the model will be predicting fantasy output for their fourth year.

In [4]:
yearly_player_data[4]

Unnamed: 0,Name,Team,Pos,Games,Goals,Assists,Pts,+/-,PIM,SOG,...,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,Fantasy_Points
0,Alex Ovechkin,WAS,LW,81,53,28,81,10,58,395,...,151.32,2611.0,3283.0,0.0,0.0,0.0,0.0,0.0,0.0,621.05
1,Steven Stamkos,TB,C,82,43,29,72,2,49,268,...,156.86,2840.0,3092.0,0.0,0.0,0.0,0.0,0.0,0.0,486.65
2,Rick Nash,FA,LW,79,42,27,69,29,36,304,...,145.13,2977.0,3238.0,0.0,0.0,0.0,0.0,0.0,0.0,436.95
3,John Tavares,TOR,C,82,38,48,86,5,46,278,...,145.01,3020.0,3196.0,0.0,0.0,0.0,0.0,0.0,0.0,558.65
4,Tyler Seguin,DAL,C,71,37,40,77,-1,20,280,...,141.93,2656.0,2990.0,0.0,0.0,0.0,0.0,0.0,0.0,466.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,Dana Tyrell,FA,RW,3,0,0,0,-1,0,1,...,8.50,176.0,136.0,0.0,0.0,0.0,0.0,0.0,0.0,3.25
860,Tyson Strachan,FA,D,46,0,5,5,-30,44,38,...,97.21,1401.0,2056.0,0.0,0.0,0.0,0.0,0.0,0.0,75.25
861,Dylan Reese,FA,D,1,0,0,0,-1,0,3,...,2.46,42.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00
862,Eric Tangradi,FA,LW,7,0,0,0,-3,17,5,...,15.68,376.0,331.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50


In [6]:
def combine_dataframes(df1, df2, df3, df4):
    df1 = df1.add_prefix('1_')
    df2 = df1.add_prefix('2_')
    df3 = df1.add_prefix('3_')
    temp = df4[["Fantasy_Points"]]
    return pd.concat([df1, df2, df3, temp], axis=1)

In [25]:
ml_data = pd.DataFrame()
for i in range(2013, 2022):
    t1 = yearly_player_data[2013-i]
    t2 = yearly_player_data[2012-i]
    t3 = yearly_player_data[2011-i]
    t4 = yearly_player_data[2010-i]
    #print(combine_dataframes(t1,t2,t3,t4))
    ml_data = pd.concat([ml_data, combine_dataframes(t1,t2,t3,t4)], ignore_index=True)
    

In [29]:
print(ml_data.dropna(axis=0))

              1_Name 1_Team 1_Pos 1_Games 1_Goals 1_Assists 1_Pts 1_+/- 1_PIM  \
0        Corey Perry     TB    RW      82      50        48    98     9   104   
1     Steven Stamkos     TB     C      82      45        46    91     3    74   
2      Jarome Iginla     FA    RW      82      43        43    86     0    40   
3       Daniel Sedin     FA    LW      82      41        63   104    30    32   
4        Ryan Kesler     FA     C      82      41        32    73    24    66   
...              ...    ...   ...     ...     ...       ...   ...   ...   ...   
7962     Dana Tyrell     FA    RW       3       0         0     0    -1     0   
7963  Tyson Strachan     FA     D      46       0         5     5   -30    44   
7964     Dylan Reese     FA     D       1       0         0     0    -1     0   
7965   Eric Tangradi     FA    LW       7       0         0     0    -3    17   
7966    Corey Potter     FA     D       6       0         0     0    -1     0   

     1_SOG  ... 3_1_OffIce_