# Data Collection 

To build this optimisation model, historical NHL player statistics have been collected and colated into various csv files by moneypuck.com. This project uses the following data:

Data Source: https://www.moneypuck.com/data.htm
Last updated at 2024-06-25 05:31 ET

- Skaters (2019-2020 Season -> 2023-2024 Season)
- Lines/Pairings (2019-2020 Season -> 2023-2024 Season)
- Team Data (2019-2020 Season -> 2023-2024 Season)
- Player Data (All Players from 2007 -> now, includes birthday, height, nationality, number, position and arm)


** Future endevour to automate the data using NHL's API's 




In [156]:
# Import Required Libraries 

import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

In [157]:
# Read in files from /rawData 

# #skaters -  data consists of game data aggregated over the season for each player 
# in each game situation (eg. 5on5, 4on5, PP, etc. )

skaters19_20 = pd.read_csv("rawData/skaters/skaters19-20.csv")
skaters20_21 = pd.read_csv("rawData/skaters/skaters20-21.csv")
skaters21_22 = pd.read_csv("rawData/skaters/skaters21-22.csv")
skaters22_23 = pd.read_csv("rawData/skaters/skaters22-23.csv")
skaters23_24 = pd.read_csv("rawData/skaters/skaters23-24.csv")

#lines  - data consists of all line/dpairings that have happened throughout a season across all teams

lines19_20 = pd.read_csv("rawData/lines/lines19-20.csv")
lines20_21 = pd.read_csv("rawData/lines/lines20-21.csv")
lines21_22 = pd.read_csv("rawData/lines/lines21-22.csv")
lines22_23 = pd.read_csv("rawData/lines/lines22-23.csv")
lines23_24 = pd.read_csv("rawData/lines/lines23-24.csv")

#teams - team level information for each game situation across the season 

teams19_20 = pd.read_csv("rawData/teams/teams19-20.csv")
teams20_21 = pd.read_csv("rawData/teams/teams20-21.csv")
teams21_22 = pd.read_csv("rawData/teams/teams21-22.csv")
teams22_23 = pd.read_csv("rawData/teams/teams22-23.csv")
teams23_24 = pd.read_csv("rawData/teams/teams23-24.csv")

# Player Information including DoB, Position, Handedness

allPlayers = pd.read_csv("rawData/allPlayers.csv")


# Data Cleaning
### SKATERS 

In [158]:
## Clean Column Names to ensure concatination occurs correctly

def clean_cols(data):

    data.columns = data.columns.str.replace(' ', '')

In [159]:
# Merge all 5 seasons together into one lovely dataset! 

allYears_skaters = [skaters19_20, skaters20_21, skaters21_22, skaters22_23, skaters23_24]

for df in allYears_skaters:
    clean_cols(df)
    
skaters = pd.concat(allYears_skaters, ignore_index=True)
skaters


Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
0,8475169,2019,Evander Kane,S.J,L,other,64,3559.0,55.0,41.11,...,11.88,7.62,77.0,105.0,0.00,0.07,0.0,2.0,0.0,2.0
1,8475169,2019,Evander Kane,S.J,L,all,64,74903.0,1518.0,46.28,...,110.25,125.20,2254.0,2440.0,0.00,0.00,0.0,0.0,0.0,0.0
2,8475169,2019,Evander Kane,S.J,L,5on5,64,56312.0,1195.0,46.28,...,88.72,88.15,1981.0,1904.0,2.03,1.24,40.0,37.0,32.0,31.0
3,8475169,2019,Evander Kane,S.J,L,4on5,64,5124.0,137.0,40.77,...,3.23,23.36,53.0,358.0,0.00,0.04,0.0,1.0,0.0,1.0
4,8475169,2019,Evander Kane,S.J,L,5on4,64,9908.0,131.0,45.09,...,5.67,1.21,131.0,21.0,0.00,0.34,0.0,4.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23365,8477488,2023,Brett Pesce,CAR,D,other,70,2731.0,48.0,30.97,...,12.35,11.97,125.0,84.0,0.04,0.00,1.0,0.0,1.0,0.0
23366,8477488,2023,Brett Pesce,CAR,D,all,70,85212.0,1725.0,41.27,...,172.40,122.21,3431.0,2154.0,0.00,0.00,0.0,0.0,0.0,0.0
23367,8477488,2023,Brett Pesce,CAR,D,5on5,70,72590.0,1415.0,41.27,...,107.20,80.61,2625.0,1751.0,4.63,0.35,115.0,17.0,89.0,12.0
23368,8477488,2023,Brett Pesce,CAR,D,4on5,70,9349.0,217.0,40.17,...,3.63,19.19,63.0,222.0,0.45,0.05,7.0,1.0,6.0,1.0


In [160]:

# Clean up Team Names so that they are uniform across all data sets  

def clean_teamName(df):
    # Replace all instances of "S.J" with "SJS" in column: 'team'
    df['team'] = df['team'].str.replace("S.J", "SJS", case=False, regex=False)
    # Replace all instances of "T.B" with "TBL" in column: 'team'
    df['team'] = df['team'].str.replace("T.B", "TBL", case=False, regex=False)
    # Replace all instances of "L.A" with "LAK" in column: 'team'
    df['team'] = df['team'].str.replace("L.A", "LAK", case=False, regex=False)
    # Replace all instances of "N.J" with "NJD" in column: 'team'
    df['team'] = df['team'].str.replace("N.J", "NJD", case=False, regex=False)
    return df


In [161]:
# Identify Potential Incorrect/Inconsistent Names in Data Set
 
skaters_names = skaters.copy()
players_names = allPlayers.copy()

skaters_names['name'] = skaters['name'].str.lower()
players_names['name'] = allPlayers['name'].str.lower()

incorrect_names = skaters_names[~skaters_names['name'].isin(players_names['name'])].groupby(['name']).sum()
incorrect_names['playerId']



name
alex barre-boulet         84797180
alex nylander             84794230
alex wennberg            169550100
alexander chmelevski      42400265
alexander kerfoot        169540420
alexei toropchenko        42401405
alexis lafreniere         84821090
benoit-olivier groulx     84808700
christopher tanev        127135350
gerald mayhew             84799330
jacob middleton          127172040
jani hakanpaa            127137375
jesse ylonen             127215870
marian studenic          127203390
matt dumba               211921400
max comtois              212000775
maxime lajoie            127189800
mitchell marner          169569660
nicholas abruzzese        42408600
nicholas merkley          84784470
nick paul                 84774260
samuel walker             42401335
thomas novak              42392190
tim stutzle               84821160
william borgen            42394200
zach sanford             169549640
Name: playerId, dtype: int64

In [162]:

def fix_names(df):
    '''Fix player names in data frames '''

    df['name'] = df['name'].str.replace("Alex Nylander", "Alexander Nylander", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexander Chmelevski", "Sasha Chmelevski", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alex Kerfoot", "Alexander Kerfoot", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexis Lafrenire", "Alexis Lafreniere", case=False, regex=False)
    df['name'] = df['name'].str.replace("Tim Sttzle", "Tim Stutzle", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alex Barr-Boulet", "Alex Barre-Boulet", case=False, regex=False)
    df['name'] = df['name'].str.replace("Bo Groulx", "Benoit-Olivier Groulx", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexander Wennberg", "Alex Wennberg", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexei Toropchenko", "Alexey Toropchenko", case=False, regex=False)
    df['name'] = df['name'].str.replace("Christopher Tanev", "Chris Tanev", case=False, regex=False)
    df['name'] = df['name'].str.replace("Gerald Mayhew", "Gerry Mayhew", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jacob Middleton", "Jake Middleton", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jani Hakanp", "Jani Hakanpaa", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jani Hakanpaaaa", "Jani Hakanpaa", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jesse Ylnen", "Jesse Ylonen", case=False, regex=False)
    df['name'] = df['name'].str.replace("Marin Studenic", "Marian Studenic", case=False, regex=False)
    df['name'] = df['name'].str.replace("Maxime Lajoie", "Max Lajoie", case=False, regex=False)
    df['name'] = df['name'].str.replace("Mitchell Marner", "Mitch Marner", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Abruzzese", "Nick Abruzzese", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Merkley", "Nick Merkley", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Paul", "Nick Paul", case=False, regex=False)
    df['name'] = df['name'].str.replace("Samuel Walker", "Sammy Walker", case=False, regex=False)
    df['name'] = df['name'].str.replace("Thomas Novak", "Tommy Novak", case=False, regex=False)
    df['name'] = df['name'].str.replace("William Borgen", "Will Borgen", case=False, regex=False)
    df['name'] = df['name'].str.replace("Maxime Comtois", "Max Comtois", case=False, regex=False)
    df['name'] = df['name'].str.replace("Mathew Dumba", "Matt Dumba", case=False, regex=False)
    df['name'] = df['name'].str.replace("Zachary Sanford", "Zach Sanford", case=False, regex=False)
    return df



In [163]:
# CLEAN DATASETS FIXING TEAM AND PLAYER NAMES - ENSURE CONSISTENCY

allPlayers = clean_teamName(allPlayers)
allPlayers = fix_names(allPlayers)

skaters = clean_teamName(skaters)
skaters = fix_names(skaters)

In [164]:
#Convert Height from feet to cm's 

missing_heights = allPlayers['height'].isnull().sum()
print(f"Number of missing values in height: {missing_heights}")

allPlayers['height'] = allPlayers.groupby('position')['height'].transform(lambda x: x.fillna(x.mode()[0]))

allPlayers.height.value_counts()

# allPlayers[['name', 'height']].head(50)
# # # Extract feet and inches from height string
# Extract feet (X) from the height string
allPlayers['height_feet'] = allPlayers['height'].str.extract(r"(\d+)'").astype(float)

# Extract inches (Y) from the height string
allPlayers['height_inches'] = allPlayers['height'].str.extract(r'(\d+)"').astype(float)

# Convert to total inches
allPlayers['total_inches'] = allPlayers['height_feet'] * 12 + allPlayers['height_inches']

# Convert to centimeters
allPlayers['height_cm'] = allPlayers['total_inches'] * 2.54

# Drop intermediate columns
allPlayers.drop(columns=['height_feet', 'height_inches', 'total_inches', 'height'], inplace=True)
allPlayers.rename(columns={'height_cm':'height'}, inplace=True)
allPlayers

Number of missing values in height: 178


Unnamed: 0,playerId,name,position,team,birthDate,weight,nationality,shootsCatches,primaryNumber,primaryPosition,height
0,8478421,A.J. Greer,L,CGY,1996-12-14,210.0,CAN,L,24.0,L,190.50
1,8477180,Aaron Dell,G,SJS,1989-05-04,205.0,CAN,L,30.0,G,182.88
2,8465992,Aaron Downey,R,DET,1974-08-27,215.0,CAN,R,44.0,R,185.42
3,8477932,Aaron Ekblad,D,FLA,1996-02-07,220.0,CAN,R,5.0,D,193.04
4,8471451,Aaron Gagnon,C,WPG,1986-04-24,186.0,CAN,R,21.0,C,180.34
...,...,...,...,...,...,...,...,...,...,...,...
3149,8475876,Zane McIntyre,G,BOS,1992-08-20,206.0,USA,L,31.0,G,187.96
3150,8469760,Zbynek Michalek,D,ARI,1982-12-23,210.0,CZE,R,4.0,D,187.96
3151,8465009,Zdeno Chara,D,NYI,1977-03-18,250.0,SVK,L,33.0,D,205.74
3152,8476878,Zemgus Girgensons,L,BUF,1994-01-05,211.0,LVA,L,28.0,C,187.96


In [165]:
# Read all Excel files into separate DataFrames
df1 = pd.read_excel('rawData/bioInfo/bioInfo1.xlsx')
df2 = pd.read_excel('rawData/bioInfo/bioInfo2.xlsx')
df3 = pd.read_excel('rawData/bioInfo/bioInfo3.xlsx')
df4 = pd.read_excel('rawData/bioInfo/bioInfo4.xlsx')
df5 = pd.read_excel('rawData/bioInfo/bioInfo5.xlsx')
df6 = pd.read_excel('rawData/bioInfo/bioInfo6.xlsx')
df7 = pd.read_excel('rawData/bioInfo/bioInfo7.xlsx')
df8 = pd.read_excel('rawData/bioInfo/bioInfo8.xlsx')
df9 = pd.read_excel('rawData/bioInfo/bioInfo9.xlsx')
df10 = pd.read_excel('rawData/bioInfo/bioInfo10.xlsx')
df11 = pd.read_excel('rawData/bioInfo/bioInfo11.xlsx')
df12 = pd.read_excel('rawData/bioInfo/bioInfo12.xlsx')
df13 = pd.read_excel('rawData/bioInfo/bioInfo13.xlsx')
df14 = pd.read_excel('rawData/bioInfo/bioInfo14.xlsx')
df15 = pd.read_excel('rawData/bioInfo/bioInfo15.xlsx')

# Concatenate all DataFrames into one


moreInfo = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15]


for df in moreInfo:
    clean_cols(df)
    
    
moreInfo = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15])
# Display the concatenated DataFrame
moreInfo = moreInfo.drop(columns={'S/C', 'Pos', 'DOB', 'Ctry', 'Ntnlty', 'Ht', 'Wt', 'HOF', 'GP', 'G', 'A', 'P'})
moreInfo = moreInfo.rename(columns={'Player':'name'})
# Extract the first four characters and convert to an integer
moreInfo['1stSeason'] = moreInfo['1stSeason'].astype(str).str[:4].astype(int)

In [166]:
allPlayers = pd.merge(allPlayers, moreInfo, on=['name'], how='left')
allPlayers

Unnamed: 0,playerId,name,position,team,birthDate,weight,nationality,shootsCatches,primaryNumber,primaryPosition,height,BirthCity,S/P,DraftYr,Round,Overall,1stSeason
0,8478421,A.J. Greer,L,CGY,1996-12-14,210.0,CAN,L,24.0,L,190.50,Joliette,QC,2015,2,39,2016.0
1,8477180,Aaron Dell,G,SJS,1989-05-04,205.0,CAN,L,30.0,G,182.88,,,,,,
2,8465992,Aaron Downey,R,DET,1974-08-27,215.0,CAN,R,44.0,R,185.42,,,,,,
3,8477932,Aaron Ekblad,D,FLA,1996-02-07,220.0,CAN,R,5.0,D,193.04,Windsor,ON,2014,1,1,2014.0
4,8471451,Aaron Gagnon,C,WPG,1986-04-24,186.0,CAN,R,21.0,C,180.34,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3151,8475876,Zane McIntyre,G,BOS,1992-08-20,206.0,USA,L,31.0,G,187.96,,,,,,
3152,8469760,Zbynek Michalek,D,ARI,1982-12-23,210.0,CZE,R,4.0,D,187.96,,,,,,
3153,8465009,Zdeno Chara,D,NYI,1977-03-18,250.0,SVK,L,33.0,D,205.74,Trencin,--,1996,3,56,1997.0
3154,8476878,Zemgus Girgensons,L,BUF,1994-01-05,211.0,LVA,L,28.0,C,187.96,Riga,--,2012,1,14,2013.0


In [167]:
# Merge skater stats with their details 

skaters = pd.merge(skaters, allPlayers, on=['playerId'])
skaters = skaters.drop(columns={'team_y', 'position_x', 'position_y', 'name_y'}).rename(columns={'team_x': 'team', 'primaryPosition': 'position', 'name_x': 'name'})
skaters.head(8)

Unnamed: 0,playerId,season,name,team,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,...,shootsCatches,primaryNumber,position,height,BirthCity,S/P,DraftYr,Round,Overall,1stSeason
0,8475169,2019,Evander Kane,SJS,other,64,3559.0,55.0,41.11,0.5,...,L,9.0,L,187.96,Vancouver,BC,2009,1,4,2009.0
1,8475169,2019,Evander Kane,SJS,all,64,74903.0,1518.0,46.28,0.55,...,L,9.0,L,187.96,Vancouver,BC,2009,1,4,2009.0
2,8475169,2019,Evander Kane,SJS,5on5,64,56312.0,1195.0,46.28,0.48,...,L,9.0,L,187.96,Vancouver,BC,2009,1,4,2009.0
3,8475169,2019,Evander Kane,SJS,4on5,64,5124.0,137.0,40.77,0.17,...,L,9.0,L,187.96,Vancouver,BC,2009,1,4,2009.0
4,8475169,2019,Evander Kane,SJS,5on4,64,9908.0,131.0,45.09,0.88,...,L,9.0,L,187.96,Vancouver,BC,2009,1,4,2009.0
5,8480950,2019,Ilya Lyubushkin,ARI,other,51,327.0,7.0,4.58,0.24,...,R,46.0,D,187.96,Moscow,--,--,--,--,2018.0
6,8480950,2019,Ilya Lyubushkin,ARI,all,51,43351.0,989.0,7.43,0.57,...,R,46.0,D,187.96,Moscow,--,--,--,--,2018.0
7,8480950,2019,Ilya Lyubushkin,ARI,5on5,51,42523.0,958.0,7.43,0.58,...,R,46.0,D,187.96,Moscow,--,--,--,--,2018.0


In [168]:
# Extract only the stats for 5on5

fullStrength = skaters[skaters['situation'] == '5on5']

fullStrength = fullStrength.sort_values(['team', 'name'])
fullStrength



Unnamed: 0,playerId,season,name,team,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,...,shootsCatches,primaryNumber,position,height,BirthCity,S/P,DraftYr,Round,Overall,1stSeason
2327,8474641,2019,Adam Henrique,ANA,5on5,71,55793.0,1275.0,48.82,0.48,...,L,14.0,C,182.88,Brantford,ON,2008,3,82,2010.0
6017,8474641,2020,Adam Henrique,ANA,5on5,45,34324.0,777.0,22.62,0.46,...,L,14.0,C,182.88,Brantford,ON,2008,3,82,2010.0
10772,8474641,2021,Adam Henrique,ANA,5on5,58,50470.0,1101.0,43.53,0.54,...,L,14.0,C,182.88,Brantford,ON,2008,3,82,2010.0
14072,8474641,2022,Adam Henrique,ANA,5on5,62,47265.0,1069.0,32.10,0.46,...,L,14.0,C,182.88,Brantford,ON,2008,3,82,2010.0
19877,8473986,2023,Alex Killorn,ANA,5on5,63,54790.0,1071.0,35.97,0.49,...,L,17.0,L,185.42,Halifax,NS,2007,3,77,2012.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19657,8477845,2023,Trevor van Riemsdyk,WSH,5on5,70,69882.0,1400.0,11.57,0.51,...,R,57.0,D,187.96,Middletown,NJ,--,--,--,2014.0
3287,8477343,2019,Tyler Lewington,WSH,5on5,6,3536.0,78.0,-0.30,0.44,...,R,78.0,D,187.96,Edmonton,AB,2013,7,204,2018.0
17627,8482861,2022,Vincent Iorio,WSH,5on5,3,2443.0,67.0,0.27,0.28,...,R,6.0,D,193.04,Coquitlam,BC,2021,2,55,2022.0
22072,8482861,2023,Vincent Iorio,WSH,5on5,6,3537.0,83.0,0.35,0.48,...,R,6.0,D,193.04,Coquitlam,BC,2021,2,55,2022.0


### LINES

In [169]:
# Create dataset for all lines/dpairings utilised during past 5 seasons 

lines = pd.concat([lines19_20, lines20_21, lines21_22, lines22_23, lines23_24], ignore_index=True)

clean_cols(lines)
clean_teamName(lines)
lines = lines.sort_values('team')
lines


Unnamed: 0,lineId,season,name,team,position,situation,games_played,icetime,iceTimeRank,xGoalsPercentage,...,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst
1934,84735608478425,2019,Guhle-Holzer,ANA,pairing,5on5,18,8673.0,79.0,0.40,...,133.11,92.0,91.68,14.0,0.89,0.93,0.93,4.48,4.52,4.45
11505,84809508482803,2023,Lyubushkin-Zellweger,ANA,pairing,5on5,1,638.0,3.0,0.12,...,13.96,11.0,10.62,0.0,0.11,0.00,0.00,1.03,1.02,1.01
11507,84828038483490,2023,Mintyukov-Zellweger,ANA,pairing,5on5,13,876.0,102.0,0.76,...,9.53,7.0,7.34,0.0,0.06,0.00,0.00,0.34,0.34,0.34
4947,84811228482142,2021,Benoit-Drysdale,ANA,pairing,5on5,24,863.0,189.0,0.32,...,31.29,20.0,20.95,1.0,0.19,0.48,0.48,1.18,1.24,1.22
4949,84768548479372,2021,Lindholm-Mahura,ANA,pairing,5on5,10,660.0,62.0,0.88,...,2.15,2.0,2.12,0.0,0.01,0.00,0.00,0.06,0.06,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,847418984768808477944,2019,Vrana-Eller-Wilson,WSH,line,5on5,34,1802.0,339.0,0.73,...,16.45,13.0,12.67,4.0,0.13,0.16,0.16,0.68,0.64,0.64
4709,847169884741898477839,2020,Sheary-Eller-Oshie,WSH,line,5on5,7,1151.0,74.0,0.52,...,22.87,19.0,18.71,1.0,0.18,0.14,0.14,1.02,0.98,0.94
876,84754628480796,2019,Fehervary-Gudas,WSH,pairing,5on5,4,2377.0,12.0,0.55,...,28.05,23.0,23.23,5.0,0.21,0.00,0.00,1.45,1.46,1.43
3733,847418984772908477839,2020,Raffl-Eller-Sheary,WSH,line,5on5,8,4256.0,28.0,0.63,...,44.50,34.0,32.88,5.0,0.36,0.52,0.52,1.99,1.94,1.91


### TEAMS

In [170]:
correct_headers = teams19_20.columns

# Move the current headers (first row of df1) to be the first row of the data

teams22_23.loc[-1] = teams22_23.columns  # Add the headers as the first row
teams22_23.index = teams22_23.index + 1  # Shift index
teams22_23 = teams22_23.sort_index()
teams22_23.columns = correct_headers


allTeams = [teams19_20, teams20_21, teams21_22, teams22_23, teams23_24]

for team in allTeams:
    clean_cols(team)

teams = pd.concat(allTeams, ignore_index=True)

clean_cols(teams)
clean_teamName(teams)
teams = teams.sort_values('team')
teams

Unnamed: 0,team,season,name,team.1,position,situation,games_played,xGoalsPercentage,corsiPercentage,fenwickPercentage,...,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst
169,ANA,2020,ANA,ANA,Team Level,5on4,56,0.86,0.87,0.85,...,47.0,44,44.0,22,0.38,0.44,0.44,2.87,2.87,2.86
414,ANA,2021,ANA,ANA,Team Level,5on4,82,0.87,0.86,0.82,...,93.0,84.0,84.0,36.0,0.69,0.0,0.0,5.63,5.63,5.59
719,ANA,2023,ANA,ANA,Team Level,5on4,82,0.86,0.88,0.86,...,77.0,64.0,64.0,22.0,0.59,0.66,0.66,5.69,5.69,5.67
70,ANA,2019,ANA,ANA,Team Level,other,71,0.5,0.45,0.46,...,235.0,182.0,182.0,7.0,2.67,1.77,1.77,22.65,22.65,22.27
71,ANA,2019,ANA,ANA,Team Level,all,71,0.47,0.49,0.49,...,4117.05,3158.0,3170.3,417.0,33.44,34.92,35.0,195.42,196.28,192.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,WSH,2020,WSH,WSH,Team Level,4on5,56,0.15,0.13,0.15,...,379.0,287,287.0,2,5.06,4.61,4.61,24.68,24.68,23.65
184,WSH,2020,WSH,WSH,Team Level,5on4,56,0.89,0.88,0.87,...,52.0,46,46.0,20,0.38,0.0,0.0,3.04,3.04,3.03
439,WSH,2021,WSH,WSH,Team Level,5on4,82,0.89,0.84,0.82,...,117.0,104.0,104.0,34.0,0.84,0.66,0.62,5.92,5.92,5.9
437,WSH,2021,WSH,WSH,Team Level,5on5,82,0.5,0.51,0.51,...,3589.82,2655.0,2656.89,362.0,26.61,31.68,32.07,150.5,150.13,147.88


# Data Processing

In [171]:
# Split lines dataset into forward lines and defensive pairings

forwardLines = lines[lines['position'] == 'line']
dpairings = lines[lines['position'] == 'pairing']


In [172]:
# Divide the forwards and defencemen into 2 different dataframes. We don't want D-men when we are predicting our lines. 

forwards55 = fullStrength[fullStrength['position'].isin(['L', 'C', 'R'])]
defense55 = fullStrength[fullStrength['position'] == 'D']



In [173]:
# Export Dataframes to CSV files 

teams.to_csv('cleanData/teamLevel.csv')
forwardLines.to_csv('cleanData/forwardLines.csv')
dpairings.to_csv('cleanData/defensivePairings.csv')
skaters.to_csv('cleanData/allSkaters.csv')
fullStrength.to_csv('cleanData/fiveOnFive.csv')
forwards55.to_csv('cleanData/forwards5on5.csv')
defense55.to_csv('cleanData/defence5on5.csv')
