# Data Collection 

To build this optimisation model, historical NHL player statistics have been collected and colated into various csv files by moneypuck.com. This project uses the following data:

Data Source: https://www.moneypuck.com/data.htm
Last updated at 2024-06-25 05:31 ET

- Skaters (2019-2020 Season -> 2023-2024 Season)
- Lines/Pairings (2019-2020 Season -> 2023-2024 Season)
- Team Data (2019-2020 Season -> 2023-2024 Season)
- Player Data (All Players from 2007 -> now, includes birthday, height, nationality, number, position and arm)


** Future endevour to automate the data using NHL's API's 




In [602]:
# Import Required Libraries 

import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

In [603]:
# Read in files from /rawData 

# #skaters -  data consists of game data aggregated over the season for each player 
# in each game situation (eg. 5on5, 4on5, PP, etc. )

skaters19_20 = pd.read_csv("rawData/skaters/skaters19-20.csv")
skaters20_21 = pd.read_csv("rawData/skaters/skaters20-21.csv")
skaters21_22 = pd.read_csv("rawData/skaters/skaters21-22.csv")
skaters22_23 = pd.read_csv("rawData/skaters/skaters22-23.csv")
skaters23_24 = pd.read_csv("rawData/skaters/skaters23-24.csv")

#lines  - data consists

lines19_20 = pd.read_csv("rawData/lines/lines19-20.csv")
lines20_21 = pd.read_csv("rawData/lines/lines20-21.csv")
lines21_22 = pd.read_csv("rawData/lines/lines21-22.csv")
lines22_23 = pd.read_csv("rawData/lines/lines22-23.csv")
lines23_24 = pd.read_csv("rawData/lines/lines23-24.csv")

#teams 

teams19_20 = pd.read_csv("rawData/teams/teams19-20.csv")
teams20_21 = pd.read_csv("rawData/teams/teams20-21.csv")
teams21_22 = pd.read_csv("rawData/teams/teams21-22.csv")
teams22_23 = pd.read_csv("rawData/teams/teams22-23.csv")
teams23_24 = pd.read_csv("rawData/teams/teams23-24.csv")

# Player Information (DoB, Position, Handedness)

allPlayers = pd.read_csv("rawData/allPlayers.csv")


# Data Cleaning
### SKATERS 

In [604]:
## Clean Column Names 

def clean_cols(data):

    data.columns = data.columns.str.replace(' ', '')

In [605]:
# Merge all 5 seasons together into one lovely dataset! 

allYears_skaters = [skaters19_20, skaters20_21, skaters21_22, skaters22_23, skaters23_24]

for df in allYears_skaters:
    clean_cols(df)
    
skaters = pd.concat(allYears_skaters, ignore_index=True)
skaters


Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
0,8475169,2019,Evander Kane,S.J,L,other,64,3559.0,55.0,41.11,...,11.88,7.62,77.0,105.0,0.00,0.07,0.0,2.0,0.0,2.0
1,8475169,2019,Evander Kane,S.J,L,all,64,74903.0,1518.0,46.28,...,110.25,125.20,2254.0,2440.0,0.00,0.00,0.0,0.0,0.0,0.0
2,8475169,2019,Evander Kane,S.J,L,5on5,64,56312.0,1195.0,46.28,...,88.72,88.15,1981.0,1904.0,2.03,1.24,40.0,37.0,32.0,31.0
3,8475169,2019,Evander Kane,S.J,L,4on5,64,5124.0,137.0,40.77,...,3.23,23.36,53.0,358.0,0.00,0.04,0.0,1.0,0.0,1.0
4,8475169,2019,Evander Kane,S.J,L,5on4,64,9908.0,131.0,45.09,...,5.67,1.21,131.0,21.0,0.00,0.34,0.0,4.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23365,8477488,2023,Brett Pesce,CAR,D,other,70,2731.0,48.0,30.97,...,12.35,11.97,125.0,84.0,0.04,0.00,1.0,0.0,1.0,0.0
23366,8477488,2023,Brett Pesce,CAR,D,all,70,85212.0,1725.0,41.27,...,172.40,122.21,3431.0,2154.0,0.00,0.00,0.0,0.0,0.0,0.0
23367,8477488,2023,Brett Pesce,CAR,D,5on5,70,72590.0,1415.0,41.27,...,107.20,80.61,2625.0,1751.0,4.63,0.35,115.0,17.0,89.0,12.0
23368,8477488,2023,Brett Pesce,CAR,D,4on5,70,9349.0,217.0,40.17,...,3.63,19.19,63.0,222.0,0.45,0.05,7.0,1.0,6.0,1.0


In [606]:
# Identify Potential Incorrect Names in Data Set 
skaters_names = skaters.copy()
players_names = allPlayers.copy()

skaters_names['name'] = skaters['name'].str.lower()
players_names['name'] = allPlayers['name'].str.lower()

incorrect_names = skaters[~skaters['name'].isin(allPlayers['name'])].groupby(['name']).sum()
incorrect_names['playerId']



name
Alex Barre-Boulet         84797180
Alex Nylander             84794230
Alex Wennberg            169550100
Alexander Chmelevski      42400265
Alexander Kerfoot        169540420
Alexei Toropchenko        42401405
Alexis Lafreniere         84821090
Benoit-Olivier Groulx     84808700
Christopher Tanev        127135350
Gerald Mayhew             84799330
Jacob Middleton          127172040
Jani Hakanpaa            127137375
Jesse Ylonen             127215870
Marian Studenic          127203390
Matt Dumba               211921400
Max Comtois              212000775
Maxime Lajoie            127189800
Mitchell Marner          169569660
Nicholas Abruzzese        42408600
Nicholas Merkley          84784470
Nick Paul                 84774260
Samuel Walker             42401335
Thomas Novak              42392190
Tim Stutzle               84821160
William Borgen            42394200
Zach Sanford             169549640
Name: playerId, dtype: int64

In [607]:

# Clean up Team Names so that they are all uniform 

def clean_teamName(df):
    # Replace all instances of "S.J" with "SJS" in column: 'team'
    df['team'] = df['team'].str.replace("S.J", "SJS", case=False, regex=False)
    # Replace all instances of "T.B" with "TBL" in column: 'team'
    df['team'] = df['team'].str.replace("T.B", "TBL", case=False, regex=False)
    # Replace all instances of "L.A" with "LAK" in column: 'team'
    df['team'] = df['team'].str.replace("L.A", "LAK", case=False, regex=False)
    # Replace all instances of "N.J" with "NJD" in column: 'team'
    df['team'] = df['team'].str.replace("N.J", "NJD", case=False, regex=False)
    return df


In [608]:

def fix_names(df):
    '''Fix player names in data frames '''

    df['name'] = df['name'].str.replace("Alex Nylander", "Alexander Nylander", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexander Chmelevski", "Sasha Chmelevski", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alex Kerfoot", "Alexander Kerfoot", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexis Lafrenire", "Alexis Lafreniere", case=False, regex=False)
    df['name'] = df['name'].str.replace("Tim Sttzle", "Tim Stutzle", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alex Barr-Boulet", "Alex Barre-Boulet", case=False, regex=False)
    df['name'] = df['name'].str.replace("Bo Groulx", "Benoit-Olivier Groulx", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexander Wennberg", "Alex Wennberg", case=False, regex=False)
    df['name'] = df['name'].str.replace("Alexei Toropchenko", "Alexey Toropchenko", case=False, regex=False)
    df['name'] = df['name'].str.replace("Christopher Tanev", "Chris Tanev", case=False, regex=False)
    df['name'] = df['name'].str.replace("Gerald Mayhew", "Gerry Mayhew", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jacob Middleton", "Jake Middleton", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jani Hakanp", "Jani Hakanpaa", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jani Hakanpaaaa", "Jani Hakanpaa", case=False, regex=False)
    df['name'] = df['name'].str.replace("Jesse Ylnen", "Jesse Ylonen", case=False, regex=False)
    df['name'] = df['name'].str.replace("Marin Studenic", "Marian Studenic", case=False, regex=False)
    df['name'] = df['name'].str.replace("Maxime Lajoie", "Max Lajoie", case=False, regex=False)
    df['name'] = df['name'].str.replace("Mitchell Marner", "Mitch Marner", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Abruzzese", "Nick Abruzzese", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Merkley", "Nick Merkley", case=False, regex=False)
    df['name'] = df['name'].str.replace("Nicholas Paul", "Nick Paul", case=False, regex=False)
    df['name'] = df['name'].str.replace("Samuel Walker", "Sammy Walker", case=False, regex=False)
    df['name'] = df['name'].str.replace("Thomas Novak", "Tommy Novak", case=False, regex=False)
    df['name'] = df['name'].str.replace("William Borgen", "Will Borgen", case=False, regex=False)
    df['name'] = df['name'].str.replace("Maxime Comtois", "Max Comtois", case=False, regex=False)
    df['name'] = df['name'].str.replace("Mathew Dumba", "Matt Dumba", case=False, regex=False)
    df['name'] = df['name'].str.replace("Zachary Sanford", "Zach Sanford", case=False, regex=False)
    return df



In [609]:
# CLEAN DATASETS 

allPlayers = clean_teamName(allPlayers)
allPlayers = fix_names(allPlayers)

skaters = clean_teamName(skaters)
skaters = fix_names(skaters)

allPlayers

Unnamed: 0,playerId,name,position,team,birthDate,weight,height,nationality,shootsCatches,primaryNumber,primaryPosition
0,8478421,A.J. Greer,L,CGY,1996-12-14,210.0,"6' 3""",CAN,L,24.0,L
1,8477180,Aaron Dell,G,SJS,1989-05-04,205.0,"6' 0""",CAN,L,30.0,G
2,8465992,Aaron Downey,R,DET,1974-08-27,215.0,"6' 1""",CAN,R,44.0,R
3,8477932,Aaron Ekblad,D,FLA,1996-02-07,220.0,"6' 4""",CAN,R,5.0,D
4,8471451,Aaron Gagnon,C,WPG,1986-04-24,186.0,"5' 11""",CAN,R,21.0,C
...,...,...,...,...,...,...,...,...,...,...,...
3149,8475876,Zane McIntyre,G,BOS,1992-08-20,206.0,"6' 2""",USA,L,31.0,G
3150,8469760,Zbynek Michalek,D,ARI,1982-12-23,210.0,"6' 2""",CZE,R,4.0,D
3151,8465009,Zdeno Chara,D,NYI,1977-03-18,250.0,"6' 9""",SVK,L,33.0,D
3152,8476878,Zemgus Girgensons,L,BUF,1994-01-05,211.0,"6' 2""",LVA,L,28.0,C


In [610]:
# Extract only the stats for 5on5 (could look into special teams later )

fullStrength = skaters[skaters['situation'] == '5on5']

fullStrength = fullStrength.sort_values(['team', 'name'])
fullStrength



Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
2322,8474641,2019,Adam Henrique,ANA,C,5on5,71,55793.0,1275.0,48.82,...,89.98,102.05,2216.0,2489.0,2.11,1.39,47.0,34.0,34.0,25.0
6012,8474641,2020,Adam Henrique,ANA,C,5on5,45,34324.0,777.0,22.62,...,55.77,69.11,1374.0,1536.0,1.39,0.62,30.0,18.0,24.0,11.0
10752,8474641,2021,Adam Henrique,ANA,C,5on5,58,50470.0,1101.0,43.53,...,72.98,83.27,1717.0,1994.0,1.79,1.48,36.0,31.0,28.0,26.0
14047,8474641,2022,Adam Henrique,ANA,C,5on5,62,47265.0,1069.0,32.10,...,78.19,128.08,1863.0,2538.0,1.61,1.00,38.0,18.0,31.0,14.0
19842,8473986,2023,Alex Killorn,ANA,L,5on5,63,54790.0,1071.0,35.97,...,74.08,88.70,1711.0,2096.0,1.12,1.82,18.0,37.0,16.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19622,8477845,2023,Trevor van Riemsdyk,WSH,D,5on5,70,69882.0,1400.0,11.57,...,90.18,104.72,2117.0,2398.0,5.45,1.64,128.0,27.0,97.0,19.0
3282,8477343,2019,Tyler Lewington,WSH,D,5on5,6,3536.0,78.0,-0.30,...,8.32,9.13,217.0,232.0,0.44,0.00,7.0,0.0,6.0,0.0
17592,8482861,2022,Vincent Iorio,WSH,D,5on5,3,2443.0,67.0,0.27,...,3.20,6.81,79.0,108.0,0.04,0.00,3.0,0.0,2.0,0.0
22037,8482861,2023,Vincent Iorio,WSH,D,5on5,6,3537.0,83.0,0.35,...,9.87,8.46,259.0,260.0,0.11,0.02,6.0,2.0,5.0,2.0


### LINES

In [611]:
# Create dataset for all lines/dpairings utilised during past 5 seasons 

lines = pd.concat([lines19_20, lines20_21, lines21_22, lines22_23, lines23_24], ignore_index=True)

clean_cols(lines)
clean_teamName(lines)
lines = lines.sort_values('team')
lines


Unnamed: 0,lineId,season,name,team,position,situation,games_played,icetime,iceTimeRank,xGoalsPercentage,...,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst
1934,84735608478425,2019,Guhle-Holzer,ANA,pairing,5on5,18,8673.0,79.0,0.40,...,133.11,92.0,91.68,14.0,0.89,0.93,0.93,4.48,4.52,4.45
11505,84809508482803,2023,Lyubushkin-Zellweger,ANA,pairing,5on5,1,638.0,3.0,0.12,...,13.96,11.0,10.62,0.0,0.11,0.00,0.00,1.03,1.02,1.01
11507,84828038483490,2023,Mintyukov-Zellweger,ANA,pairing,5on5,13,876.0,102.0,0.76,...,9.53,7.0,7.34,0.0,0.06,0.00,0.00,0.34,0.34,0.34
4947,84811228482142,2021,Benoit-Drysdale,ANA,pairing,5on5,24,863.0,189.0,0.32,...,31.29,20.0,20.95,1.0,0.19,0.48,0.48,1.18,1.24,1.22
4949,84768548479372,2021,Lindholm-Mahura,ANA,pairing,5on5,10,660.0,62.0,0.88,...,2.15,2.0,2.12,0.0,0.01,0.00,0.00,0.06,0.06,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,847418984768808477944,2019,Vrana-Eller-Wilson,WSH,line,5on5,34,1802.0,339.0,0.73,...,16.45,13.0,12.67,4.0,0.13,0.16,0.16,0.68,0.64,0.64
4709,847169884741898477839,2020,Sheary-Eller-Oshie,WSH,line,5on5,7,1151.0,74.0,0.52,...,22.87,19.0,18.71,1.0,0.18,0.14,0.14,1.02,0.98,0.94
876,84754628480796,2019,Fehervary-Gudas,WSH,pairing,5on5,4,2377.0,12.0,0.55,...,28.05,23.0,23.23,5.0,0.21,0.00,0.00,1.45,1.46,1.43
3733,847418984772908477839,2020,Raffl-Eller-Sheary,WSH,line,5on5,8,4256.0,28.0,0.63,...,44.50,34.0,32.88,5.0,0.36,0.52,0.52,1.99,1.94,1.91


### TEAMS

In [612]:
correct_headers = teams19_20.columns

# Move the current headers (first row of df1) to be the first row of the data

teams22_23.loc[-1] = teams22_23.columns  # Add the headers as the first row
teams22_23.index = teams22_23.index + 1  # Shift index
teams22_23 = teams22_23.sort_index()
teams22_23.columns = correct_headers


allTeams = [teams19_20, teams20_21, teams21_22, teams22_23, teams23_24]

for team in allTeams:
    clean_cols(team)

teams = pd.concat(allTeams, ignore_index=True)

clean_cols(teams)
clean_teamName(teams)
teams = teams.sort_values('team')
teams

Unnamed: 0,team,season,name,team.1,position,situation,games_played,xGoalsPercentage,corsiPercentage,fenwickPercentage,...,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst
169,ANA,2020,ANA,ANA,Team Level,5on4,56,0.86,0.87,0.85,...,47.0,44,44.0,22,0.38,0.44,0.44,2.87,2.87,2.86
414,ANA,2021,ANA,ANA,Team Level,5on4,82,0.87,0.86,0.82,...,93.0,84.0,84.0,36.0,0.69,0.0,0.0,5.63,5.63,5.59
719,ANA,2023,ANA,ANA,Team Level,5on4,82,0.86,0.88,0.86,...,77.0,64.0,64.0,22.0,0.59,0.66,0.66,5.69,5.69,5.67
70,ANA,2019,ANA,ANA,Team Level,other,71,0.5,0.45,0.46,...,235.0,182.0,182.0,7.0,2.67,1.77,1.77,22.65,22.65,22.27
71,ANA,2019,ANA,ANA,Team Level,all,71,0.47,0.49,0.49,...,4117.05,3158.0,3170.3,417.0,33.44,34.92,35.0,195.42,196.28,192.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,WSH,2020,WSH,WSH,Team Level,4on5,56,0.15,0.13,0.15,...,379.0,287,287.0,2,5.06,4.61,4.61,24.68,24.68,23.65
184,WSH,2020,WSH,WSH,Team Level,5on4,56,0.89,0.88,0.87,...,52.0,46,46.0,20,0.38,0.0,0.0,3.04,3.04,3.03
439,WSH,2021,WSH,WSH,Team Level,5on4,82,0.89,0.84,0.82,...,117.0,104.0,104.0,34.0,0.84,0.66,0.62,5.92,5.92,5.9
437,WSH,2021,WSH,WSH,Team Level,5on5,82,0.5,0.51,0.51,...,3589.82,2655.0,2656.89,362.0,26.61,31.68,32.07,150.5,150.13,147.88


# Data Processing

In [613]:
# LINES 

forwardLines = lines[lines['position'] == 'line']
dpairings = lines[lines['position'] == 'pairing']

forwardLines.columns


Index(['lineId', 'season', 'name', 'team', 'position', 'situation',
       'games_played', 'icetime', 'iceTimeRank', 'xGoalsPercentage',
       ...
       'scoreAdjustedShotsAttemptsAgainst', 'unblockedShotAttemptsAgainst',
       'scoreAdjustedUnblockedShotAttemptsAgainst', 'dZoneGiveawaysAgainst',
       'xGoalsFromxReboundsOfShotsAgainst',
       'xGoalsFromActualReboundsOfShotsAgainst', 'reboundxGoalsAgainst',
       'totalShotCreditAgainst', 'scoreAdjustedTotalShotCreditAgainst',
       'scoreFlurryAdjustedTotalShotCreditAgainst'],
      dtype='object', length=108)

In [614]:
fiveOnFive = pd.merge(fullStrength, allPlayers, on=['playerId', 'name', 'position'])
fiveOnFive = fiveOnFive.drop(columns={'team_y', 'primaryPosition'}).rename(columns={'team_x': 'team'})
fiveOnFive



Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shootsCatches,primaryNumber
0,8474641,2019,Adam Henrique,ANA,C,5on5,71,55793.0,1275.0,48.82,...,47.0,34.0,34.0,25.0,1990-02-06,197.0,"6' 0""",CAN,L,14.0
1,8474641,2020,Adam Henrique,ANA,C,5on5,45,34324.0,777.0,22.62,...,30.0,18.0,24.0,11.0,1990-02-06,197.0,"6' 0""",CAN,L,14.0
2,8474641,2021,Adam Henrique,ANA,C,5on5,58,50470.0,1101.0,43.53,...,36.0,31.0,28.0,26.0,1990-02-06,197.0,"6' 0""",CAN,L,14.0
3,8474641,2022,Adam Henrique,ANA,C,5on5,62,47265.0,1069.0,32.10,...,38.0,18.0,31.0,14.0,1990-02-06,197.0,"6' 0""",CAN,L,14.0
4,8473986,2023,Alex Killorn,ANA,L,5on5,63,54790.0,1071.0,35.97,...,18.0,37.0,16.0,30.0,1989-09-14,196.0,"6' 1""",CAN,L,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4552,8477845,2023,Trevor van Riemsdyk,WSH,D,5on5,70,69882.0,1400.0,11.57,...,128.0,27.0,97.0,19.0,1991-07-24,192.0,"6' 2""",USA,R,57.0
4553,8477343,2019,Tyler Lewington,WSH,D,5on5,6,3536.0,78.0,-0.30,...,7.0,0.0,6.0,0.0,1994-12-05,202.0,"6' 2""",CAN,R,78.0
4554,8482861,2022,Vincent Iorio,WSH,D,5on5,3,2443.0,67.0,0.27,...,3.0,0.0,2.0,0.0,2002-11-14,200.0,"6' 4""",CAN,R,6.0
4555,8482861,2023,Vincent Iorio,WSH,D,5on5,6,3537.0,83.0,0.35,...,6.0,2.0,5.0,2.0,2002-11-14,200.0,"6' 4""",CAN,R,6.0


In [615]:
# Divide the forwards and defencemen into 2 different dataframes. We don't want D-men when we are predicting our lines. 

forwards55 = fiveOnFive[fiveOnFive['position'].isin(['L', 'C', 'R'])]
defense55 = fiveOnFive[fiveOnFive['position'] == 'D']



# Exploratory Analysis - What factors lead to the most optimal lines? 

To help us identify, what factors lead to the most optimal lines, we should look subjectively to what are the best lines in the NHL. Of course if you ask anyone, there will always be different opinions so I have taken into account 3 different websites to see what they thought the best lines of the 2022-23 season were. Stats that were generally used to determine these lines this were Goals Percentage (Goal%) and expected goals percentage (xGoals%), along with minutes played, actual goals for and against as well as expected goals for and against.

ESPN: https://www.espn.com.au/nhl/insider/insider/story/_/id/35204274/ranking-best-lines-nhl-2022-23-season

1. Dallas Stars: Jason Robertson-Roope Hintz-Joe Pavelski 
2. Boston Bruins: Brad Marchand-Patrice Bergeron-Jake Debrusk
3. Vegas Golden Knights: Chandler Stephenson-Jack Eichel-Mark Stone
4. Florida Panthers: Carter Verhaeghe-Aleksander Barkov-Matthew Tkachuk
5. New Jersey Devils: Fabian Zetterland-Nico Hischier-Tomas Tatar

The Hockey Writers: https://thehockeywriters.com/nhl-top-5-forward-lines-2022-23/

1. Calgary Flames: Andrew Mangiapane-Mikael Backlund-Blake Coleman
2. Boston Bruins: Brad Marchand-Patrice Bergeron-Jake Debrusk
3. Toronto Maple Leafs: Michael Bunting-Auston Matthews-William Nylander
4. Dallas Stars: Jason Robertson-Roope Hintz-Joe Pavelski 
5. LA Kings: Quinton Byfield-Anze Kopitar-Adrian Kempe

Bleacher Report: https://bleacherreport.com/articles/10053471-ranking-the-top-15-best-forward-lines-early-in-the-2022-23-nhl-season

1. Carolina Hurricanes: Andrei Svechnikov-Matin Necas-Jesper Kotkaniemi 
2. Dallas Stars: Jason Robertson-Roope Hintz-Joe Pavelski 
3. Pittsburgh Penguins: Jake Guentzel-Sidney Crosby-Rickard Rakell
4. Boston Bruins: Taylor Hall-David Krejci-David Pastrnak
5. Toronto Maple Leafs: Michael Bunting-Auston Matthews-Mitch Marner

https://bleacherreport.com/articles/10095790-ranking-the-nhls-top-5-lines-for-the-2023-24-season

1. Dallas Stars: Jason Robertson-Roope Hintz-Joe Pavelski 
2. Edmonton Oilers: Leon Draisatl-Connor McDavid-Zach Hyman
3. LA Kings: Quinton Byfield-Anze Kopitar-Adrian Kempe
4. Ottawa Senators: Brady Tkachuk-Tim Stutzle-Claude Giroux
5. Pittsburgh Penguins: Jake Guentzel-Sidney Crosby-Bryan Rust 

Sportskeeda: https://www.sportskeeda.com/us/nhl/ranking-top-5-forward-lines-2024-nhl-season

1. Florida Panthers: Sam Reinhart-Aleksander Barkov-Evan Rodriguez
2. Dallas Stars: Mason Marchment-Tyler Seguin-Matt Duchene
3. Vancouver Canucks: Dakota Joshua-Teddy Blueger-Connor Garland
4. Calgary Flames: Connor Zary-Nazem Kadri-Martin Pospisil 
5. LA Kings: Quinton Byfield-Anze Kopitar-Adrian Kempe

In [616]:
forwardLines = forwardLines[['lineId', 'season', 'name', 'team', 'games_played', 'icetime', 'xGoalsPercentage',
                  'xGoalsFor', 'xGoalsAgainst', 'shotsOnGoalFor', 'shotsOnGoalAgainst', 'goalsFor', 'goalsAgainst', 'penaltiesFor', 
                   'penaltiesAgainst', 'faceOffsWonFor', 'hitsFor']]

forwardLines.head()

Unnamed: 0,lineId,season,name,team,games_played,icetime,xGoalsPercentage,xGoalsFor,xGoalsAgainst,shotsOnGoalFor,shotsOnGoalAgainst,goalsFor,goalsAgainst,penaltiesFor,penaltiesAgainst,faceOffsWonFor,hitsFor
454,847464184751648479368,2019,Jones-Henrique-Silfverberg,ANA,16,3458.0,0.47,1.42,1.6,21.0,30.0,1.0,4.0,2.0,5.0,29.0,14.0
3339,847061284788738480186,2020,Terry-Getzlaf-Volkov,ANA,4,935.0,0.31,0.42,0.94,7.0,10.0,0.0,2.0,2.0,3.0,9.0,5.0
2571,847464184780468478873,2020,Heinen-Henrique-Terry,ANA,6,709.0,0.6,0.54,0.36,5.0,5.0,0.0,1.0,1.0,1.0,3.0,6.0
11519,847516484808068482118,2023,Colangelo-Lundestrom-Silfverberg,ANA,2,1031.0,0.41,0.77,1.09,9.0,6.0,1.0,1.0,2.0,1.0,10.0,7.0
9085,847836684815178482745,2022,Vatrano-Mctavish-Leason,ANA,15,2032.0,0.32,0.95,2.05,9.0,25.0,0.0,4.0,4.0,4.0,11.0,9.0


In [617]:
forwardsSortedIceTime = forwardLines.sort_values(by=['icetime'], ascending=False)
forwardsSortedIceTime

Unnamed: 0,lineId,season,name,team,games_played,icetime,xGoalsPercentage,xGoalsFor,xGoalsAgainst,shotsOnGoalFor,shotsOnGoalAgainst,goalsFor,goalsAgainst,penaltiesFor,penaltiesAgainst,faceOffsWonFor,hitsFor
7282,847634684774968479314,2021,Gaudreau-Lindholm-Tkachuk,CGY,82,57915.0,0.62,54.06,32.86,584.0,439.0,72.0,31.0,50.0,60.0,552.0,230.0
12284,847638984785508482109,2023,Lafrenire-Trocheck-Panarin,NYR,74,51785.0,0.56,45.04,35.89,530.0,416.0,54.0,39.0,53.0,57.0,358.0,318.0
6778,847079484784498480027,2021,Robertson-Hintz-Pavelski,DAL,72,47465.0,0.59,43.29,29.47,473.0,381.0,51.0,35.0,28.0,34.0,473.0,244.0
12014,847467984751588476887,2023,Forsberg-O'Reilly-Nyquist,NSH,78,45988.0,0.55,39.90,33.30,453.0,394.0,46.0,31.0,47.0,51.0,512.0,259.0
8661,847079484784498480027,2022,Robertson-Pavelski-Hintz,DAL,73,45886.0,0.59,41.04,28.28,436.0,344.0,52.0,24.0,30.0,51.0,472.0,229.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11449,847531484784458481601,2023,Lee-Barzal-Holmstrom,NYI,3,600.0,0.58,0.18,0.13,7.0,3.0,0.0,0.0,0.0,1.0,5.0,3.0
493,847123384751518479407,2019,Bratt-Zajac-Palmieri,NJD,5,600.0,0.70,0.34,0.15,5.0,4.0,1.0,0.0,1.0,0.0,6.0,6.0
3777,847405384751698478414,2020,Kane-Couture-Meier,SJS,19,600.0,0.09,0.05,0.52,4.0,8.0,1.0,2.0,3.0,1.0,2.0,5.0
12123,847341984757458478409,2023,Marchand-Coyle-Richard,BOS,4,600.0,0.21,0.33,1.28,6.0,6.0,0.0,1.0,0.0,0.0,5.0,4.0


In [618]:
weights = {
    'icetimeMins': 0.03,
    'normalised_games_played': 0.2,
    'xGoalsPercentage': 0.6, 
    'xGoalsFor': 0.35, 
    'xGoalsAgainst': -0.35,
    'shotsOnGoalFor': 0.2,
    'shotsOnGoalAgainst': -0.2,  # Negative weight for shots against
    'goalsFor': 0.25,
    'goalsAgainst': -0.25,  # Negative weight for goals against
    'penaltiesFor': 0.01,  # Negative weight for penalties (you might want to penalize these) # Negative weight for penalty minutes
    'penaltiesAgainst': -0.01,  
    'faceOffsWonFor': 0.15,
    'hitsFor': 0.05
}

metrics = ['xGoalsFor', 
    'xGoalsAgainst',
    'shotsOnGoalFor',
    'shotsOnGoalAgainst',  # Negative weight for shots against
    'goalsFor',
    'goalsAgainst',  # Negative weight for goals against
    'penaltiesFor',  # Negative weight for penalties (you might want to penalize these) # Negative weight for penalty minutes
    'penaltiesAgainst',  
    'faceOffsWonFor',
    'hitsFor']

# Normalise metrics so that they represent the stat/60 minutes of icetime played 

forwardLines['icetimeMins'] = forwardLines['icetime']/60

normalised = forwardLines.copy()
for metric in metrics:
    normalised[metric] = (forwardLines[metric] / forwardLines['icetimeMins']) * 60
    
# Define the actual games played in each season
season_lengths = {
    2019: 70,
    2020: 56,
    2021: 82,
    2022: 82,
    2023: 82
}

# Normalize games_played by scaling to an 82-game season
normalised['normalised_games_played'] = normalised.apply(
    lambda row: row['games_played'] * (82 / season_lengths[row['season']]),
    axis=1
)

    
normalised = normalised.sort_values(by=['icetimeMins'], ascending=False)
normalised['name'] = normalised['name'].str.replace(' ', '')

normalised.head()

Unnamed: 0,lineId,season,name,team,games_played,icetime,xGoalsPercentage,xGoalsFor,xGoalsAgainst,shotsOnGoalFor,shotsOnGoalAgainst,goalsFor,goalsAgainst,penaltiesFor,penaltiesAgainst,faceOffsWonFor,hitsFor,icetimeMins,normalised_games_played
7282,847634684774968479314,2021,Gaudreau-Lindholm-Tkachuk,CGY,82,57915.0,0.62,3.360373,2.04258,36.301476,27.288267,4.475524,1.926962,3.108003,3.729604,34.312354,14.296814,965.25,82.0
12284,847638984785508482109,2023,Lafrenire-Trocheck-Panarin,NYR,74,51785.0,0.56,3.1311,2.495008,36.844646,28.919571,3.753983,2.71121,3.684465,3.962537,24.887516,22.106788,863.083333,74.0
6778,847079484784498480027,2021,Robertson-Hintz-Pavelski,DAL,72,47465.0,0.59,3.283346,2.235163,35.874855,28.897082,3.868113,2.654588,2.12367,2.578742,35.874855,18.506268,791.083333,72.0
12014,847467984751588476887,2023,Forsberg-O'Reilly-Nyquist,NSH,78,45988.0,0.55,3.123424,2.606767,35.461425,30.842829,3.600939,2.42672,3.679221,3.992346,40.080021,20.274854,766.466667,78.0
8661,847079484784498480027,2022,Robertson-Pavelski-Hintz,DAL,73,45886.0,0.59,3.219806,2.218716,34.206512,26.988624,4.079676,1.882927,2.353659,4.00122,37.030903,17.966264,764.766667,73.0


In [619]:
normalised['line_score'] = normalised[weights.keys()].dot(pd.Series(weights))
bestLines = normalised.sort_values(by=['line_score'], ascending=False)

bestLines['rank'] = range(1, len(bestLines) + 1)
bestLines

Unnamed: 0,lineId,season,name,team,games_played,icetime,xGoalsPercentage,xGoalsFor,xGoalsAgainst,shotsOnGoalFor,...,goalsFor,goalsAgainst,penaltiesFor,penaltiesAgainst,faceOffsWonFor,hitsFor,icetimeMins,normalised_games_played,line_score,rank
7282,847634684774968479314,2021,Gaudreau-Lindholm-Tkachuk,CGY,82,57915.0,0.62,3.360373,2.042580,36.301476,...,4.475524,1.926962,3.108003,3.729604,34.312354,14.296814,965.250000,82.000000,54.485988,1
12284,847638984785508482109,2023,Lafrenire-Trocheck-Panarin,NYR,74,51785.0,0.56,3.131100,2.495008,36.844646,...,3.753983,2.711210,3.684465,3.962537,24.887516,22.106788,863.083333,74.000000,47.932526,2
10858,847353384758558476921,2022,Martinook-Staal-Fast,CAR,76,44349.0,0.63,3.365487,2.010688,32.713252,...,2.922276,2.110532,3.977542,3.490496,41.398904,21.267672,739.150000,76.000000,47.850707,3
12014,847467984751588476887,2023,Forsberg-O'Reilly-Nyquist,NSH,78,45988.0,0.55,3.123424,2.606767,35.461425,...,3.600939,2.426720,3.679221,3.992346,40.080021,20.274854,766.466667,78.000000,47.344718,4
6778,847079484784498480027,2021,Robertson-Hintz-Pavelski,DAL,72,47465.0,0.59,3.283346,2.235163,35.874855,...,3.868113,2.654588,2.123670,2.578742,35.874855,18.506268,791.083333,72.000000,46.854291,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4662,847577284815288481626,2020,Ruotsalainen-Sheahan-Cozens,BUF,3,834.0,0.21,2.460432,9.366906,38.848921,...,0.000000,8.633094,0.000000,0.000000,17.266187,0.000000,13.900000,4.392857,-10.923752,9369
8630,847416184784588482475,2022,Voracek-Roslovic-Chinakhov,CBJ,6,842.0,0.09,0.384798,4.104513,12.826603,...,0.000000,12.826603,0.000000,0.000000,21.377672,12.826603,14.033333,6.000000,-10.957067,9370
10112,847398684745648476453,2022,Killorn-Stamkos-Kucherov,TBL,8,724.0,0.10,0.895028,8.154696,9.944751,...,4.972376,19.889503,9.944751,0.000000,24.861878,19.889503,12.066667,8.000000,-12.353138,9371
4783,847354884779518479465,2020,Caggiula-Schmaltz-Kessel,ARI,7,773.0,0.11,0.652005,5.216041,18.628719,...,0.000000,13.971539,0.000000,0.000000,4.657180,23.285899,12.883333,10.250000,-12.833593,9372


In [620]:
best = ['Robertson-Pavelski-Hintz', 'Marchand-Bergeron-Debrusk','Stephenson-Eichel-Stone',
             'Tkachuk-Barkov-Verhaeghe', 'Tatar-Hischier-Zetterlund', 'Mangiapane-Backlund-Coleman', 'Debrusk-Bergeron-Marchand',
             'Bunting-Matthews-Nylander', 'Byfield-Kopitar-Kempe', 'Necas-Kotkaniemi-Svechnikov', 'Guentzel-Crosby-Rakell', 
             'Hall-Krejci-Pastrnak', 'Bunting-Matthews-Marner', 'Tkachuk-Giroux-Sttzle', 'Guentzel-Crosby-Rust', 'Reinhart-Barkov-Rodrigues',
             'Marchment-Seguin-Duchene', 'Joshua-Blueger-Garland', 'Zary-Kadri-Pospisil']


best = bestLines[bestLines['name'].isin(best)]
best

Unnamed: 0,lineId,season,name,team,games_played,icetime,xGoalsPercentage,xGoalsFor,xGoalsAgainst,shotsOnGoalFor,...,goalsFor,goalsAgainst,penaltiesFor,penaltiesAgainst,faceOffsWonFor,hitsFor,icetimeMins,normalised_games_played,line_score,rank
8661,847079484784498480027,2022,Robertson-Pavelski-Hintz,DAL,73,45886.0,0.59,3.219806,2.218716,34.206512,...,4.079676,1.882927,2.353659,4.00122,37.030903,17.966264,764.766667,73.0,46.676619,6
12292,847079484784498480027,2023,Robertson-Pavelski-Hintz,DAL,75,43125.0,0.55,2.645426,2.184626,31.22087,...,3.422609,2.754783,1.92,3.589565,40.403478,18.198261,718.75,75.0,45.293084,7
3150,847167584758108477404,2020,Guentzel-Crosby-Rust,PIT,51,38830.0,0.52,2.492094,2.328921,31.985578,...,3.152202,2.225084,3.615761,3.523049,37.919135,18.913211,647.166667,74.678571,42.216503,10
12929,847516884757948478975,2023,Marchment-Seguin-Duchene,DAL,64,39783.0,0.51,2.866752,2.703868,29.77151,...,3.800618,1.900309,3.438655,2.624236,38.458638,18.460146,663.05,64.0,39.813276,16
9051,847351284808018482116,2022,Tkachuk-Giroux-Sttzle,OTT,66,30404.0,0.6,3.908565,2.617945,36.232075,...,3.552164,2.486515,4.381002,4.381002,46.178134,28.535719,506.733333,66.0,39.349225,19
6717,847804784784838479318,2021,Bunting-Matthews-Marner,TOR,51,33280.0,0.65,3.867188,2.08774,40.240385,...,6.598558,3.353365,3.786058,3.786058,38.942308,22.175481,554.666667,51.0,38.469994,26
9311,847167584764838477404,2022,Guentzel-Crosby-Rakell,PIT,62,33145.0,0.58,3.848182,2.807663,36.928647,...,2.823955,2.280887,3.475637,3.584251,38.883693,24.763916,552.416667,62.0,37.085617,29
8763,847063884734198478498,2022,Debrusk-Bergeron-Marchand,BOS,51,24783.0,0.63,3.669289,2.174555,41.835129,...,3.341,1.59787,3.631522,4.938869,51.712868,21.78913,413.05,51.0,35.521709,32
10109,847167584758108477404,2022,Guentzel-Crosby-Rust,PIT,58,27674.0,0.53,3.347113,2.991978,36.554166,...,3.25215,2.60172,3.25215,3.25215,40.847004,24.326082,461.233333,58.0,34.686119,41
9041,847415084763998478233,2022,Mangiapane-Backlund-Coleman,CGY,47,23912.0,0.68,4.338909,2.064068,43.057879,...,3.161593,1.204416,3.161593,3.462697,36.885246,23.78722,398.533333,47.0,33.743199,47
