# Data Exploration

Explore the skater data, create derived variables, and drop columns that are not needed.

In [1]:
import pandas as pd
import sys
import importlib
import numpy as np
import data_prep, data_proc, data_explor

In [2]:
# Re-import the local helper modules so edits saved on disk are picked up
# without restarting the kernel. data_proc is reloaded last as a bare
# expression so the cell still displays its module repr.
for _helper_module in (data_prep, data_explor):
    importlib.reload(_helper_module)
importlib.reload(data_proc)

<module 'data_proc' from '/Users/nickdimmitt/dfs/hockey/data_proc.py'>

In [3]:
# Load the merged skater table. The path is relative to the notebook's working
# directory (the same convention as the 'data/...' CSV loads later in this
# notebook) instead of a user-specific absolute path, so the cell also runs on
# other machines.
df_skaters = data_prep.read_data("data/skate_misc_shot_merged.csv")

In [4]:
# Column labels handed to data_prep.remove_columns in the next cell: the stray
# 'Unnamed: 0' column plus the '_y'-suffixed columns (presumably the
# right-hand duplicates left behind by the merge -- confirm).
column_drop = [
    'Unnamed: 0',
    'goals_y',
    'shots_y',
    'gameDate_y',
    'shootingPct_y',
    'gamesPlayed_y',
    'homeRoad_y',
    'lastName_y',
    'opponentTeamAbbrev_y',
    'otGoals_y',
    'positionCode_y',
    'shootsCatches_y',
    'skaterFullName_y',
    'teamAbbrev_y',
    'timeOnIcePerGame_y',
]

In [5]:
# Run df_skaters through data_prep.remove_columns with the column_drop list
# (presumably dropping those columns -- see data_prep). The result is bound
# back to df_skaters, so the pre-drop frame is no longer reachable by name.
df_skaters = data_prep.remove_columns(df_skaters, column_drop)

In [6]:
# Remove the trailing "_x" suffix from the column labels.
# A regex anchored at the end of the label is used on purpose:
# str.rstrip('_x') treats its argument as a *set of characters*, so it would
# also strip a trailing 'x' or '_' from labels that never had the suffix.
df_skaters.columns = df_skaters.columns.str.replace(r'_x$', '', regex=True)

In [7]:
# Run df_skaters through data_prep.remove_dup_col (presumably removing
# duplicated columns -- confirm in data_prep) and rebind the result.
df_skaters = data_prep.remove_dup_col(df_skaters)

In [None]:
# Display the current column labels of df_skaters.
df_skaters.columns

### Variable Creation

In [8]:
# Fantasy points for each row, as returned by data_explor.fan_points
# (the scoring rules live in that helper).
df_skaters['fan_points'] = data_explor.fan_points(df_skaters)

In [9]:
# Fantasy points above or below the mean. 'playerId' is passed as the key, so
# the mean is presumably taken per player -- confirm in data_explor.overperform.
df_skaters['performance'] = data_explor.overperform(df_skaters, 'fan_points', 'playerId')

In [10]:
# Performance dummies, noted by the author as the predictor variable.
# Each column is derived from 'performance' by the matching data_explor helper;
# presumably 1/0 flags for over-, under- and on-par performance -- confirm the
# exact rules in data_explor.
df_skaters['over_perf_dummy'] = data_explor.over_perf_dummy(df_skaters, 'performance')
df_skaters['under_perf_dummy'] = data_explor.under_perf_dummy(df_skaters, 'performance')
df_skaters['same_perf_dummy'] = data_explor.same_perf_dummy(df_skaters, 'performance')

In [11]:
# Home/road performance measure built from 'performance' with
# ['playerId', 'homeRoad'] as the grouping keys. This is not a dummy: later
# cells compare it numerically (> 0). The exact aggregation is defined in
# data_explor.home_away_perf -- TODO confirm.
df_skaters['homeRoad_perf'] = data_explor.home_away_perf(df_skaters, 'performance', ['playerId', 'homeRoad'])

In [53]:
# Ids of skaters whose home rows ('H') / road rows ('R') have a positive
# homeRoad_perf.
# The rows are selected with a boolean mask rather than
# np.where(..., playerId, None): the latter yields one entry per row of
# df_skaters with None for every non-matching row, so a stray None ends up in
# the id list (and in its length) even after de-duplication.
has_positive_split = df_skaters['homeRoad_perf'] > 0
better_home_skater = (
    df_skaters.loc[(df_skaters['homeRoad'] == 'H') & has_positive_split, 'playerId']
    .unique()
    .tolist()
)
better_away_skater = (
    df_skaters.loc[(df_skaters['homeRoad'] == 'R') & has_positive_split, 'playerId']
    .unique()
    .tolist()
)

In [54]:
# Collapse each id list to its distinct values (element order is not preserved).
better_home_skater = list(set(better_home_skater))
better_away_skater = list(set(better_away_skater))

1539
931


In [60]:
# 1/0 flags on every row: does the row's playerId appear in the
# better-at-home list, in the better-on-the-road list, or in neither?
plays_better_at_home = df_skaters['playerId'].isin(better_home_skater)
plays_better_on_road = df_skaters['playerId'].isin(better_away_skater)

df_skaters['better_home_dummy'] = plays_better_at_home.astype(int)
df_skaters['better_away_dummy'] = plays_better_on_road.astype(int)
# "No change" means the skater is in neither list.
df_skaters['home_away_no_change'] = (~plays_better_at_home & ~plays_better_on_road).astype(int)

In [38]:
# Goals expressed as a per-60 rate, computed by data_explor.stat_per_60 from
# 'timeOnIcePerGame' and 'goals' (time units and formula are defined in the
# helper -- TODO confirm).
df_skaters['goals_per_60'] = data_explor.stat_per_60(df_skaters, 'timeOnIcePerGame', 'goals')

In [42]:
# Assists expressed as a per-60 rate, computed by data_explor.stat_per_60 from
# 'timeOnIcePerGame' and 'assists' (time units and formula are defined in the
# helper -- TODO confirm).
df_skaters['assists_per_60'] = data_explor.stat_per_60(df_skaters, 'timeOnIcePerGame', 'assists')

### Splitting the Dataset by Home/Away Performance

In [64]:
# Partition the rows on the better-at-home flag.
# NOTE(review): the "away" frame is every row with better_home_dummy == 0, so
# it also contains the skaters flagged home_away_no_change. Confirm that
# filtering on better_away_dummy == 1 was not what was intended.
df_skaters_home = df_skaters.loc[df_skaters['better_home_dummy'].eq(1)]
df_skaters_away = df_skaters.loc[df_skaters['better_home_dummy'].eq(0)]

In [67]:
# (rows, columns) of the better-at-home subset.
df_skaters_home.shape

(409942, 80)

In [68]:
# (rows, columns) of the subset where better_home_dummy == 0.
df_skaters_away.shape

(150679, 80)

In [1]:
# Column labels collected for removal: the shooting-percentage columns, a few
# sparse-looking counters and the stray 'Unnamed: 0' column. (Why these were
# chosen is not shown in this notebook.)
drop_cols = [
    'shootingPctMa7',
    'shootingPctMa3',
    'shootingPctLastGame',
    'shootingPctMa10',
    'shootingPctMa14',
    'emptyNetAssists',
    'emptyNetPoints',
    'faceoffWinPct',
    'shootingPctBackhand',
    'shootingPctDeflected',
    'shootingPctSlap',
    'shootingPctSnap',
    'shootingPctTipIn',
    'shootingPctWrapAround',
    'shootingPctWrist',
    'Unnamed: 0',
]
drop_cols

['shootingPctMa7',
 'shootingPctMa3',
 'shootingPctLastGame',
 'shootingPctMa10',
 'shootingPctMa14',
 'emptyNetAssists',
 'emptyNetPoints',
 'faceoffWinPct',
 'shootingPctBackhand',
 'shootingPctDeflected',
 'shootingPctSlap',
 'shootingPctSnap',
 'shootingPctTipIn',
 'shootingPctWrapAround',
 'shootingPctWrist',
 'Unnamed: 0']

In [None]:
# Column labels grouped under "impute by player" (presumably the columns whose
# missing values are to be filled per player -- not used in the cells shown).
# Four per-60 rates, followed by the Ma3/Ma7/Ma10/Ma14/LastGame variants of
# five stats, in that order.
impute_by_player = [
    'hitsPer60',
    'blockedShotsPer60',
    'giveawaysPer60',
    'takeawaysPer60',
] + [
    f'{stat}{window}'
    for stat in ('assists', 'goals', 'plusMinus', 'points', 'fanPoints')
    for window in ('Ma3', 'Ma7', 'Ma10', 'Ma14', 'LastGame')
]

In [None]:
# Per-shot-type goal and shots-on-net column labels. The list is only
# displayed; it is not bound to a variable.
# NOTE(review): 'shotsWrapAround' breaks the 'shotsOnNet<Type>' pattern of its
# neighbours -- possibly meant 'shotsOnNetWrapAround'; confirm against the data.
'goalsBackhand goalsDeflected goalsSlap goalsSnap goalsTipIn goalsWrapAround goalsWrist shotsOnNetBackhand shotsOnNetDeflected shotsOnNetSlap shotsOnNetSnap shotsOnNetTipIn shotsWrapAround shotsOnNetWrist'.split()

In [9]:
# Load the goalie table; the path is relative to the working directory.
df_goalie = pd.read_csv('data/df_goalies.csv')

In [10]:
# Preview the first rows of the goalie table.
df_goalie.head()

Unnamed: 0.1,Unnamed: 0,assists,gameDate,gameId,gamesPlayed,gamesStarted,goalieFullName,goals,goalsAgainst,goalsAgainstAverage,...,points,savePct,saves,shootsCatches,shotsAgainst,shutouts,teamAbbrev,ties,timeOnIce,wins
0,0,0,2022-10-21,2022020072,1,0,Alex Stalock,0,0,0.0,...,0,1.0,10,L,10,0,CHI,,1336,1
1,1,0,2022-10-23,2022020091,1,1,James Reimer,0,0,0.0,...,0,1.0,30,L,30,1,SJS,,3600,1
2,2,0,2022-10-21,2022020073,1,0,Martin Jones,0,0,0.0,...,0,1.0,1,L,1,0,SEA,,583,1
3,3,0,2022-10-29,2022020136,1,1,Darcy Kuemper,0,0,0.0,...,0,1.0,34,L,34,1,WSH,,3586,1
4,4,0,2022-10-22,2022020077,1,1,Jordan Binnington,0,0,0.0,...,0,1.0,23,L,23,1,STL,,3600,1


In [11]:
# List the goalie table's column labels.
df_goalie.columns

Index(['Unnamed: 0', 'assists', 'gameDate', 'gameId', 'gamesPlayed',
       'gamesStarted', 'goalieFullName', 'goals', 'goalsAgainst',
       'goalsAgainstAverage', 'homeRoad', 'lastName', 'losses',
       'opponentTeamAbbrev', 'otLosses', 'penaltyMinutes', 'playerId',
       'points', 'savePct', 'saves', 'shootsCatches', 'shotsAgainst',
       'shutouts', 'teamAbbrev', 'ties', 'timeOnIce', 'wins'],
      dtype='object')

In [12]:
# Load the skater time-on-ice table; the path is relative to the working directory.
df_toi = pd.read_csv('data/df_skaters_toi.csv')

In [13]:
# Preview the first rows of the time-on-ice table.
df_toi.head()

Unnamed: 0.1,Unnamed: 0,evTimeOnIce,evTimeOnIcePerGame,gameDate,gameId,gamesPlayed,homeRoad,lastName,opponentTeamAbbrev,otTimeOnIce,...,shTimeOnIce,shTimeOnIcePerGame,shifts,shiftsPerGame,shootsCatches,skaterFullName,teamAbbrev,timeOnIce,timeOnIcePerGame,timeOnIcePerShift
0,0,1147,1147.0,2022-10-18,2022020052,1,R,Doughty,NSH,75.0,...,235,235.0,26,26.0,R,Drew Doughty,LAK,1812,1812.0,69.6923
1,1,1490,1490.0,2022-11-01,2022020147,1,R,Lindholm,PIT,67.0,...,177,177.0,37,37.0,L,Hampus Lindholm,BOS,1787,1787.0,48.29729
2,2,1561,1561.0,2022-10-22,2022020083,1,H,Josi,PHI,0.0,...,14,14.0,30,30.0,L,Roman Josi,NSH,1785,1785.0,59.5
3,3,1598,1598.0,2022-10-29,2022020128,1,R,Chabot,FLA,0.0,...,25,25.0,25,25.0,L,Thomas Chabot,OTT,1756,1756.0,70.24
4,4,1467,1467.0,2022-10-22,2022020079,1,R,Doughty,WSH,0.0,...,159,159.0,29,29.0,R,Drew Doughty,LAK,1711,1711.0,59.0


In [14]:
# List the time-on-ice table's column labels.
df_toi.columns

Index(['Unnamed: 0', 'evTimeOnIce', 'evTimeOnIcePerGame', 'gameDate', 'gameId',
       'gamesPlayed', 'homeRoad', 'lastName', 'opponentTeamAbbrev',
       'otTimeOnIce', 'otTimeOnIcePerOtGame', 'playerId', 'positionCode',
       'ppTimeOnIce', 'ppTimeOnIcePerGame', 'shTimeOnIce',
       'shTimeOnIcePerGame', 'shifts', 'shiftsPerGame', 'shootsCatches',
       'skaterFullName', 'teamAbbrev', 'timeOnIce', 'timeOnIcePerGame',
       'timeOnIcePerShift'],
      dtype='object')