# Preprocessing Pipeline

Run all of the following cells, in order, to build the dataframe needed for the modelling phase.
The resulting output file (`../Data/preprocessed_df.csv`) is then loaded in the Modelling notebook.

### Import Packages

In [1]:
import numpy as np
import pickle
import pandas as pd
from functions.preprocessing_functions import *

In [2]:
import warnings
# Install a blanket "ignore" filter: with no category/message/module given,
# this silences every warning raised for the rest of the session.
# NOTE(review): presumably done to keep the notebook output clean, but it also
# hides deprecation warnings that may point at real problems — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

### Load Data

In [3]:
# Load the pickled advanced-stats object. The context manager closes the file
# handle as soon as the object has been read, instead of leaving an open
# handle behind for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load this file if it was produced by a trusted source.
with open('../Data/full_advanced_stats', 'rb') as advanced_stats_file:
    advanced_stats = pickle.load(advanced_stats_file)

# Raw CSV tables, read with pandas' default parsing options.
game = pd.read_csv('../Data/game.csv')
game_teams_stats = pd.read_csv('../Data/game_teams_stats.csv')
team_info = pd.read_csv('../Data/team_info.csv')
game_goalie_stats = pd.read_csv('../Data/game_goalie_stats.csv')
game_shifts = pd.read_csv('../Data/game_shifts.csv')

### Amalgamate data into one main dataframe

In [4]:
# Restrict the games table to the seasons listed here; select_seasons takes
# the list of seasons to keep followed by the games dataframe.
seasons_to_keep = [20152016]
df_game = select_seasons(seasons_to_keep, game)

In [5]:
# Drop game 2017020172, which the original author flagged as a bug in the
# data (the exact problem is not recorded in this notebook).
to_delete = df_game.loc[df_game['game_id'] == 2017020172].index
df_game = df_game.drop(index=to_delete)

# Drop playoff games, i.e. rows whose 'type' is 'P'.
playoff_games = df_game.loc[df_game['type'] == 'P'].index
df_game = df_game.drop(index=playoff_games)

# Team IDs, collected from the away-team column once the rows above are gone.
unique_ids = df_game['away_team_id'].unique()

# Modelling target: get_outcome applied to every value of 'outcome'.
df_game['target'] = df_game['outcome'].apply(get_outcome)

# Turn the advanced-stats index into a regular column; the merge below joins
# on a right-hand column named 'index' (presumably the game id — confirm).
# A left join keeps every game even when it has no advanced-stats row.
advanced_stats = advanced_stats.reset_index()
df_game = pd.merge(
    df_game,
    advanced_stats,
    how='left',
    left_on='game_id',
    right_on='index',
)

# Reshape the per-team game stats with combine_home_away, then discard the
# team-id and goal columns before merging (presumably because df_game already
# carries them — confirm).
game_teams_stats_cleaned = combine_home_away(game_teams_stats).drop(
    columns=['away_team_id', 'home_team_id', 'away_goals', 'home_goals']
)

# Attach the cleaned game stats to the main dataframe (default inner join).
df_game = df_game.merge(game_teams_stats_cleaned, on='game_id')

# Goalie stats with dates added from the games table.
game_goalie_dates = add_dates(game, game_goalie_stats)

# Starting goalies, derived from the main dataframe, dated goalie stats and shifts.
starting_goaltenders = get_starting_goalies(df_game, game_goalie_dates, game_shifts)

In [6]:
# Rolling goalie stats: start from the starting-goalie frame and feed each
# call's output into the next. The first argument is presumably the rolling
# window size — confirm against goalie_rolling_stats.
main_df = starting_goaltenders
for goalie_n in (5, 10, 40):
    main_df = goalie_rolling_stats(goalie_n, main_df, game_goalie_dates)

# Goalie differentials computed from the columns added above.
main_df = get_goalie_differentials(main_df)

# Head-to-head stats, applied in the same 10 -> 5 -> 2 order as before.
for h2h_n in (10, 5, 2):
    main_df = get_head_2_head(main_df, h2h_n)

In [7]:
# Win-percentage features: each call receives the previous call's output.
# The last argument is presumably a window measured in games — confirm
# against get_win_pct.
team_win_pct = main_df
for win_pct_n in (10, 40, 82):
    team_win_pct = get_win_pct(team_win_pct, unique_ids, win_pct_n)

# Win-percentage differentials.
merged_df = get_win_differentials(team_win_pct)

# Advanced Corsi/Fenwick rolling averages followed by their differentials.
for corsi_n in (3, 10, 40):
    merged_df = get_rolling_averages_corsi_fenwick(merged_df, unique_ids, corsi_n)
merged_df = get_corsi_differentials(merged_df)

# Basic game-stat averages followed by their differentials.
for stats_n in (5, 40, 82):
    merged_df = get_avg_stats(merged_df, unique_ids, stats_n)
merged_df = get_stats_differentials(merged_df)

In [8]:
# Select columns
# The column list itself lives in select_preprocessing_columns (imported from
# functions.preprocessing_functions); the result becomes the modelling dataset.
nhl_data = select_preprocessing_columns(merged_df)

In [10]:
# Discard every row that contains at least one missing value.
nhl_data = nhl_data.dropna(axis=0, how='any')

In [30]:
# Persist the preprocessed dataframe; the row index is written as the first
# CSV column (pandas' default, spelled out here).
nhl_data.to_csv('../Data/preprocessed_df.csv', index=True)