# Data Extraction

In [1]:
import numpy as np
import pandas as pd
import requests
import os
import json
from progressbar import progressbar as progbar

# The local `nhl` package is imported from inside the "nhl/" directory, so the
# working directory is switched there just for the imports (presumably the
# package is only importable relative to that folder -- confirm).
_notebook_dir = os.getcwd()
os.chdir("nhl/")
try:
    import nhl
    from nhl.team import Team
    from nhl.game import Game
finally:
    # Always return to the starting directory: if an import fails, the kernel
    # must not be left inside "nhl/", where re-running this cell and every
    # relative 'data/...' path used later would break.
    os.chdir(_notebook_dir)

# Root URL of the NHL stats web API (v1).
base_url = 'https://statsapi.web.nhl.com/api/v1'

## Collect All Game IDs For a Given Season
This takes about 20 seconds.

In [2]:
# Query filters -- edit these before running the cell.
season = None           # season to fetch; None defaults to the current one
include_pre = False     # True to also fetch preseason games
include_post = False    # True to also fetch postseason games
include_future = False  # True to also fetch games that are not yet played

# The same filters are passed on every getGameIDs call, so bundle them once.
game_filters = dict(season=season, include_pre=include_pre,
                    include_post=include_post, include_future=include_future)

team_ids = nhl.api.getTeamIDs()

# A set is used so that a game ID returned for more than one team is kept once.
game_ids = set()
for team in team_ids.values():
    ids = nhl.api.getGameIDs(team, **game_filters)
    game_ids.update(ids)

## Collect All Live Game Data For Every Team
This takes several minutes.

In [3]:
# One accumulator list per kind of per-game table.
hits, penalties, shots, turnovers, agg_stats = [], [], [], [], []

# Pair each accumulator with the Game attribute that feeds it.
collectors = (
    (hits, "hit_data"),
    (penalties, "penalty_data"),
    (shots, "shot_data"),
    (turnovers, "turnover_data"),
    (agg_stats, "agg_stats"),
)

for id_ in progbar(game_ids):
    # Build the game object, then have it extract its data before the
    # attributes listed in `collectors` are read.
    game = Game(id_)
    game.makeDataFrames()
    for bucket, attr_name in collectors:
        bucket.append(getattr(game, attr_name))

# Stack the per-game tables into one DataFrame per kind.
hits_df = pd.concat(hits)
penalties_df = pd.concat(penalties)
shots_df = pd.concat(shots)
turnovers_df = pd.concat(turnovers)
game_stats = pd.concat(agg_stats)

100% (1082 of 1082) |####################| Elapsed Time: 0:06:15 Time:  0:06:15


In [3]:
# Lighter variant of the extraction loop: collects only the aggregate stats.
# NOTE(review): makeDataFrames() is not called here, unlike the full extraction
# loop -- presumably agg_stats is available right after construction; confirm.
agg_stats = [Game(id_).agg_stats for id_ in progbar(game_ids)]

# Combine the per-game aggregate tables into a single DataFrame.
game_stats = pd.concat(agg_stats)

100% (1082 of 1082) |####################| Elapsed Time: 0:05:46 Time:  0:05:46


## Save "Raw" Data
This takes a few seconds.

In [4]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('data', exist_ok=True)

# Write each combined event table to its own CSV. to_csv is called with its
# defaults, so the DataFrame index is written as the first column.
hits_df.to_csv('data/all_hits.csv')
penalties_df.to_csv('data/all_penalties.csv')
shots_df.to_csv('data/all_shots.csv')
turnovers_df.to_csv('data/all_turnovers.csv')

## Filter and Save by Team

In [4]:
# Three-letter abbreviations of the 31 teams the per-team exports loop over.
team_codes = {
    'ANA', 'ARI', 'BOS', 'BUF', 'CAR', 'CBJ',
    'CGY', 'CHI', 'COL', 'DAL', 'DET', 'EDM',
    'FLA', 'LAK', 'MIN', 'MTL', 'NJD', 'NSH',
    'NYI', 'NYR', 'OTT', 'PHI', 'PIT', 'SJS',
    'STL', 'TBL', 'TOR', 'VAN', 'VGK', 'WPG',
    'WSH',
}

In [5]:
# Save each team's rows of the aggregate game stats into that team's folder.
for team in team_codes:
    team_dir = f'data/by_team/{team}'
    # to_csv does not create missing folders, so make the per-team one first
    # (no-op when it already exists).
    os.makedirs(team_dir, exist_ok=True)

    team_stats = game_stats[game_stats.team == team]
    team_stats.to_csv(f'{team_dir}/{team}_game_time-series.csv')

In [6]:
# Event tables to split per team, keyed by the suffix used in the file name.
event_frames = {
    'hits': hits_df,
    'penalties': penalties_df,
    'shots': shots_df,
    'turnovers': turnovers_df,
}

for team in team_codes:
    team_dir = f'data/by_team/{team}'
    # to_csv does not create missing folders, so make the per-team one first
    # (no-op when it already exists).
    os.makedirs(team_dir, exist_ok=True)

    # The aggregate game stats are written here as well, so this cell produces
    # the full set of per-team files on its own.
    game_stats[game_stats.team == team].to_csv(
        f'{team_dir}/{team}_game_time-series.csv')

    for label, frame in event_frames.items():
        # Keep every event from a game the team played, home or away.
        involved = (frame.home_team == team) | (frame.away_team == team)
        frame[involved].to_csv(f'{team_dir}/{team}_{label}.csv')