In [1]:
import getpass
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Prompt for the database password so it never appears in the notebook source.
p = getpass.getpass(prompt="Password: ")
# Percent-encode the password before embedding it in the URL: characters such as
# "@", ":", "/" or "%" would otherwise be parsed as URL delimiters and produce a
# wrong host/credential split when the engine is created.
rds_connection_string = f"postgres:{quote(p, safe='')}@localhost:5432/Hockey_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

Password: ········


## Game Info

In [3]:
# Read the raw game table from the Resources folder.
filepath1 = "Resources/game.csv"
game_df = pd.read_csv(filepath_or_buffer=filepath1)

In [4]:
# Narrow the game table to the columns used further down.
game_columns = ["game_id", "season", "type", "away_team_id", "home_team_id", "outcome"]
game_df = game_df.loc[:, game_columns]

In [5]:
# Keep only games whose type code is "R"
# (presumably regular season — confirm against the data dictionary).
is_type_r = game_df["type"] == "R"
game_df = game_df[is_type_r]

In [6]:
# Keep only games from the season coded 20172018.
is_target_season = game_df["season"] == 20172018
game_df = game_df[is_target_season]

In [7]:
# Persist the filtered game table (header row, no index column).
game_df.to_csv(path_or_buf="Output/game_info_df.csv", header=True, index=False)

## Player Info

In [8]:
# Read the pipe-delimited player table from the Resources folder.
filepath2 = "Resources/player_info.csv"
player_info_df = pd.read_csv(filepath_or_buffer=filepath2, sep="|")

In [9]:
# Build a single "<first> <last>" display name for every player.
first_names = player_info_df["firstName"].astype(str)
last_names = player_info_df["lastName"].astype(str)
player_info_df = player_info_df.assign(Name=first_names + " " + last_names)

In [10]:
# Keep only the id, combined name and position of each player.
player_columns = ["player_id", "Name", "primaryPosition"]
player_info_df = player_info_df.loc[:, player_columns]

In [11]:
# Persist the trimmed player table as a comma-separated file.
player_info_df.to_csv(path_or_buf="Output/player_info_df.csv", sep=",", header=True, index=False)

In [12]:
# Rename the columns to snake_case and append the rows to the `players` table.
# NOTE(review): with if_exists="append", re-running this cell inserts the same rows again.
players_table = player_info_df.rename(columns={"Name": "name", "primaryPosition": "primary_position"})
players_table.to_sql(name="players", con=engine, if_exists="append", index=False)

## Team Info

In [13]:
# Read the raw team table from the Resources folder.
filepath3 = "Resources/team_info.csv"
team_info_df = pd.read_csv(filepath_or_buffer=filepath3)

In [14]:
# Build the full team label as "<shortName> <teamName>".
short_names = team_info_df["shortName"].astype(str)
team_names = team_info_df["teamName"].astype(str)
team_info_df = team_info_df.assign(Team=short_names + " " + team_names)

In [15]:
# Keep only the team id and the combined team label.
team_columns = ["team_id", "Team"]
team_info_df = team_info_df.loc[:, team_columns]

In [16]:
# Remove the rows whose index labels are 18 and 32.
# NOTE(review): these labels are hard-coded; confirm which teams they refer to
# and that the source file's row order has not changed.
team_info_df = team_info_df.drop(index=[18, 32])

In [17]:
# The "<shortName> <teamName>" concatenation yields doubled labels for the two
# New York teams; map them to their proper names.
team_name_fixes = {
    "NY Rangers Rangers": "New York Rangers",
    "NY Islanders Islanders": "New York Islanders",
}
team_info_df["Team"] = team_info_df["Team"].replace(team_name_fixes)

In [18]:
# Persist the cleaned team table (header row, no index column).
team_info_df.to_csv(path_or_buf="Output/team_info_df.csv", header=True, index=False)

In [19]:
# Lower-case the label column and append the rows to the `teams` table.
# NOTE(review): with if_exists="append", re-running this cell inserts the same rows again.
teams_table = team_info_df.rename(columns={"Team": "team"})
teams_table.to_sql(name="teams", con=engine, if_exists="append", index=False)

## Merge game_info with team_info

In [20]:
# Look up the away team's label; a left join keeps every game even without a match.
new_game_df = game_df.merge(team_info_df, how="left", left_on="away_team_id", right_on="team_id")

In [21]:
# Look up the home team's label the same way. Both frames now carry "team_id"
# and "Team", so pandas suffixes them with _x (left) and _y (right).
new_game_df = new_game_df.merge(team_info_df, how="left", left_on="home_team_id", right_on="team_id")

In [22]:
# Give the suffixed label columns readable names:
# Team_x came from the away-team join, Team_y from the home-team join.
team_label_names = {"Team_x": "Away Team", "Team_y": "Home Team"}
new_game_df = new_game_df.rename(columns=team_label_names)

In [23]:
# Keep the final game columns, ordered home side first.
merged_game_columns = [
    "game_id", "season",
    "home_team_id", "Home Team",
    "away_team_id", "Away Team",
    "outcome",
]
new_game_df = new_game_df.loc[:, merged_game_columns]

In [24]:
# Persist the games-with-team-labels table (header row, no index column).
new_game_df.to_csv(path_or_buf="Output/new_game_df.csv", header=True, index=False)

In [25]:
# Rename the label columns to snake_case and append the rows to the `games` table.
# NOTE(review): with if_exists="append", re-running this cell inserts the same rows again.
games_table = new_game_df.rename(columns={"Home Team": "home_team", "Away Team": "away_team"})
games_table.to_sql(name="games", con=engine, if_exists="append", index=False)

## Skater Stats

In [26]:
# Read the pipe-delimited per-game skater statistics from the Resources folder.
filepath4 = "Resources/game_skater_stats.csv"
skater_info_df = pd.read_csv(filepath_or_buffer=filepath4, sep="|")

In [27]:
# Keep only skater rows that belong to a game present in new_game_df.
in_selected_games = skater_info_df["game_id"].isin(new_game_df["game_id"])
season_skater_df = skater_info_df[in_selected_games]

In [28]:
# Keep the key columns plus penalty minutes and renumber the rows from zero.
skater_columns = ['game_id', 'player_id', 'team_id', 'penaltyMinutes']
season_skater_df = season_skater_df[skater_columns].reset_index(drop=True)

In [29]:
# Persist the season's skater rows as a comma-separated file.
season_skater_df.to_csv(path_or_buf="Output/skater_info_df.csv", sep=",", header=True, index=False)

In [30]:
# Rename the penalty column to snake_case and append the rows to the `skater_stats` table.
# NOTE(review): with if_exists="append", re-running this cell inserts the same rows again.
skater_stats_table = season_skater_df.rename(columns={"penaltyMinutes": "penalty_minutes"})
skater_stats_table.to_sql(name="skater_stats", con=engine, if_exists="append", index=False)

## Analysis - PIM (Penalties in Minutes) by Position per Game

In [31]:
# Attach each player's position to their per-game skater stats.
# The inner join drops stat rows whose player_id is absent from player_info_df.
player_stats_df = pd.merge(season_skater_df, player_info_df, on="player_id", how="inner")

# Fold left wing (LW) and right wing (RW) into a single wing (W) position.
wing_codes = {"LW": "W", "RW": "W"}
player_stats_df['primaryPosition'] = player_stats_df['primaryPosition'].replace(wing_codes)

# Total PIM per position within each game (not split by team).
game_position_pim = (
    player_stats_df[['game_id', 'primaryPosition', 'penaltyMinutes']]
    .groupby(['game_id', 'primaryPosition'])
    .sum()
    .reset_index()
)

# Mean of those per-game totals for each position (indexed by position).
avg_position_pim = (
    game_position_pim[['primaryPosition', 'penaltyMinutes']]
    .groupby(['primaryPosition'])
    .mean()
    .rename(columns={'penaltyMinutes': 'Avg PIM by Position/Game'})
)

# Total PIM per position for each team within each game.
team_game_position_pim = (
    player_stats_df[['game_id', 'team_id', 'primaryPosition', 'penaltyMinutes']]
    .groupby(['game_id', 'team_id', 'primaryPosition'])
    .sum()
    .reset_index()
)

# Attach the per-position average to every team/game/position row.
# Note: this rebinds game_position_pim from the game-level totals above to the
# team-level table, which is what the following cell consumes.
game_position_pim = team_game_position_pim.merge(avg_position_pim, on="primaryPosition")

In [33]:
# Per-team PIM table relabelled once for the home side and once for the away side.
home_pim = game_position_pim.rename(
    columns={'primaryPosition': 'Home Team Position', 'penaltyMinutes': 'Home Team PIM'})
away_pim = game_position_pim.rename(
    columns={'primaryPosition': 'Away Team Position', 'penaltyMinutes': 'Away Team PIM'})

# Game columns carried into the result.
game_key_columns = ['game_id', 'home_team_id', 'Home Team', 'away_team_id', 'Away Team', 'outcome']

# Join the home team's per-position PIM onto each game.
game_results = new_game_df[game_key_columns].merge(
    home_pim,
    left_on=['game_id', 'home_team_id'],
    right_on=['game_id', 'team_id'],
)

# Join the away team's PIM for the same position, so each row compares like with like.
# Columns present on both sides (team_id, the average) get _x / _y suffixes here.
game_results = game_results.merge(
    away_pim,
    left_on=['game_id', 'away_team_id', 'Home Team Position'],
    right_on=['game_id', 'team_id', 'Away Team Position'],
)

# Keep the presentation columns and strip the merge suffix from the average.
result_columns = [
    'game_id',
    'home_team_id', 'Home Team', 'Home Team Position', 'Home Team PIM',
    'away_team_id', 'Away Team', 'Away Team Position', 'Away Team PIM',
    'outcome',
    'Avg PIM by Position/Game_x',
]
game_results = game_results[result_columns].rename(
    columns={'Avg PIM by Position/Game_x': 'Avg PIM by Position/Game'})

In [34]:
# Display the combined table: one row per game and position, with the home and
# away teams' PIM side by side and the per-position average.
game_results

Unnamed: 0,game_id,home_team_id,Home Team,Home Team Position,Home Team PIM,away_team_id,Away Team,Away Team Position,Away Team PIM,outcome,Avg PIM by Position/Game
0,2017020812,7,Buffalo Sabres,C,0,24,Anaheim Ducks,C,2,away win OT,4.479150
1,2017020812,7,Buffalo Sabres,D,4,24,Anaheim Ducks,D,2,away win OT,6.016522
2,2017020812,7,Buffalo Sabres,W,4,24,Anaheim Ducks,W,2,away win OT,6.182533
3,2017020586,24,Anaheim Ducks,C,7,20,Calgary Flames,C,8,home win REG,4.479150
4,2017020586,24,Anaheim Ducks,D,6,20,Calgary Flames,D,0,home win REG,6.016522
...,...,...,...,...,...,...,...,...,...,...,...
3808,2017020704,26,Los Angeles Kings,D,2,5,Pittsburgh Penguins,D,4,away win REG,6.016522
3809,2017020704,26,Los Angeles Kings,W,19,5,Pittsburgh Penguins,W,0,away win REG,6.182533
3810,2017020726,10,Toronto Maple Leafs,C,0,21,Colorado Avalanche,C,2,away win REG,4.479150
3811,2017020726,10,Toronto Maple Leafs,D,0,21,Colorado Avalanche,D,2,away win REG,6.016522
