# <font color=teal>imports</font>

In [1]:
import os
import sys
sys.path.append(os.path.abspath("../src"))

In [2]:
from src import *


# <font color=teal>housekeeping</font>

In [3]:
db = database_loader.DatabaseLoader(get_config('connection_string'))
DEBUG=False

# <font color=teal>next gen stats passing<font/>
group by <font color=red>season, week, team</font> ( and top-passing-player_position )

In [4]:
%%time
ngs_air_power = db.query_to_df("""
    with base as (
    select season, week, team_abbr as team,
           pass_touchdowns,
           avg_time_to_throw,
           avg_completed_air_yards,
           avg_intended_air_yards,
           avg_air_yards_differential,
           aggressiveness,
           max_completed_air_distance,
           avg_air_yards_to_sticks,
           attempts,
           pass_yards,
           pass_touchdowns,
           interceptions,
           passer_rating,
           completions,
           completion_percentage,
           expected_completion_percentage,
           completion_percentage_above_expectation,
           avg_air_distance,
           max_air_distance,
        row_number() over (partition by season, week, team_abbr, player_position order by pass_yards desc) as rn
    from controls.nextgen_pass
--    where season=2016 and week=1 and team_abbr = 'CHI'
    order by team_abbr, player_position, season desc, week )
    select * from base where rn = 1 and week > 0
""")

ngs_air_power.head()


CPU times: user 62.3 ms, sys: 8.8 ms, total: 71.1 ms
Wall time: 160 ms


Unnamed: 0,season,week,team,pass_touchdowns,avg_time_to_throw,avg_completed_air_yards,avg_intended_air_yards,avg_air_yards_differential,aggressiveness,max_completed_air_distance,...,pass_touchdowns.1,interceptions,passer_rating,completions,completion_percentage,expected_completion_percentage,completion_percentage_above_expectation,avg_air_distance,max_air_distance,rn
0,2022,1,ARI,2,2.803059,3.766818,7.508125,-3.741307,14.705882,38.423006,...,2,0,99.264706,22,64.705882,63.872274,0.833608,20.780448,55.361811,1
1,2022,2,ARI,1,2.631918,3.560323,5.360213,-1.79989,16.326531,38.988774,...,1,1,76.658163,31,63.265306,67.953796,-4.68849,17.729007,39.69762,1
2,2022,3,ARI,0,2.432569,3.848108,5.682364,-1.834256,8.62069,37.932599,...,0,0,77.801724,37,63.793103,72.002558,-8.209454,18.99306,51.283168,1
3,2022,4,ARI,2,2.733469,4.967391,5.964839,-0.997447,3.125,38.072149,...,2,1,96.744792,23,71.875,67.395659,4.479341,20.337519,44.22631,1
4,2022,5,ARI,1,2.68195,3.378571,6.814474,-3.435902,11.904762,28.25543,...,1,1,80.456349,28,66.666667,63.948053,2.718614,20.096993,47.951656,1


## <font color=teal>next gen stats rushing<font/>
group by <font color=red>season, week, team</font>

In [5]:
%%time
ngs_ground_power = db.query_to_df("""
with base as (
    select season, week, team_abbr as team, rush_yards,
           efficiency,
           percent_attempts_gte_eight_defenders,
           avg_time_to_los,
           rush_attempts,
           rush_yards,
           expected_rush_yards,
           rush_yards_over_expected,
           avg_rush_yards,
           rush_yards_over_expected_per_att,
           rush_pct_over_expected,
           rush_touchdowns,
           player_gsis_id,
           player_first_name,
           player_last_name,
           player_jersey_number,
           player_short_name,
           row_number() over (partition by season, week, team_abbr order by rush_yards desc) as rn
    from controls.nextgen_rush
    order by  team_abbr, season desc, week)
select * from base where week > 0
""" )

ngs_ground_power.head()

CPU times: user 14.1 ms, sys: 2.36 ms, total: 16.5 ms
Wall time: 30.9 ms


Unnamed: 0,season,week,team,rush_yards,efficiency,percent_attempts_gte_eight_defenders,avg_time_to_los,rush_attempts,rush_yards.1,expected_rush_yards,...,avg_rush_yards,rush_yards_over_expected_per_att,rush_pct_over_expected,rush_touchdowns,player_gsis_id,player_first_name,player_last_name,player_jersey_number,player_short_name,rn
0,2022,1,ARI,26,5.930769,10.0,2.445889,10,26,35.285705,...,2.6,-0.92857,0.4,1,00-0033553,James,Conner,6,J.Conner,1
1,2022,3,ARI,39,5.421026,7.692308,2.580545,13,39,56.352659,...,3.0,-1.33482,0.230769,0,00-0033553,James,Conner,6,J.Conner,1
2,2022,4,ARI,55,3.860545,40.0,2.565267,15,55,44.522278,...,3.666667,0.534123,0.642857,0,00-0033553,James,Conner,6,J.Conner,1
3,2022,6,ARI,37,5.610541,6.666667,2.708,15,37,53.014218,...,2.466667,-1.215301,0.285714,0,00-0036383,Eno,Benjamin,26,E.Benjamin,1
4,2022,7,ARI,92,2.619457,8.333333,2.810818,12,92,44.221202,...,7.666667,3.525345,0.545455,1,00-0036383,Eno,Benjamin,26,E.Benjamin,1


# <font color=teal>play-by-play events<font/>
players are called out for certain events like fumbles, touchdowns, etc. in play-by-play
we already picked these out during the transform step,
  and expanded so that each team has its own records irrespective of the opposing team played.
Now we pivot and sum all events by  <font color=red>season, week, team</font>

In [6]:
%%time
pbp_events = db.query_to_df("""
SELECT
    season, week, team,
    SUM(CASE WHEN event = 'fumble' THEN 1 else 0 END) AS fumble,
    SUM(CASE WHEN event = 'own_kickoff_recovery' THEN 1 else 0 END) AS own_kickoff_recovery,
    SUM(CASE WHEN event = 'safety' THEN 1 else 0 END) AS safety,
    SUM(CASE WHEN event = 'tackle' THEN 1 else 0 END) AS tackle,
    SUM(CASE WHEN event = 'qb_hit' THEN 1 else 0  END) AS qb_hit,
    SUM(CASE WHEN event = 'touchdown' THEN 1  else 0 END) AS touchdown,
    SUM(CASE WHEN event = 'interception' THEN 1 else 0 END) AS interception,
    SUM(CASE WHEN event = 'sack' THEN 1 else 0 END) AS sack
FROM controls.player_events where week > 0
group by season, week, team
order by season desc, team, week
""")

pbp_events.head()

CPU times: user 7.76 ms, sys: 1.34 ms, total: 9.1 ms
Wall time: 63.7 ms


Unnamed: 0,season,week,team,fumble,own_kickoff_recovery,safety,tackle,qb_hit,touchdown,interception,sack
0,2022,1,ARI,6,0,0,75,6,3,0,0
1,2022,2,ARI,2,0,0,61,5,4,0,1
2,2022,3,ARI,2,0,0,54,5,0,0,1
3,2022,4,ARI,3,0,0,57,3,3,2,2
4,2022,5,ARI,0,0,0,86,6,2,0,3


# <font color=teal>Player stats<font/>
Each player's stats by are collected by game and play
For this dimension reduction exercise we roll up to <font color=red>season, week, team</font>

In [7]:
%%time
player_stats = db.query_to_df("""
select
    season,
    week,
    team,
    sum(completions) as ps_completions,
    sum(attempts) as ps_attempts,
    sum(passing_yards) as passing_yards,
    sum(passing_tds) as passing_tds,
    sum(interceptions) as ps_interceptions,
    sum(sacks) as sacks,
    sum(sack_yards) as sack_yards,
    sum(sack_fumbles) as sack_fumbles,
    sum(sack_fumbles_lost) as sack_fumbles_lost,
    sum(passing_air_yards) as passing_air_yards,
    sum(passing_yards_after_catch) as passing_yards_after_catch,
    sum(passing_first_downs) as passing_first_downs,
    avg(passing_epa) as passing_epa,
    sum(passing_2pt_conversions) as passing_2pt_conversions,
    avg(pacr) as avg_pacr,
    avg(dakota) as avg_dakota,
    sum(carries) as carries,
    sum(rushing_yards) as rushing_yards,
    sum(rushing_tds) as rushing_tds,
    sum(rushing_fumbles) as rushing_fumbles,
    sum(rushing_fumbles_lost) as rushing_fumbles_lost,
    sum(rushing_first_downs) as rushing_first_downs,
    avg(rushing_epa) as avg_rushing_epa,
    sum(rushing_2pt_conversions) as rushing_2pt_conversions,
    sum(receptions) as receptions,
    sum(targets) as targets,
    sum(receiving_yards) as receiving_yards,
    sum(receiving_tds) as receiving_tds,
    sum(receiving_fumbles) as receiving_fumbles,
    sum(receiving_fumbles_lost) as receiving_fumbles_lost,
    sum(receiving_air_yards) as receiving_air_yards,
    sum(receiving_yards_after_catch) as receiving_yards_after_catch,
    sum(receiving_first_downs) as receiving_first_downs,
    avg(receiving_epa) as avg_receiving_epa,
    sum(receiving_2pt_conversions) as receiving_2pt_conversions,
    sum(racr) as racr,
    sum(target_share) as target_share,
    sum(air_yards_share) as air_yards_share,
    sum(wopr) as wopr,
    sum(special_teams_tds) as special_teams_tds
from controls.player_stats
group by season,
week,
team
order by season desc, team,  week
""")

player_stats.head()

CPU times: user 113 ms, sys: 20.3 ms, total: 133 ms
Wall time: 462 ms


Unnamed: 0,season,week,team,ps_completions,ps_attempts,passing_yards,passing_tds,ps_interceptions,sacks,sack_yards,...,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,avg_receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds
0,2022,1,ARI,24,38,205,2,0,3,26,...,266,115,12,0.680728,1,23.36395,1.0,1.0,2.2,0
1,2022,2,ARI,31,49,277,1,1,1,7,...,239,162,13,-0.302323,1,4.044793,1.0,1.0,2.2,0
2,2022,3,ARI,37,58,314,0,0,2,19,...,332,166,15,0.12071,0,-14.124653,1.0,1.0,2.2,0
3,2022,4,ARI,23,32,207,2,1,1,1,...,182,87,10,0.059163,0,1.130505,1.0,1.0,2.2,0
4,2022,5,ARI,28,42,250,1,1,1,11,...,293,144,16,1.347603,0,-14.683343,1.0,1.0,2.2,0


# <font color=teal>play by play analytics</font>
Analytics are part of the play-by-play dataset - they are collected for each play in each game.

For this dimensionality reduction step we roll them up to the player stats level.

The stats we get are for each play, and those probabilities
  and play level incrementals like WPA and EPA don't make sense in a rollup like this (I think)
  so for this rollup we'll use just EP and WP and well take the first and last metric form each season, week, team groping

We'll also separate home and away teams into their own set, so that for each game there will be two separate sets of analytics, one for the home team and one fr the away team.

### <font color="#9370DB">helper functions</font>

In [8]:
from typing import List, NamedTuple

class Col(NamedTuple):
    name: str
    alias: str

def build_pivot_sql(team_col: str,  pivot_cols: List[Col], additional_cols: List[Col]):
    db_table = 'controls.play_analytics'

    base_cols = ['season',
                 'week']
    metrics = []
    for col in pivot_cols:
        metrics.append(
            f"""
            MAX(CASE WHEN RN = 1 THEN {col.alias} END) AS start_{col.alias},
            MAX(CASE WHEN RN = (total_rows/2) THEN {col.alias} END) AS half_{col.alias}""")

    inner_cols = base_cols +  [f"{team_col} as team "] + [f"{p.name} as {p.alias}" for p in pivot_cols] + [f"{p.name} as {p.alias}" for p in additional_cols]
    inner_select = ",".join(inner_cols)

    outer_cols = base_cols + ["team"] +  [f"{p.alias}" for p in additional_cols] + metrics
    outer_select = ",".join(outer_cols)
    outer_group = ",".join(base_cols + ["team"] +  [f"{p.alias}" for p in additional_cols])

    station_cols = base_cols + [team_col]
    stations = ",".join(station_cols)

    sql = f"""
    WITH ranked_rows AS (
        SELECT {inner_select},
             ROW_NUMBER() OVER (PARTITION BY
                {stations} ORDER BY play_counter) AS RN,
             COUNT(*) OVER (PARTITION BY
                {stations}) AS total_rows
        FROM {db_table}
    )
    SELECT
        {outer_select}
    FROM ranked_rows
    GROUP BY {outer_group}
    order by season desc, team, week
    """

    if DEBUG:
        print(sql)
    return sql

def build_pivot(team_col: str,  pivot_cols: List[Col],  additional_cols: List[Col]):
    sql = build_pivot_sql(team_col, pivot_cols, additional_cols)
    df = db.query_to_df( sql )
    return df


### <font color="#9370DB">home team statistics</font>

In [9]:
%%time
pivot_cols = [
    Col('home_wp', "team_wp"),
    Col('away_wp', "opponent_wp"),
]

home_analytics_df = build_pivot(team_col="home_team", pivot_cols=pivot_cols, additional_cols=[Col("away_team", "opponent")])
home_analytics_df.head()

CPU times: user 3.33 ms, sys: 1.14 ms, total: 4.46 ms
Wall time: 745 ms


Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2022,1,ARI,KC,0.566792,0.108674,0.433208,0.891326
1,2022,3,ARI,LA,0.546262,0.237108,0.453738,0.762892
2,2022,5,ARI,PHI,0.546262,0.35016,0.453738,0.64984
3,2022,7,ARI,NO,0.566792,0.835941,0.433208,0.164059
4,2022,9,ARI,SEA,0.566792,0.436939,0.433208,0.563061


### <font color="#9370DB">away team statistics</font>

In [10]:
%%time
pivot_cols = [
    Col('away_wp', "team_wp"),
    Col('home_wp', "opponent_wp")
]

away_analytics_df = build_pivot(team_col="away_team", pivot_cols=pivot_cols, additional_cols=[Col("home_team", "opponent")])
away_analytics_df.head()


CPU times: user 3.3 ms, sys: 1.13 ms, total: 4.43 ms
Wall time: 658 ms


Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2022,2,ARI,LV,0.453738,0.034517,0.546262,0.965483
1,2022,4,ARI,CAR,0.433208,0.213143,0.566792,0.786857
2,2022,6,ARI,SEA,0.433208,0.298779,0.566792,0.701221
3,2022,8,ARI,MIN,0.453738,0.305759,0.546262,0.694241
4,2022,10,ARI,LA,0.453738,0.895184,0.546262,0.104816


### <font color="#9370DB">home and away team statistics appended together</font>

In [11]:
team_analytics = pd.concat([home_analytics_df, away_analytics_df])
team_analytics.head()

Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2022,1,ARI,KC,0.566792,0.108674,0.433208,0.891326
1,2022,3,ARI,LA,0.546262,0.237108,0.453738,0.762892
2,2022,5,ARI,PHI,0.546262,0.35016,0.453738,0.64984
3,2022,7,ARI,NO,0.566792,0.835941,0.433208,0.164059
4,2022,9,ARI,SEA,0.566792,0.436939,0.433208,0.563061


# <font color=teal>validate that all of these datasets will merge horizontally</font>
We want one record for each season, week and team.

The metrics themselves pivot horizontally
We expect that with each merge the **number of columns grows but the row count stays the same**


Since we are using this data for dimensionality reduction it's ok if we loose a few rows on the join,
at least for this step.

### <font color="#9370DB">helper functions</font>

In [12]:
from pandas import DataFrame

def calc_coverage(title: str, df: DataFrame):
    first = df.season.min()
    last = df.season.max()
    first_wk = df.week.min()
    last_wk = df.week.max()
    seasons = df.season.nunique()
    print(f"Shape of {title:30}:  {df.shape},\t Contains {seasons} seasons, starting with {first} and ending in {last} min week: {first_wk}, max week : {last_wk}")

def print_columns(title, df):
    print(f"\n---------\n{title.strip()} colums")
    for col in df.columns:
        print(col)

### <font color="#9370DB">get shapes before merge</font>

In [13]:
calc_coverage("Team analytics ", team_analytics)
calc_coverage("ngs_air_power  ", ngs_air_power)
calc_coverage("ngs_ground_power ", ngs_ground_power)
calc_coverage("pbp_events  ", pbp_events)
calc_coverage("player_stats  ", player_stats)


Shape of Team analytics                :  (3812, 8),	 Contains 7 seasons, starting with 2016 and ending in 2022 min week: 1, max week : 22
Shape of ngs_air_power                 :  (3778, 23),	 Contains 7 seasons, starting with 2016 and ending in 2022 min week: 1, max week : 23
Shape of ngs_ground_power              :  (3825, 21),	 Contains 7 seasons, starting with 2016 and ending in 2022 min week: 1, max week : 23
Shape of pbp_events                    :  (3834, 11),	 Contains 7 seasons, starting with 2016 and ending in 2022 min week: 1, max week : 22
Shape of player_stats                  :  (12836, 43),	 Contains 24 seasons, starting with 1999 and ending in 2022 min week: 1, max week : 22


In [14]:
ngs_air_power.drop(columns=['rn'], inplace=True)
ngs_ground_power.drop(columns=['rn'], inplace=True)

In [15]:
if DEBUG:
    print_columns("Team analytics ", team_analytics)
    print_columns("ngs_air_power  ", ngs_air_power)
    print_columns("ngs_ground_power ", ngs_ground_power)
    print_columns("pbp_events  ", pbp_events)
    print_columns("player_stats  ", player_stats)

### <font color="#9370DB">get shapes after each  merge</font>

In [16]:
%%time
merged = pd.merge(ngs_ground_power, ngs_air_power, on=['season', 'week', 'team'])
print("shape after merging ngs_ground_power + ngs_air_power ", merged.shape)
merged = pd.merge(merged, pbp_events, on=['season', 'week', 'team'])
print("shape after merging merged + pbp_events              ", merged.shape)
merged = pd.merge(merged, player_stats, on=['season', 'week', 'team'])
print("shape after merging merged + player_stats            ", merged.shape)
merged = pd.merge(merged, team_analytics, on=['season', 'week', 'team'])
print("shape after merging merged + team_analytics          ", merged.shape)

merged.shape

shape after merging ngs_ground_power + ngs_air_power  (3788, 39)
shape after merging merged + pbp_events               (3569, 47)
shape after merging merged + player_stats             (3569, 87)
shape after merging merged + team_analytics           (3569, 92)
CPU times: user 27.7 ms, sys: 3.84 ms, total: 31.5 ms
Wall time: 30.9 ms


(3569, 92)

In [17]:
%%time
overlaps = 0
for col in merged.columns:
    if str(col).endswith("_y") or str(col).endswith("_x") or str(col) == "rn":
        print(col)
        overlaps += 1

assert overlaps == 0


CPU times: user 40 µs, sys: 1 µs, total: 41 µs
Wall time: 42.2 µs
