In [157]:
import pandas as pd
from pandasql import sqldf
from sqlalchemy import create_engine, text



In [158]:
from NFLVersReader.src.nflverse_clean.pbp_job import perform_workflow

# results = perform_workflow()

In [159]:
# results.keys()

In [160]:
from NFLVersReader.src.nflverse_clean.configs import get_config
from NFLVersReader.src.nflverse_clean.database_loader import DatabaseLoader

db = DatabaseLoader(get_config('connection_string'))

# <font color=teal>next gen stats passing<font/>

In [161]:
ngs_air_power = db.query_to_df("""
    with base as (
    select season, week, team_abbr as team,
           pass_touchdowns,
           avg_time_to_throw,
           avg_completed_air_yards,
           avg_intended_air_yards,
           avg_air_yards_differential,
           aggressiveness,
           max_completed_air_distance,
           avg_air_yards_to_sticks,
           attempts,
           pass_yards,
           pass_touchdowns,
           interceptions,
           passer_rating,
           completions,
           completion_percentage,
           expected_completion_percentage,
           completion_percentage_above_expectation,
           avg_air_distance,
           max_air_distance,
        row_number() over (partition by season, week, team_abbr, player_position order by pass_yards desc) as rn
    from controls.nextgen_pass
--    where season=2016 and week=1 and team_abbr = 'CHI'
    order by team_abbr, player_position, season desc, week )
    select * from base where rn = 1 and week > 0
""")

ngs_air_power.head()


Unnamed: 0,season,week,team,pass_touchdowns,avg_time_to_throw,avg_completed_air_yards,avg_intended_air_yards,avg_air_yards_differential,aggressiveness,max_completed_air_distance,...,pass_touchdowns.1,interceptions,passer_rating,completions,completion_percentage,expected_completion_percentage,completion_percentage_above_expectation,avg_air_distance,max_air_distance,rn
0,2016,1,ARI,2,2.467784,7.406667,11.025946,-3.619279,16.216216,45.425245,...,2,0,104.673423,24,64.864865,60.037599,4.827266,24.857483,45.425245,1
1,2016,2,ARI,3,2.7427,11.406471,14.317,-2.910529,16.666667,48.820791,...,3,0,124.861111,17,56.666667,47.003623,9.663043,28.609856,57.395959,1
2,2016,3,ARI,0,2.47584,7.529615,11.4246,-3.894985,24.0,37.763608,...,0,4,36.0,26,52.0,60.366606,-8.366606,24.475806,53.821172,1
3,2016,4,ARI,1,2.485611,8.442609,10.628333,-2.185725,11.111111,41.334355,...,1,1,86.342593,23,63.888889,59.860299,4.02859,24.623706,54.582351,1
4,2016,5,ARI,2,2.924143,7.213636,12.756786,-5.543149,32.142857,47.097164,...,2,0,77.083333,11,39.285714,50.875469,-11.589754,26.525082,56.684255,1


## <font color=teal>next gen stats rushing<font/>

In [162]:
ngs_ground_power = db.query_to_df("""
with junk as (
    select season, week, team_abbr as team, rush_yards,
           efficiency,
           percent_attempts_gte_eight_defenders,
           avg_time_to_los,
           rush_attempts,
           rush_yards,
           expected_rush_yards,
           rush_yards_over_expected,
           avg_rush_yards,
           rush_yards_over_expected_per_att,
           rush_pct_over_expected,
           rush_touchdowns,
           player_gsis_id,
           player_first_name,
           player_last_name,
           player_jersey_number,
           player_short_name,
               row_number() over (partition by season, week, team_abbr order by rush_yards desc) as rn
    from controls.nextgen_rush
    order by  team_abbr, season desc, week)
select * from junk where week > 0
""" )

ngs_ground_power.head()

Unnamed: 0,season,week,team,rush_yards,efficiency,percent_attempts_gte_eight_defenders,avg_time_to_los,rush_attempts,rush_yards.1,expected_rush_yards,...,avg_rush_yards,rush_yards_over_expected_per_att,rush_pct_over_expected,rush_touchdowns,player_gsis_id,player_first_name,player_last_name,player_jersey_number,player_short_name,rn
0,2016,1,ARI,89,3.107978,43.75,2.616188,16,89,,...,5.5625,,,1,00-0032187,David,Johnson,31,D.Johnson,1
1,2016,2,ARI,54,4.591296,50.0,2.450687,16,54,,...,3.375,,,1,00-0026164,Chris,Johnson,23,C.Johnson,1
2,2016,2,ARI,45,5.030444,8.333333,3.162417,12,45,,...,3.75,,,0,00-0032187,David,Johnson,31,D.Johnson,2
3,2016,3,ARI,83,4.171807,26.315789,2.619,19,83,,...,4.368421,,,2,00-0032187,David,Johnson,31,D.Johnson,1
4,2016,4,ARI,83,3.722651,29.411765,2.700235,17,83,,...,4.882353,,,0,00-0032187,David,Johnson,31,D.Johnson,1


# <font color=teal>play-by-play offense events<font/>

In [163]:
pbp_events = db.query_to_df("""
SELECT
    season, week, team,
    SUM(CASE WHEN event = 'fumble' THEN 1 else 0 END) AS fumble,
    SUM(CASE WHEN event = 'own_kickoff_recovery' THEN 1 else 0 END) AS own_kickoff_recovery,
    SUM(CASE WHEN event = 'safety' THEN 1 else 0 END) AS safety,
    SUM(CASE WHEN event = 'tackle' THEN 1 else 0 END) AS tackle,
    SUM(CASE WHEN event = 'qb_hit' THEN 1 else 0  END) AS qb_hit,
    SUM(CASE WHEN event = 'touchdown' THEN 1  else 0 END) AS touchdown,
    SUM(CASE WHEN event = 'interception' THEN 1 else 0 END) AS interception,
    SUM(CASE WHEN event = 'sack' THEN 1 else 0 END) AS sack
FROM controls.player_events where week > 0
group by season, week, team
order by season desc, team, week
""")

pbp_events.head()

Unnamed: 0,season,week,team,fumble,own_kickoff_recovery,safety,tackle,qb_hit,touchdown,interception,sack
0,2021,1,ARI,3,0,0,65,9,5,1,6
1,2021,2,ARI,3,0,0,63,4,4,0,1
2,2021,3,ARI,4,0,0,73,7,4,2,2
3,2021,4,ARI,2,0,0,58,5,4,1,0
4,2021,5,ARI,4,0,0,60,8,2,1,3


# <font color=teal>Player stats<font/>

In [164]:
player_stats = db.query_to_df("""
select
    season,
    week,
    team,
    sum(completions) as completions,
    sum(attempts) as attempts,
    sum(passing_yards) as passing_yards,
    sum(passing_tds) as passing_tds,
    sum(interceptions) as interceptions,
    sum(sacks) as sacks,
    sum(sack_yards) as sack_yards,
    sum(sack_fumbles) as sack_fumbles,
    sum(sack_fumbles_lost) as sack_fumbles_lost,
    sum(passing_air_yards) as passing_air_yards,
    sum(passing_yards_after_catch) as passing_yards_after_catch,
    sum(passing_first_downs) as passing_first_downs,
    avg(passing_epa) as passing_epa,
    sum(passing_2pt_conversions) as passing_2pt_conversions,
    avg(pacr) as avg_pacr,
    avg(dakota) as avg_dakota,
    sum(carries) as carries,
    sum(rushing_yards) as rushing_yards,
    sum(rushing_tds) as rushing_tds,
    sum(rushing_fumbles) as rushing_fumbles,
    sum(rushing_fumbles_lost) as rushing_fumbles_lost,
    sum(rushing_first_downs) as rushing_first_downs,
    avg(rushing_epa) as avg_rushing_epa,
    sum(rushing_2pt_conversions) as rushing_2pt_conversions,
    sum(receptions) as receptions,
    sum(targets) as targets,
    sum(receiving_yards) as receiving_yards,
    sum(receiving_tds) as receiving_tds,
    sum(receiving_fumbles) as receiving_fumbles,
    sum(receiving_fumbles_lost) as receiving_fumbles_lost,
    sum(receiving_air_yards) as receiving_air_yards,
    sum(receiving_yards_after_catch) as receiving_yards_after_catch,
    sum(receiving_first_downs) as receiving_first_downs,
    avg(receiving_epa) as avg_receiving_epa,
    sum(receiving_2pt_conversions) as receiving_2pt_conversions,
    sum(racr) as racr,
    sum(target_share) as target_share,
    sum(air_yards_share) as air_yards_share,
    sum(wopr) as wopr,
    sum(special_teams_tds) as special_teams_tds
from controls.player_stats
group by season,
week,
team
order by season desc, team,  week""")

player_stats.head()

Unnamed: 0,season,week,team,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,...,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,avg_receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds
0,2022,1,ARI,24,38,205,2,0,3,26,...,266,115,12,0.680728,1,23.36395,1.0,1.0,2.2,0
1,2022,2,ARI,31,49,277,1,1,1,7,...,239,162,13,-0.302323,1,4.044793,1.0,1.0,2.2,0
2,2022,3,ARI,37,58,314,0,0,2,19,...,332,166,15,0.12071,0,-14.124653,1.0,1.0,2.2,0
3,2022,4,ARI,23,32,207,2,1,1,1,...,182,87,10,0.059163,0,1.130505,1.0,1.0,2.2,0
4,2022,5,ARI,28,42,250,1,1,1,11,...,293,144,16,1.347603,0,-14.683343,1.0,1.0,2.2,0


In [165]:
from typing import List, NamedTuple

class Col(NamedTuple):
    name: str
    alias: str

def build_pivot_sql(team_col: str,  pivot_cols: List[Col], additional_cols: List[Col]):
    db_table = 'controls.play_analytics'

    base_cols = ['season',
                 'week']
    metrics = []
    for col in pivot_cols:
        metrics.append(
            f"""
            MAX(CASE WHEN RN = 1 THEN {col.alias} END) AS start_{col.alias},
            MAX(CASE WHEN RN = (total_rows/2) THEN {col.alias} END) AS half_{col.alias}""")

    inner_cols = base_cols +  [f"{team_col} as team "] + [f"{p.name} as {p.alias}" for p in pivot_cols] + [f"{p.name} as {p.alias}" for p in additional_cols]
    inner_select = ",".join(inner_cols)

    outer_cols = base_cols + ["team"] +  [f"{p.alias}" for p in additional_cols] + metrics
    outer_select = ",".join(outer_cols)
    outer_group = ",".join(base_cols + ["team"] +  [f"{p.alias}" for p in additional_cols])

    station_cols = base_cols + [team_col]
    stations = ",".join(station_cols)

    sql = f"""
    WITH ranked_rows AS (
        SELECT {inner_select},
             ROW_NUMBER() OVER (PARTITION BY
                {stations} ORDER BY play_counter) AS RN,
             COUNT(*) OVER (PARTITION BY
                {stations}) AS total_rows
        FROM {db_table}
    )
    SELECT
        {outer_select}
    FROM ranked_rows
    GROUP BY {outer_group}
    order by season desc, team, week
    """

    # print(sql)
    return sql

def build_pivot(team_col: str,  pivot_cols: List[Col],  additional_cols: List[Col]):
    return db.query_to_df(  build_pivot_sql(team_col, pivot_cols, additional_cols) )


In [166]:
pivot_cols = [
    Col('home_wp', "team_wp"),
    Col('away_wp', "opponent_wp"),
]

home_analytics_df = build_pivot(team_col="home_team", pivot_cols=pivot_cols, additional_cols=[Col("away_team", "opponent")])
home_analytics_df.head()

Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2021,2,ARI,MIN,0.566792,0.541654,0.433208,0.458346
1,2021,5,ARI,SF,0.566792,0.85329,0.433208,0.14671
2,2021,7,ARI,HOU,0.546262,0.870703,0.453738,0.129297
3,2021,8,ARI,GB,0.566792,0.362228,0.433208,0.637772
4,2021,10,ARI,CAR,0.546262,0.045256,0.453738,0.954744


In [167]:
pivot_cols = [
    Col('away_wp', "team_wp"),
    Col('home_wp', "opponent_wp")
]

away_analytics_df = build_pivot(team_col="away_team", pivot_cols=pivot_cols, additional_cols=[Col("home_team", "opponent")])
away_analytics_df.head()


Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2021,1,ARI,TEN,0.453738,0.875492,0.546262,0.124508
1,2021,3,ARI,JAX,0.453738,0.286349,0.546262,0.713651
2,2021,4,ARI,LA,0.453738,0.846534,0.546262,0.153466
3,2021,6,ARI,CLE,0.453738,0.931544,0.546262,0.068456
4,2021,9,ARI,SF,0.453738,0.804995,0.546262,0.195005


In [233]:
team_analytics = pd.concat([home_analytics_df, away_analytics_df])
team_analytics.head()

Unnamed: 0,season,week,team,opponent,start_team_wp,half_team_wp,start_opponent_wp,half_opponent_wp
0,2021,2,ARI,MIN,0.566792,0.541654,0.433208,0.458346
1,2021,5,ARI,SF,0.566792,0.85329,0.433208,0.14671
2,2021,7,ARI,HOU,0.546262,0.870703,0.453738,0.129297
3,2021,8,ARI,GB,0.566792,0.362228,0.433208,0.637772
4,2021,10,ARI,CAR,0.546262,0.045256,0.453738,0.954744


In [237]:
from pandas import DataFrame


def calc_coverage(title: str, df: DataFrame):
    first = df.season.min()
    last = df.season.max()
    first_wk = df.week.min()
    last_wk = df.week.max()
    seasons = df.season.nunique()
    print(f"Shape of {title:30}:  {df.shape},\t Contains {seasons} seasons, starting with {first} and ending in {last} min week: {first_wk}, max week : {last_wk}")



In [238]:
calc_coverage("Team analytics ", team_analytics)
calc_coverage("ngs_air_power  ", ngs_air_power)
calc_coverage("ngs_ground_power ", ngs_ground_power)
calc_coverage("pbp_events  ", pbp_events)
calc_coverage("player_stats  ", player_stats)


Shape of Team analytics                :  (570, 8),	 Contains 1 seasons, starting with 2021 and ending in 2021 min week: 1, max week : 22
Shape of ngs_air_power                 :  (531, 23),	 Contains 1 seasons, starting with 2016 and ending in 2016 min week: 1, max week : 22
Shape of ngs_ground_power              :  (526, 21),	 Contains 1 seasons, starting with 2016 and ending in 2016 min week: 1, max week : 22
Shape of pbp_events                    :  (572, 11),	 Contains 1 seasons, starting with 2021 and ending in 2021 min week: 1, max week : 22
Shape of player_stats                  :  (12836, 43),	 Contains 24 seasons, starting with 1999 and ending in 2022 min week: 1, max week : 22


In [253]:

merged = pd.merge(ngs_ground_power, ngs_air_power, on=['season', 'week', 'team'])
merged = pd.merge(merged, pbp_events, on=['season', 'week', 'team'])
merged.shape

# merged = pd.merge(merged, pbp_events, on=['season', 'week', 'team'])
# merged.shape

# merged = pd.merge(merged, team_analytics, on=['season', 'week', 'team'])


(0, 49)

In [248]:
ngs_ground_power.loc[(ngs_ground_power.season==2016) & (ngs_ground_power.week==11) & (ngs_ground_power.team=='BAL')]


Unnamed: 0,season,week,team,rush_yards,efficiency,percent_attempts_gte_eight_defenders,avg_time_to_los,rush_attempts,rush_yards.1,expected_rush_yards,...,avg_rush_yards,rush_yards_over_expected_per_att,rush_pct_over_expected,rush_touchdowns,player_gsis_id,player_first_name,player_last_name,player_jersey_number,player_short_name,rn
