# Day 41

Yesterday I developed the query to combine the majority of stats I have for QBs, WRs, RBs, and TEs. 

My focus today is cleaning up the query by removing duplicate columns post-join and uploading to Tableau for initial analysis.

In [9]:
import pandas as pd
import sqlite3

# Create database connection
conn = sqlite3.connect('../../data/db/database.db')

## Query the Data

In [10]:
query = """
WITH weekly_data AS (
    SELECT
        player_id,
        player_display_name AS player_name,
        position,
        recent_team,
        season,
        week,
        season_type,
        sacks,
        passing_air_yards AS pass_air_yds,
        passing_yards_after_catch AS pass_yac,
        passing_epa AS pass_epa,
        passing_2pt_conversions AS pass_2pt_conv,
        pacr,
        rushing_epa AS rush_epa, 
        receiving_air_yards AS rec_air_yds,
        receiving_yards_after_catch AS rec_yac,
        receiving_epa AS rec_epa,
        racr,
        target_share,
        air_yards_share,
        wopr,
        fantasy_points AS fantasy_pts,
        fantasy_points_ppr AS fantasy_pts_ppr
    FROM weekly
    WHERE season = 2022
        AND week <= 12
        AND position IN ('QB', 'WR', 'RB', 'TE')),
pfr_pass_data AS (
    SELECT
        season,
        week,
        opponent,
        pfr_player_name AS player_name,
        passing_bad_throws,
        passing_bad_throw_pct,
        times_sacked,
        times_blitzed,
        times_hurried,
        times_hit,
        times_pressured,
        times_pressured_pct,
        ids.gsis_id
    FROM pfr_pass
    JOIN ids
        ON ids.pfr_id = pfr_pass.pfr_player_id
    WHERE season = 2022
        AND week <= 12),
pfr_rec_data AS (
    SELECT
        season,
        week,
        opponent,
        pfr_player_name AS player_name,
        receiving_broken_tackles,
        receiving_drop,
        receiving_drop_pct,
        receiving_int,
        receiving_rat,
        ids.gsis_id
    FROM pfr_rec
    JOIN ids
        ON ids.pfr_id = pfr_rec.pfr_player_id
    WHERE season = 2022
        AND week <= 12),
pfr_rush_data AS (
    SELECT
        season,
        week,
        opponent,
        pfr_player_name AS player_name,
        carries,
        rushing_yards_before_contact AS rush_yds_before_contact,
        rushing_yards_before_contact_avg AS rush_yds_before_contact_avg,
        rushing_yards_after_contact AS rush_yds_after_contact,
        rushing_yards_after_contact_avg AS rush_yds_after_contact_avg,
        rushing_broken_tackles AS rush_broken_tackles,
        ids.gsis_id
    FROM pfr_rush
    JOIN ids
        ON ids.pfr_id = pfr_rush.pfr_player_id
    WHERE season = 2022
        AND week <= 12),
ngs_pass_data AS (
    SELECT
        season,
        week,
        player_display_name AS player_name,
        attempts,
        pass_yards,
        pass_touchdowns AS pass_tds,
        interceptions,
        avg_time_to_throw,
        avg_completed_air_yards,
        avg_air_yards_differential,
        aggressiveness,
        max_completed_air_distance,
        avg_air_yards_to_sticks,
        passer_rating,
        completions,
        completion_percentage,
        expected_completion_percentage,
        completion_percentage_above_expectation,
        avg_air_distance,
        max_air_distance,
        player_gsis_id
    FROM ngs_pass
    WHERE season = 2022
        AND week BETWEEN 1 AND 12),
ngs_rec_data AS (
    SELECT
        season,
        week,
        player_display_name AS player_name,
        avg_cushion,
        avg_separation,
        avg_intended_air_yards,
        percent_share_of_intended_air_yards,
        receptions,
        targets,
        catch_percentage,
        yards,
        rec_touchdowns,
        avg_yac,
        avg_expected_yac,
        avg_yac_above_expectation,
        player_gsis_id
    FROM ngs_rec
    WHERE season = 2022
        AND week BETWEEN 1 AND 12),
ngs_rush_data AS (
    SELECT
        season,
        week,
        player_display_name AS player_name,
        efficiency,
        percent_attempts_gte_eight_defenders,
        avg_time_to_los,
        rush_attempts,
        rush_yards,
        expected_rush_yards,
        rush_yards_over_expected,
        avg_rush_yards,
        rush_yards_over_expected_per_att,
        rush_pct_over_expected,
        rush_touchdowns,
        player_gsis_id
    FROM ngs_rush
    WHERE season = 2022
        AND week BETWEEN 1 AND 12),
snap_counts_data AS (
    SELECT
        season,
        week,
        player AS player_name,
        --opponent,
        offense_snaps,
        offense_pct AS offense_snaps_pct,
        defense_snaps,
        defense_pct AS defense_snaps_pct,
        st_snaps,
        st_pct AS st_snaps_pct,
        ids.gsis_id
    FROM snap_counts
    JOIN ids
        ON ids.pfr_id = snap_counts.pfr_player_id
    WHERE season = 2022
        AND week <= 12),
joined_tables AS (
    SELECT
        player_id,
        weekly_data.player_name,
        weekly_data.position,
        recent_team,
        weekly_data.season,
        weekly_data.week,
        season_type,
        sacks,
        pass_air_yds,
        pass_yac,
        pass_epa,
        pass_2pt_conv,
        pacr,
        rush_epa, 
        rec_air_yds,
        rec_yac,
        rec_epa,
        racr,
        target_share,
        air_yards_share,
        wopr,
        fantasy_pts,
        fantasy_pts_ppr,
        passing_bad_throws,
        passing_bad_throw_pct,
        times_sacked,
        times_blitzed,
        times_hurried,
        times_hit,
        times_pressured,
        times_pressured_pct,
        receiving_broken_tackles,
        receiving_drop,
        receiving_drop_pct,
        receiving_int,
        receiving_rat,
        carries,
        rush_yds_before_contact,
        rush_yds_before_contact_avg,
        rush_yds_after_contact,
        rush_yds_after_contact_avg,
        rush_broken_tackles,
        attempts,
        pass_yards,
        pass_tds,
        interceptions,
        avg_time_to_throw,
        avg_completed_air_yards,
        avg_intended_air_yards,
        avg_air_yards_differential,
        aggressiveness,
        max_completed_air_distance,
        avg_air_yards_to_sticks,
        passer_rating,
        completions,
        completion_percentage,
        expected_completion_percentage,
        completion_percentage_above_expectation,
        avg_air_distance,
        max_air_distance,
        avg_cushion,
        avg_separation,
        avg_intended_air_yards,
        percent_share_of_intended_air_yards,
        receptions,
        targets,
        catch_percentage,
        yards,
        rec_touchdowns,
        avg_yac,
        avg_expected_yac,
        avg_yac_above_expectation,
        efficiency,
        percent_attempts_gte_eight_defenders,
        avg_time_to_los,
        rush_attempts,
        rush_yards,
        expected_rush_yards,
        rush_yards_over_expected,
        avg_rush_yards,
        rush_yards_over_expected_per_att,
        rush_pct_over_expected,
        rush_touchdowns,
        offense_snaps,
        offense_snaps_pct,
        defense_snaps,
        defense_snaps_pct,
        st_snaps,
        st_snaps_pct
    FROM weekly_data
    LEFT JOIN pfr_pass_data
        ON pfr_pass_data.gsis_id = weekly_data.player_id
            AND pfr_pass_data.season = weekly_data.season
            AND pfr_pass_data.week = weekly_data.week
    LEFT JOIN pfr_rec_data
        ON pfr_rec_data.gsis_id = weekly_data.player_id
            AND pfr_rec_data.season = weekly_data.season
            AND pfr_rec_data.week = weekly_data.week
    LEFT JOIN pfr_rush_data
        ON pfr_rush_data.gsis_id = weekly_data.player_id
            AND pfr_rush_data.season = weekly_data.season
            AND pfr_rush_data.week = weekly_data.week
    LEFT JOIN ngs_pass_data
        ON ngs_pass_data.player_gsis_id = weekly_data.player_id
            AND ngs_pass_data.season = weekly_data.season
            AND ngs_pass_data.week = weekly_data.week
    LEFT JOIN ngs_rec_data
        ON ngs_rec_data.player_gsis_id = weekly_data.player_id
            AND ngs_rec_data.season = weekly_data.season
            AND ngs_rec_data.week = weekly_data.week
    LEFT JOIN ngs_rush_data
        ON ngs_rush_data.player_gsis_id = weekly_data.player_id
            AND ngs_rush_data.season = weekly_data.season
            AND ngs_rush_data.week = weekly_data.week
    LEFT JOIN snap_counts_data
        ON snap_counts_data.gsis_id = weekly_data.player_id
            AND snap_counts_data.season = weekly_data.season
            AND snap_counts_data.week = weekly_data.week
)
SELECT *
FROM joined_tables
"""

df = pd.read_sql(query, conn)
df.head(10)

Unnamed: 0,player_id,player_name,position,recent_team,season,week,season_type,sacks,pass_air_yds,pass_yac,...,avg_rush_yards,rush_yards_over_expected_per_att,rush_pct_over_expected,rush_touchdowns,offense_snaps,offense_snaps_pct,defense_snaps,defense_snaps_pct,st_snaps,st_snaps_pct
0,00-0019596,Tom Brady,QB,TB,2022,1,REG,2.0,305.0,87.0,...,,,,,62.0,1.0,0.0,0.0,0.0,0.0
1,00-0019596,Tom Brady,QB,TB,2022,2,REG,1.0,318.0,81.0,...,,,,,67.0,1.0,0.0,0.0,1.0,0.04
2,00-0019596,Tom Brady,QB,TB,2022,3,REG,3.0,236.0,124.0,...,,,,,65.0,1.0,0.0,0.0,0.0,0.0
3,00-0019596,Tom Brady,QB,TB,2022,4,REG,1.0,326.0,205.0,...,,,,,64.0,1.0,0.0,0.0,0.0,0.0
4,00-0019596,Tom Brady,QB,TB,2022,5,REG,0.0,356.0,178.0,...,,,,,79.0,1.0,0.0,0.0,0.0,0.0
5,00-0019596,Tom Brady,QB,TB,2022,6,REG,2.0,302.0,87.0,...,,,,,72.0,1.0,0.0,0.0,0.0,0.0
6,00-0019596,Tom Brady,QB,TB,2022,7,REG,1.0,295.0,134.0,...,,,,,67.0,1.0,0.0,0.0,0.0,0.0
7,00-0019596,Tom Brady,QB,TB,2022,8,REG,3.0,288.0,177.0,...,,,,,68.0,1.0,0.0,0.0,0.0,0.0
8,00-0019596,Tom Brady,QB,TB,2022,9,REG,1.0,271.0,127.0,...,,,,,84.0,1.0,0.0,0.0,0.0,0.0
9,00-0019596,Tom Brady,QB,TB,2022,10,REG,0.0,248.0,102.0,...,,,,,75.0,1.0,0.0,0.0,0.0,0.0


In [8]:
for col in df.columns:
    print(col)

player_id
player_name
position
recent_team
season
week
season_type
sacks
pass_air_yds
pass_yac
pass_epa
pass_2pt_conv
pacr
rush_epa
rec_air_yds
rec_yac
rec_epa
racr
target_share
air_yards_share
wopr
fantasy_pts
fantasy_pts_ppr
passing_bad_throws
passing_bad_throw_pct
times_sacked
times_blitzed
times_hurried
times_hit
times_pressured
times_pressured_pct
receiving_broken_tackles
receiving_drop
receiving_drop_pct
receiving_int
receiving_rat
carries
rush_yds_before_contact
rush_yds_before_contact_avg
rush_yds_after_contact
rush_yds_after_contact_avg
rush_broken_tackles
attempts
pass_yards
pass_tds
interceptions
avg_time_to_throw
avg_completed_air_yards
avg_intended_air_yards
avg_air_yards_differential
aggressiveness
max_completed_air_distance
avg_air_yards_to_sticks
passer_rating
completions
completion_percentage
expected_completion_percentage
completion_percentage_above_expectation
avg_air_distance
max_air_distance
avg_cushion
avg_separation
avg_intended_air_yards:1
percent_share_of_inten

Duplicate columns have been removed.  

I think the query itself can be cleaned up a little by selecting all rows in my CTE queries. That way there aren't fields in multiple SELECT statements. I'll clean that up another time.