# <font color=teal>imports</font>
Most processing is performed in Python code, and there's a Python module to do everything here without manual steps.

In [1]:
import logging

# NOTE(review): this import runs before the sys.path tweak below, so it cannot
# depend on it — confirm how `src` is resolved (working directory or an
# installed package).
from src import configs
import os
import sys
# Append the ../src directory (resolved against the current working directory)
# to the module search path.
# NOTE(review): that makes modules *inside* src importable by bare name; the
# `from src.<module> import ...` form used in this notebook needs the parent
# of src on the path instead — confirm this line is still required.
sys.path.append(os.path.abspath("../src"))



In [2]:
from src.nflverse_transform_job import load_files
from src.pbp_fact import transform_pbp
from src.nflverse_reader_job import URLReader
from src.pbp_participation import transform_pbp_participation
from src.player_stats import transform_player_stats, merge_injuries
from src.player_injuries import prep_player_injuries
from src.player_stats import transform_players
from src.db_utils import load_dims_to_db




# <font color=teal>housekeeping</font>

In [3]:
# Master switch: the final cell only loads the datasets into the database
# when this is True.
LOAD_TO_DB = True
# Schema name passed to load_dims_to_db under the 'schema' key.
database_schema = 'controls'

# Get the logger
logger = configs.configure_logging("pbp_logger")
# Set this logger's threshold to INFO.
logger.setLevel(logging.INFO)

# <font color=teal>Read data from NFLVerse</font>
Step 1.  Read data and store immediately as raw without transformation or change

In [4]:
%%time

# Download the NFLVerse source files for seasons 2016 through 2022 (the log
# output lists one "Success: <url>" line per downloaded file).
# NOTE(review): `urls` is not referenced again in the visible cells; presumably
# download() stores the files itself so load_files() can read them — confirm.
reader = URLReader(start_year=2016, last_year=2022, file_type='csv')
urls = reader.download()

2023-07-05 17:02:55,730 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_2016_rushing.csv.gz
2023-07-05 17:02:55,814 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_2016_passing.csv.gz
2023-07-05 17:02:55,819 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_2016_receiving.csv.gz
2023-07-05 17:02:55,872 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/injuries/injuries_2016.csv
2023-07-05 17:02:56,386 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/players/players.csv
2023-07-05 17:02:56,626 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_2017_passing.csv.gz
2023-07-05 17:02:56,968 - INFO - Success: https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_2017_receiving.csv.gz
2023-07-05 17:02:57,034 - INFO - Success: htt

CPU times: user 3.19 s, sys: 2.23 s, total: 5.42 s
Wall time: 22.3 s


---

# <font color=teal>load and transform play by play datasets</font>

### <font color="#9370DB">load</font>

In [5]:

%%time
# Read every file in the 'pbp' data subdirectory (the log shows one
# pbp_<year>.csv per season, 2016-2022).
# NOTE(review): presumably the per-season files come back combined into a
# single DataFrame — confirm in src.nflverse_transform_job.load_files.
pbp = load_files(data_subdir='pbp')


2023-07-05 17:03:16,938 - INFO - Reading all files from pbp
2023-07-05 17:03:16,939 - INFO -   + Reading pbp_2022.csv
2023-07-05 17:03:17,982 - INFO -   + Reading pbp_2020.csv
2023-07-05 17:03:18,997 - INFO -   + Reading pbp_2021.csv
2023-07-05 17:03:19,998 - INFO -   + Reading pbp_2019.csv
2023-07-05 17:03:21,000 - INFO -   + Reading pbp_2018.csv
2023-07-05 17:03:21,988 - INFO -   + Reading pbp_2016.csv
2023-07-05 17:03:23,003 - INFO -   + Reading pbp_2017.csv


CPU times: user 6.53 s, sys: 1.4 s, total: 7.93 s
Wall time: 8.36 s


### <font color="#9370DB">transform</font>

In [6]:

%%time
# Break the raw play-by-play data into its dimension datasets (actions, drive,
# situations, metrics, ... per the log output). The result is used as a dict
# in later cells: it is indexed by 'player_events' and extended with more
# entries before being loaded to the database.
datasets = transform_pbp(pbp)

2023-07-05 17:03:27,187 - INFO - Impute columns to 0
2023-07-05 17:03:27,376 - INFO - impute non binary pbp columns ...
2023-07-05 17:03:27,943 - INFO - Impute columns to 0
2023-07-05 17:03:28,593 - INFO - Impute columns to 0:00
2023-07-05 17:03:29,783 - INFO - Impute columns to NA
2023-07-05 17:03:34,856 - INFO - moving play_id to play_counter, and creating a joinable play_id key
2023-07-05 17:03:35,383 - INFO - Conform key actions like pass, rush, kickoff, etc. and add a single category field called actions... 
2023-07-05 17:03:40,197 - INFO - Validate actions dimension ...
2023-07-05 17:03:40,370 - INFO - Creating new drive dimension...
2023-07-05 17:03:40,422 - INFO - Validate drive_df dimension ...
2023-07-05 17:03:40,651 - INFO - Creating new situations dimension...
2023-07-05 17:03:40,703 - INFO - Validate situation_df dimension ...
2023-07-05 17:03:40,899 - INFO - Creating new metrics dimension...
2023-07-05 17:03:40,942 - INFO - Validate play_metrics_df dimension ...
2023-07-0

CPU times: user 15 s, sys: 4.78 s, total: 19.8 s
Wall time: 20.5 s


---

# <font color=teal>load and transform play by play participation datasets</font>

### <font color="#9370DB">load</font>

In [7]:
%%time
# Read every file in the 'pbp-participation' data subdirectory (one CSV per
# season, per the log output).
pbp_participation_df = load_files('pbp-participation')


2023-07-05 17:03:45,790 - INFO - Reading all files from pbp-participation
2023-07-05 17:03:45,791 - INFO -   + Reading pbp-participation_2018.csv
2023-07-05 17:03:45,945 - INFO -   + Reading pbp-participation_2019.csv
2023-07-05 17:03:46,092 - INFO -   + Reading pbp-participation_2021.csv
2023-07-05 17:03:46,250 - INFO -   + Reading pbp-participation_2020.csv
2023-07-05 17:03:46,403 - INFO -   + Reading pbp-participation_2022.csv
2023-07-05 17:03:46,551 - INFO -   + Reading pbp-participation_2017.csv
2023-07-05 17:03:46,700 - INFO -   + Reading pbp-participation_2016.csv


CPU times: user 1.01 s, sys: 117 ms, total: 1.13 s
Wall time: 1.13 s


### <font color="#9370DB">transform</font>

In [8]:

# Combine the participation data with the current 'player_events' dataset.
# The transform hands back two results: a player participation dataset and a
# replacement for 'player_events'.
player_df, player_events_df = transform_pbp_participation(
    player_events=datasets['player_events'],
    participation_df=pbp_participation_df,
)

# Store both results on the shared collection: 'player_participation' is a new
# entry, 'player_events' overwrites the existing one.
# NOTE(review): because 'player_events' is overwritten, re-running this cell
# feeds the already-transformed events back in — re-run the pbp transform first.
datasets['player_participation'] = player_df
datasets['player_events'] = player_events_df

2023-07-05 17:03:46,923 - INFO - pbp_participation:  move play_id to a play_count column and create a unique play_id that can be used in joins...
2023-07-05 17:03:47,140 - INFO - Calculating defense and offense team names by player and play...
2023-07-05 17:03:49,462 - INFO - Exploding offensive players to their own dataset...
2023-07-05 17:03:50,911 - INFO - Exploding defense_players to their own dataset...


---

# <font color=teal>transform player injuries</font>

### <font color="#9370DB">load</font>

In [9]:
%%time
# Read every file in the 'injuries' data subdirectory (one CSV per season,
# per the log output).
injuries_df = load_files('injuries')

2023-07-05 17:04:02,675 - INFO - Reading all files from injuries
2023-07-05 17:04:02,676 - INFO -   + Reading injuries_2016.csv
2023-07-05 17:04:02,684 - INFO -   + Reading injuries_2017.csv
2023-07-05 17:04:02,693 - INFO -   + Reading injuries_2020.csv
2023-07-05 17:04:02,703 - INFO -   + Reading injuries_2021.csv
2023-07-05 17:04:02,712 - INFO -   + Reading injuries_2022.csv
2023-07-05 17:04:02,721 - INFO -   + Reading injuries_2019.csv
2023-07-05 17:04:02,730 - INFO -   + Reading injuries_2018.csv


CPU times: user 63.5 ms, sys: 6.48 ms, total: 69.9 ms
Wall time: 71.2 ms


### <font color="#9370DB">transform</font>

In [10]:
%%time
# Prepare the raw injury data for merging with player stats.
# NOTE(review): this rebinds injuries_df to the prepared result, so re-running
# only this cell would pass already-prepared data back into
# prep_player_injuries — re-run the load cell first.
injuries_df = prep_player_injuries(injuries_df)

2023-07-05 17:04:02,749 - INFO - Prep injury data...
2023-07-05 17:04:02,750 - INFO - Conforming names (e.g. gsis_id -> player_id)
2023-07-05 17:04:02,758 - INFO - Merge sparse injury columns
2023-07-05 17:04:02,760 - INFO - Get best values for null report_statuses...
2023-07-05 17:04:02,827 - INFO - check that all positions are correct...


CPU times: user 78.7 ms, sys: 4.03 ms, total: 82.8 ms
Wall time: 83.6 ms


---

# <font color=teal>transform player stats</font>

In [11]:
%%time
# Three steps on the same name: load the raw player stats, transform them,
# then merge in the prepared injury data (injuries_df from the injuries
# section, which must have been run first).
stats_df = load_files('player-stats')
stats_df = transform_player_stats(stats_df)
stats_df = merge_injuries(player_stats=stats_df, player_injuries=injuries_df)

2023-07-05 17:04:02,836 - INFO - Reading all files from player-stats
2023-07-05 17:04:02,837 - INFO -   + Reading player-stats.csv
2023-07-05 17:04:03,118 - INFO - fix specific player_stats: <function player_stats_fixes at 0x13086c700>..
2023-07-05 17:04:03,189 - INFO - replace empty position_groups with position info...
2023-07-05 17:04:03,200 - INFO - replace empty player_name with player_display_name info...
2023-07-05 17:04:03,209 - INFO - replace empty headshot_url with 'none'...
2023-07-05 17:04:03,217 - INFO - fillna(0) for all binary columns...
2023-07-05 17:04:03,218 - INFO - Impute columns to 0


CPU times: user 472 ms, sys: 99.4 ms, total: 571 ms
Wall time: 569 ms


---

# <font color=teal>direct loads </font>

### <font color="#9370DB">adv stats</font>

In [12]:
%%time

# Season-level advanced stats, one file per category (def, pass, rec, rush).
# These are loaded as-is; no transform step is applied in this notebook.
advstats_def_df = load_files('advstats-season-def')
advstats_pass_df = load_files('advstats-season-pass')
advstats_rec_df = load_files('advstats-season-rec')
advstats_rush_df = load_files('advstats-season-rush')


2023-07-05 17:04:03,408 - INFO - Reading all files from advstats-season-def
2023-07-05 17:04:03,409 - INFO -   + Reading advstats-season-def.csv
2023-07-05 17:04:03,418 - INFO - Reading all files from advstats-season-pass
2023-07-05 17:04:03,418 - INFO -   + Reading advstats-season-pass.csv
2023-07-05 17:04:03,421 - INFO - Reading all files from advstats-season-rec
2023-07-05 17:04:03,422 - INFO -   + Reading advstats-season-rec.csv
2023-07-05 17:04:03,427 - INFO - Reading all files from advstats-season-rush
2023-07-05 17:04:03,427 - INFO -   + Reading advstats-season-rush.csv


CPU times: user 18.5 ms, sys: 4.59 ms, total: 23.1 ms
Wall time: 22.9 ms


### <font color="#9370DB">nextgen stats</font>

In [13]:
%%time
# NextGen passing stats: one .csv.gz per season (per the log output), loaded
# without any further transform in this notebook.
next_pass_df = load_files('nextgen-passing')


2023-07-05 17:04:03,434 - INFO - Reading all files from nextgen-passing
2023-07-05 17:04:03,435 - INFO -   + Reading nextgen-passing_2017.csv.gz
2023-07-05 17:04:03,439 - INFO -   + Reading nextgen-passing_2021.csv.gz
2023-07-05 17:04:03,443 - INFO -   + Reading nextgen-passing_2019.csv.gz
2023-07-05 17:04:03,448 - INFO -   + Reading nextgen-passing_2020.csv.gz
2023-07-05 17:04:03,452 - INFO -   + Reading nextgen-passing_2016.csv.gz
2023-07-05 17:04:03,457 - INFO -   + Reading nextgen-passing_2022.csv.gz
2023-07-05 17:04:03,463 - INFO -   + Reading nextgen-passing_2018.csv.gz


CPU times: user 28.9 ms, sys: 4.96 ms, total: 33.8 ms
Wall time: 35.4 ms


In [14]:
%%time
# NextGen receiving stats: one .csv.gz per season (per the log output), loaded
# without any further transform in this notebook.
next_rec_df = load_files('nextgen-receiving')


2023-07-05 17:04:03,472 - INFO - Reading all files from nextgen-receiving
2023-07-05 17:04:03,473 - INFO -   + Reading nextgen-receiving_2021.csv.gz
2023-07-05 17:04:03,479 - INFO -   + Reading nextgen-receiving_2017.csv.gz
2023-07-05 17:04:03,485 - INFO -   + Reading nextgen-receiving_2019.csv.gz
2023-07-05 17:04:03,490 - INFO -   + Reading nextgen-receiving_2016.csv.gz
2023-07-05 17:04:03,496 - INFO -   + Reading nextgen-receiving_2020.csv.gz
2023-07-05 17:04:03,503 - INFO -   + Reading nextgen-receiving_2018.csv.gz
2023-07-05 17:04:03,509 - INFO -   + Reading nextgen-receiving_2022.csv.gz


CPU times: user 38.2 ms, sys: 5.88 ms, total: 44.1 ms
Wall time: 44.8 ms


In [15]:
%%time
# NextGen rushing stats: one .csv.gz per season (per the log output), loaded
# without any further transform in this notebook.
next_rush_df = load_files('nextgen-rushing')

2023-07-05 17:04:03,520 - INFO - Reading all files from nextgen-rushing
2023-07-05 17:04:03,520 - INFO -   + Reading nextgen-rushing_2018.csv.gz
2023-07-05 17:04:03,524 - INFO -   + Reading nextgen-rushing_2022.csv.gz
2023-07-05 17:04:03,529 - INFO -   + Reading nextgen-rushing_2016.csv.gz
2023-07-05 17:04:03,532 - INFO -   + Reading nextgen-rushing_2020.csv.gz
2023-07-05 17:04:03,535 - INFO -   + Reading nextgen-rushing_2019.csv.gz
2023-07-05 17:04:03,538 - INFO -   + Reading nextgen-rushing_2021.csv.gz
2023-07-05 17:04:03,541 - INFO -   + Reading nextgen-rushing_2017.csv.gz


CPU times: user 21.1 ms, sys: 3.78 ms, total: 24.9 ms
Wall time: 25.3 ms


### <font color="#9370DB">players</font>

In [16]:
%%time
# Load the single players file, then prepare it. Per the log output,
# transform_players drops players without a gsis_id, fills empty status values
# with 'NONE' and renames gsis_id to player_id.
players_df = load_files('players')
players_df = transform_players(players_df)

2023-07-05 17:04:03,547 - INFO - Reading all files from players
2023-07-05 17:04:03,548 - INFO -   + Reading players.csv
2023-07-05 17:04:03,620 - INFO - Process players dataset...
2023-07-05 17:04:03,621 - INFO - drop players without gsis_ids - they won't link to player_stats
2023-07-05 17:04:03,635 - INFO - fill empty players status to 'NONE'
2023-07-05 17:04:03,640 - INFO - rename gsis_id to player_id...


CPU times: user 81.5 ms, sys: 11.6 ms, total: 93.1 ms
Wall time: 94.3 ms


---

# <font color=teal>store to database so we can perform some SQL operations</font>

In [17]:
def load_all_datasets_to_db(data: dict) -> None:
    """Load every dataset in ``data`` into the database via ``load_dims_to_db``.

    The payload handed to ``load_dims_to_db`` is a shallow copy of ``data``
    with one extra entry, ``'schema'``, set to the notebook-level
    ``database_schema``. Working on a copy keeps that non-dataset entry out of
    the caller's dict, so the caller can keep iterating over its datasets or
    call this function again without leftovers from a previous call.

    Args:
        data: dict of datasets keyed by name. Presumably each key becomes a
            table name (the log output reads "create table <key> in schema
            ...") — confirm in src.db_utils.load_dims_to_db. An existing
            'schema' entry is overridden in the payload, never in ``data``.
    """
    # Shallow copy: the dataset objects themselves are shared, only the
    # top-level mapping is new.
    payload = {**data, 'schema': database_schema}
    load_dims_to_db(payload)


In [19]:
%%time
# Only touch the database when the LOAD_TO_DB switch is on: register the
# remaining datasets under their keys (the log output shows each key being
# created as a table), then load the whole collection.
if LOAD_TO_DB:
    datasets.update([
        ('players', players_df),
        ('player_stats', stats_df),
        ('adv_stats_def', advstats_def_df),
        ('adv_stats_pass', advstats_pass_df),
        ('adv_stats_rec', advstats_rec_df),
        ('adv_stats_rush', advstats_rush_df),
        ('nextgen_pass', next_pass_df),
        ('nextgen_rec', next_rec_df),
        ('nextgen_rush', next_rush_df),
    ])
    load_all_datasets_to_db(datasets)

2023-07-05 17:13:52,536 - INFO - create table play_actions in schema controls
2023-07-05 17:14:23,070 - INFO - create table game_drive in schema controls
2023-07-05 17:14:42,901 - INFO - create table play_analytics in schema controls
2023-07-05 17:15:53,202 - INFO - create table play_situations in schema controls
2023-07-05 17:16:23,157 - INFO - create table play_metrics in schema controls
2023-07-05 17:16:45,654 - INFO - create table player_events in schema controls
2023-07-05 17:16:56,528 - INFO - create table game_info in schema controls
2023-07-05 17:16:56,713 - INFO - create table player_participation in schema controls
2023-07-05 17:21:26,211 - INFO - create table players in schema controls
2023-07-05 17:21:27,803 - INFO - create table player_stats in schema controls
2023-07-05 17:21:44,563 - INFO - create table adv_stats_def in schema controls
2023-07-05 17:21:44,945 - INFO - create table adv_stats_pass in schema controls
2023-07-05 17:21:45,003 - INFO - create table adv_stats_r

CPU times: user 3min 51s, sys: 54 s, total: 4min 45s
Wall time: 7min 54s
