# Prepare

## imports

In [657]:
%load_ext autoreload
%autoreload 2

import warnings

import numpy as np
import pandas as pd
import os
import sys

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option('display.float_format', lambda x: '%.5f' % x)

np.random.seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [658]:
# Register the sibling ``src`` directory on the import path (once).
# NOTE(review): the next cell imports ``src.<...>``, which needs the directory that
# *contains* ``src`` to be importable — confirm how that is provided.
module_path = os.path.abspath('../src')
print("Adding modules", module_path)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

Adding modules /Users/christopherlomeli/Source/courses/datascience/data_science_capstone02/nfl_capstone/src


In [659]:
from src.features.wrangling.database_loader import DatabaseLoader
from src.features.wrangling.get_metrics import GetMetrics, get_versioned_name, conform_column_names

# setup

In [660]:
# Directory roots, relative to the notebook's working directory.
RAW_DATA_PATH = '../data/raw'
INTERIM_DATA_PATH = '../data/interim'

# inputs
INPUT_DATA = os.path.join(RAW_DATA_PATH, "nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")

# outputs
OUTPUT_DATA = os.path.join(INTERIM_DATA_PATH, "nflplaybyplay2009to2016_reviewed_01.parquet")
BACKUP_METRICS_FILE = os.path.join(INTERIM_DATA_PATH, "metrics_backup.parquet")
DIMENSIONS_DATA = os.path.join(INTERIM_DATA_PATH, "dimensions.parquet")

# database
METRICS_TABLE_NAME = "nfl_metrics"
CATEGORIES_TABLE_NAME = "nfl_categories"

# verify: echo every setting next to its own name. Building the label/value pairs in
# one mapping keeps each label tied to the right value (the hand-written prints had
# an empty label for DIMENSIONS_DATA and showed METRICS_TABLE_NAME under the wrong name).
CONFIG_SUMMARY = {
    "INPUT_DATA": INPUT_DATA,
    "OUTPUT_DATA": OUTPUT_DATA,
    "BACKUP_METRICS_FILE": BACKUP_METRICS_FILE,
    "DIMENSIONS_DATA": DIMENSIONS_DATA,
    "METRICS_TABLE_NAME": METRICS_TABLE_NAME,
    "CATEGORIES_TABLE_NAME": CATEGORIES_TABLE_NAME,
}
for _label, _value in CONFIG_SUMMARY.items():
    print(_label, _value)


INPUT_DATA ../data/raw/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv
OUTPUT_DATA ../data/interim/nflplaybyplay2009to2016_reviewed_01.parquet
BACKUP_METRICS_FILE ../data/interim/metrics_backup.parquet
 ../data/interim/dimensions.parquet
DIMENSIONS_DATA nfl_metrics
CATEGORIES_TABLE_NAME nfl_categories


In [661]:
# Create the project's database helper. The argument is the *name* of an environment
# variable, not a URL — presumably DatabaseLoader reads the connection string from
# it; confirm against DatabaseLoader.
db = DatabaseLoader(connection_string_env_url="DB_CONNECTION_URL")

In [662]:
# rdf = db.read_table("nfl_dim")
# rdf
# rdf.to_parquet(DIMENSIONS_DATA, engine='fastparquet',  compression='snappy')

In [663]:
# Scratch example: a small Series of first names with explicit string labels.
idx = ['Name 1', 'Name 2', 'Name 3', 'Name 4', 'Name 5']
sr = pd.Series(['Mike', 'Alessa', 'Nick', 'Kim', 'Britney'], index=idx)

# Show the labelled series
sr




Name 1       Mike
Name 2     Alessa
Name 3       Nick
Name 4        Kim
Name 5    Britney
dtype: object

In [664]:
# Capture, per name, the first run of "<capital letter>" + "i" + "<any one character>";
# names without such a run come back as NaN.
result = sr.str.extract('([A-Z]i.)', expand=True)

result

Unnamed: 0,0
Name 1,Mik
Name 2,
Name 3,Nic
Name 4,Kim
Name 5,


In [665]:
# Import needed package
# NOTE(review): scratch cell — this import sits outside the notebook's import cell,
# and `style_checker` is not used anywhere in the visible part of the notebook.
import pycodestyle

# Create a StyleGuide instance
style_checker = pycodestyle.StyleGuide()

In [666]:
import requests

# NOTE(review): the first assignment is overwritten by the line right after it, so
# only the RapidAPI endpoint is left in `url`; no request is actually sent in this cell.
url = 'https://static.nfl.com/liveupdate/game-center/2019122902/2019122902_gtd.json'
url='https://nfl-team-stats.p.rapidapi.com/v1/nfl-stats/teams/win-stats/2020'

# r = requests.get('https://www.pro-football-reference.com/boxscores/game-scores.htm')



# SECURITY: the reference snippet below used to contain a literal RapidAPI key. It is
# redacted here; treat the old key as compromised, rotate it, and supply the new one
# at runtime (e.g. from an environment variable) instead of pasting it into the notebook.
#
# Reference snippet (JavaScript / axios) for calling the endpoint above:
#
# const axios = require("axios");
#
# const options = {
#     method: 'GET',
#     url: 'https://nfl-team-stats.p.rapidapi.com/v1/nfl-stats/teams/win-stats/2020',
#     headers: {
#         'X-RapidAPI-Key': '<REDACTED - load from an environment variable>',
#         'X-RapidAPI-Host': 'nfl-team-stats.p.rapidapi.com'
#     }
# };
#
# axios.request(options).then(function (response) {
#     console.log(response.data);
# }).catch(function (error) {
#     console.error(error);
# });

### load Spreadspoke scores

In [667]:
# Game-level scores file; `schedule_date` is parsed into datetimes on load.
SCORES_DATA = "../data/raw/spreadspoke_scores.csv"
scores_df = pd.read_csv(SCORES_DATA, parse_dates=['schedule_date'])

# Preview the first rows
scores_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,1966-09-02,1966,1,False,Miami Dolphins,14,23,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71.0,
1,1966-09-03,1966,1,False,Houston Oilers,45,7,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70.0,
2,1966-09-04,1966,1,False,San Diego Chargers,27,7,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82.0,
3,1966-09-09,1966,2,False,Miami Dolphins,14,19,New York Jets,,,,Orange Bowl,False,82.0,11.0,78.0,
4,1966-09-10,1966,1,False,Green Bay Packers,24,3,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62.0,


In [668]:
# Tidy the raw scores table: shorten the column names, drop the betting/weather-detail
# columns this notebook does not use, and trim whitespace from the team names.
#
# The steps are ordered so the cell can be re-run safely: `rename` silently skips
# names that are already renamed, `drop(..., errors='ignore')` skips columns that are
# already gone, and the strip runs on the post-rename names, which exist either way.
# (The previous in-place version raised KeyError on a second run.)
scores_df = (
    scores_df
    .rename(columns={
        'schedule_date': 'date',
        'schedule_season': 'season',
        'schedule_week': 'week',
        'team_home': 'home_team',
        'team_away': 'away_team'
    })
    .drop(
        columns=['team_favorite_id', 'spread_favorite', 'over_under_line', 'weather_detail'],
        errors='ignore',
    )
)
for _team_column in ('away_team', 'home_team'):
    scores_df[_team_column] = scores_df[_team_column].str.strip()

scores_df

Unnamed: 0,date,season,week,schedule_playoff,home_team,score_home,score_away,away_team,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity
0,1966-09-02,1966,1,False,Miami Dolphins,14,23,Oakland Raiders,Orange Bowl,False,83.00000,6.00000,71.00000
1,1966-09-03,1966,1,False,Houston Oilers,45,7,Denver Broncos,Rice Stadium,False,81.00000,7.00000,70.00000
2,1966-09-04,1966,1,False,San Diego Chargers,27,7,Buffalo Bills,Balboa Stadium,False,70.00000,7.00000,82.00000
3,1966-09-09,1966,2,False,Miami Dolphins,14,19,New York Jets,Orange Bowl,False,82.00000,11.00000,78.00000
4,1966-09-10,1966,1,False,Green Bay Packers,24,3,Baltimore Colts,Lambeau Field,False,64.00000,8.00000,62.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13511,2023-01-22,2022,Division,True,Buffalo Bills,10,27,Cincinnati Bengals,Highmark Stadium,False,32.00000,4.00000,100.00000
13512,2023-01-22,2022,Division,True,San Francisco 49ers,19,12,Dallas Cowboys,Levi's Stadium,False,55.00000,19.00000,47.00000
13513,2023-01-29,2022,Conference,True,Kansas City Chiefs,23,20,Cincinnati Bengals,GEHA Field at Arrowhead Stadium,False,22.00000,13.00000,55.00000
13514,2023-01-29,2022,Conference,True,Philadelphia Eagles,31,7,San Francisco 49ers,Lincoln Financial Field,False,52.00000,14.00000,48.00000


## load teams list

In [669]:
RAW_DATA_PATH = '../data/raw'
TEAMS_DATA = os.path.join(RAW_DATA_PATH,"nfl_teams.csv")

# Columns of the teams file that are discarded; only the name and id are kept.
_unused_team_columns = [
    'team_name_short',
    'team_id_pfr',
    'team_conference',
    'team_conference_pre2002',
    'team_division',
    'team_division_pre2002',
]

team_df = pd.read_csv(TEAMS_DATA)
team_df = team_df.drop(columns=_unused_team_columns)

# Trim stray whitespace so the names/ids compare equal to other datasets.
for _text_column in ('team_name', 'team_id'):
    team_df[_text_column] = team_df[_text_column].str.strip()

team_df.head()

Unnamed: 0,team_name,team_id
0,Arizona Cardinals,ARI
1,Atlanta Falcons,ATL
2,Baltimore Colts,IND
3,Baltimore Ravens,BAL
4,Boston Patriots,NE


## merge team_ids into scores_df

In [670]:
def merge_team_id(scores_df, home_or_away_team, teams_df=None):
    """Return a copy of ``scores_df`` with a team-abbreviation column added.

    The full team name in column ``home_or_away_team`` is looked up in the teams
    table (columns ``team_name`` / ``team_id``) and the matching id is stored in a
    new column named ``<home_or_away_team>_id`` (e.g. ``home_team_id``).

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Scores table; must contain ``season`` and the ``home_or_away_team`` column.
    home_or_away_team : str
        Name of the column holding the full team name, e.g. ``'home_team'``.
    teams_df : pandas.DataFrame, optional
        Lookup table with ``team_name`` and ``team_id`` columns. Defaults to the
        notebook-level ``team_df``, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    AssertionError
        If any team name has no match in the teams table. The message lists the
        unmatched names.
    """
    if teams_df is None:
        teams_df = team_df  # notebook-level lookup table

    id_name = home_or_away_team.strip() + "_id"

    # If the id column is left over from an earlier run of the calling cell, drop it
    # first; otherwise the rename below would create a second column with that name.
    base_df = scores_df.drop(columns=[id_name], errors='ignore')

    merged = base_df.merge(
        teams_df,
        left_on=home_or_away_team,
        right_on='team_name',
        how='left',
        indicator=True,
    )

    # Rows flagged 'left_only' found no team. Report them by name rather than
    # relying on the sum of a string column comparing equal to 0.
    unmatched = merged.loc[merged['_merge'] == 'left_only', home_or_away_team]
    if not unmatched.empty:
        missing_names = sorted(unmatched.astype(str).unique())
        raise AssertionError(
            f"{len(unmatched)} row(s) in '{home_or_away_team}' have no match in the "
            f"teams table: {missing_names}"
        )

    # Seasons before 2016 use the 'JAC' abbreviation for Jacksonville instead of 'JAX'.
    uses_old_jacksonville_id = (merged['season'] < 2016) & (merged['team_id'] == 'JAX')
    merged.loc[uses_old_jacksonville_id, 'team_id'] = 'JAC'

    return (
        merged
        .drop(columns=['_merge', 'team_name'])
        .rename(columns={'team_id': id_name})
    )

In [671]:
# Attach the home side's abbreviation, then preview the name/id pairs.
print("scores now has a home_team_id (abbreviation")
scores_df = merge_team_id(scores_df, 'home_team')
scores_df.loc[:, ['home_team', 'home_team_id']].head()

scores now has a home_team_id (abbreviation


Unnamed: 0,home_team,home_team_id
0,Miami Dolphins,MIA
1,Houston Oilers,TEN
2,San Diego Chargers,SD
3,Miami Dolphins,MIA
4,Green Bay Packers,GB


In [672]:
# Attach the away side's abbreviation, then preview the name/id pairs.
print("scores now has a away_team_id (abbreviation")
scores_df = merge_team_id(scores_df, 'away_team')
scores_df.loc[:, ['away_team', 'away_team_id']].head()

scores now has a away_team_id (abbreviation


Unnamed: 0,away_team,away_team_id
0,Oakland Raiders,OAK
1,Denver Broncos,DEN
2,Buffalo Bills,BUF
3,New York Jets,NYJ
4,Baltimore Colts,IND


## Load gameplay data

In [673]:

# NOTE(review): these two paths repeat the values already set in the setup cell.
RAW_DATA_PATH = '../data/raw'
INTERIM_DATA_PATH='../data/interim'
# NOTE(review): USE_CONNECTION is not referenced in the visible part of the notebook.
USE_CONNECTION="DB_FILENAME_URL"   # DB_FILENAME_URL for csv or DB_CONNECTION_URL for postgres

# Parquet file holding the play-by-play ("gameplay") rows.
GAMEPLAY_FACTS_DF_NAME=os.path.join(INTERIM_DATA_PATH, "gameplay_facts_cleaned_01.parquet")

# Load the gameplay rows and preview the first few.
data_df = pd.read_parquet(GAMEPLAY_FACTS_DF_NAME)
data_df.head()

Unnamed: 0_level_0,date,game_id,drive,qtr,down,time,time_under,time_secs,play_time_diff,sideof_field,...,timestamp,def_two_point_key,ex_point_result_key,return_key,tackle_key,two_point_conv_key,penalty_key,playattempted,play_attempted,play_recorded_key
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2009-09-10,2009091000,1,1,0.0,15:00,15,3600.0,0.0,TEN,...,2009-09-10 15:00:00,0,0,1,1,0,0,1,,1
1,2009-09-10,2009091000,1,1,1.0,14:53,15,3593.0,7.0,PIT,...,2009-09-10 14:53:00,0,0,0,1,0,0,1,,1
2,2009-09-10,2009091000,1,1,2.0,14:16,15,3556.0,37.0,PIT,...,2009-09-10 14:16:00,0,0,0,1,0,0,1,,1
3,2009-09-10,2009091000,1,1,3.0,13:35,14,3515.0,41.0,PIT,...,2009-09-10 13:35:00,0,0,0,0,0,0,1,,1
4,2009-09-10,2009091000,1,1,4.0,13:27,14,3507.0,8.0,PIT,...,2009-09-10 13:27:00,0,0,0,0,0,0,1,,1


In [674]:
# Inspect the column dtypes of the gameplay frame (e.g. how `date` and `game_id` are stored).
data_df.dtypes

date                               object
game_id                             int64
drive                               int64
qtr                                 int64
down                              float64
time                               object
time_under                          int64
time_secs                         float64
play_time_diff                    float64
sideof_field                       object
yrdln                             float64
yrdline100                        float64
ydstogo                             int64
ydsnet                              int64
goal_to_go                        float64
first_down                        float64
posteam                            object
defensive_team                     object
desc                               object
play_attempted_key                  int64
yards_gained                        int64
sp_key                              int64
touchdown_key                       int64
safety_key                        

## check out whether scores and gameplay agree
Look at the Detroit Lions 2017 season - both datasets should have 16 games

#### Gameplay dataset 2017 Detroit Lions season

In [675]:
# One row per 2017 game involving Detroit (home or away), from the gameplay data.
_game_key_columns = ['season', 'date', 'game_id','home_team', 'away_team']
_involves_det = (data_df.home_team=='DET') | (data_df.away_team=='DET')
_det_2017_plays = data_df.loc[(data_df.season == 2017) & _involves_det, _game_key_columns]

# Grouping on every selected column collapses the plays to distinct games.
_det_2017_plays.groupby(_game_key_columns).count().sort_values(by='date')

season,date,game_id,home_team,away_team
2017,2017-09-10,2017091004,DET,ARI
2017,2017-09-18,2017091800,NYG,DET
2017,2017-09-24,2017092404,DET,ATL
2017,2017-10-01,2017100106,MIN,DET
2017,2017-10-08,2017100802,DET,CAR
2017,2017-10-15,2017101504,NO,DET
2017,2017-10-29,2017102910,DET,PIT
2017,2017-11-06,2017110600,GB,DET
2017,2017-11-12,2017111202,DET,CLE
2017,2017-11-19,2017111900,CHI,DET


#### Scores dataset 2017 Detroit Lions season

In [676]:
# The same 2017 Detroit schedule, this time from the scores data (full team names).
_involves_lions = (scores_df.home_team=='Detroit Lions') | (scores_df.away_team=='Detroit Lions')
_lions_2017_games = scores_df.loc[
    (scores_df.season==2017) & _involves_lions,
    ['season', 'date', 'home_team_id', 'away_team_id'],
]
_lions_2017_games.sort_values(by='date')

Unnamed: 0,season,date,home_team_id,away_team_id
11883,2017,2017-09-10,DET,ARI
11907,2017,2017-09-18,NYG,DET
11912,2017,2017-09-24,DET,ATL
11934,2017,2017-10-01,MIN,DET
11944,2017,2017-10-08,DET,CAR
11963,2017,2017-10-15,NO,DET
11987,2017,2017-10-29,DET,PIT
12008,2017,2017-11-06,GB,DET
12014,2017,2017-11-12,DET,CLE
12024,2017,2017-11-19,CHI,DET


## conform the game_ids in scores and gameplay datasets for joining on game_id

#### fix: the gameplay data has an incorrect date for the 2014 Buffalo vs. Jets game

In [677]:
# The gameplay rows for the 2014 Buffalo (home) vs. Jets game carry the wrong date;
# overwrite it for every play of that game.
_buf_nyj_2014 = (
    (data_df['season'] == 2014)
    & (data_df['home_team'] == 'BUF')
    & (data_df['away_team'] == 'NYJ')
)
data_df.loc[_buf_nyj_2014, 'date'] = '2014-11-24'

In [678]:
# The gameplay data still labels Jacksonville 'JAC' for their 2016-09-11 home game,
# so give the scores row the same id.
_jaguars_home_opener_2016 = (
    (scores_df.date == '2016-09-11')
    & (scores_df.home_team == 'Jacksonville Jaguars')
)
scores_df.loc[_jaguars_home_opener_2016, 'home_team_id'] = 'JAC'

In [679]:
# todo - the gameplay data has Jacksonville as 'JAC' up to 2016 - but it was actually changed to JAX in 2013

### add a conformed game_id to each dataset

In [680]:
# Conformed key: date with the dashes removed + lower-cased home id + lower-cased away id.
_scores_date_part = scores_df['date'].astype('string').str.replace("-","")
_scores_home_part = scores_df.home_team_id.str.lower()
_scores_away_part = scores_df.away_team_id.str.lower()
scores_df['game_id'] = _scores_date_part + _scores_home_part + _scores_away_part

In [681]:
# Conformed key: date with the dashes removed + lower-cased home team + lower-cased away team.
# This replaces the existing `game_id` column of the gameplay frame.
_plays_date_part = data_df['date'].astype('string').str.replace("-","")
_plays_home_part = data_df.home_team.str.lower()
_plays_away_part = data_df.away_team.str.lower()
data_df['game_id'] = _plays_date_part + _plays_home_part + _plays_away_part

In [703]:
# Trial left join of plays onto scores via the conformed key; the `_merge` indicator
# column records, per row, whether a matching score row was found.
test_df = data_df.merge(
    scores_df,
    on='game_id',
    how='left',
    indicator=True,
)


In [704]:
# Plays with no matching score row, reduced to distinct "<date><home>-<away>" labels.
_unmatched_columns = ['date_x', 'game_id','home_team_x', 'away_team_x']
m = test_df.loc[test_df['_merge'] == 'left_only', _unmatched_columns]
_unmatched_labels = m['date_x'].astype('string')+m['home_team_x']+'-'+m['away_team_x']
np.unique(_unmatched_labels)

array([], dtype=object)

In [683]:
# validate the merge
# 1) Every play must have found its game in the scores data.
assert test_df.loc[test_df['_merge'] == 'left_only'].empty, \
    "some gameplay rows have no matching game_id in scores_df"
# 2) The test merge is a left join, so 'right_only' rows cannot occur and checking for
#    them proves nothing. What can go wrong on the scores side is a duplicated game_id,
#    which would repeat plays - so check that the row count is unchanged instead.
assert len(test_df) == len(data_df), \
    "merge changed the number of gameplay rows; scores_df has duplicate game_id values"