In [1]:
import numpy as np
import pandas as pd

from NFLVersReader.src.database_loader import DatabaseLoader
from NFLVersReader.src.nflverse_clean.clean_player_stats import impute_player_stats, impute_payers, check_merge
from NFLVersReader.src.nflverse_clean.clean_player_stats import get_duplicates_by_key
from NFLVersReader.src.nflverse_clean.utils import assert_not_null, assert_and_alert
from configs import get_config


## Player Stats
These stats are critical to determining the state of a team

In [2]:
# Build the database loader from the "connection_string" entry of the project config,
# then read the player_stats table into stats_df.
dbloader = DatabaseLoader(get_config("connection_string"))
stats_df = dbloader.read_table("player_stats")

In [3]:
def check_keys(df):
    """Run ``assert_not_null`` on every key column of a player-stats frame.

    Columns are checked one at a time, in this fixed order: season, week,
    player_id, position, position_group, season_type.
    """
    key_columns = (
        'season',
        'week',
        'player_id',
        'position',
        'position_group',
        'season_type',
    )
    for column in key_columns:
        assert_not_null(df, column)

In [4]:
# Run the project's player-stats imputation, then run check_keys on the result
# (season, week, player_id, position, position_group, season_type).
# NOTE(review): this rebinds stats_df, so re-running the cell feeds already-imputed
# data back into impute_player_stats — presumably harmless; TODO confirm it is idempotent.
stats_df = impute_player_stats(stats_df)
check_keys(stats_df)

In [5]:
# Count null cells across every column of stats_df; after imputation none are expected.
total_nulls = stats_df.isna().sum().sum()
assert_and_alert(
    assertion=(total_nulls == 0),
    msg="Found unexpected Nulls in player_stats ")

True

## Players
These are not terribly important but let's check anyway

In [6]:
# Read the players table with the same loader and preview its first rows.
player_df = dbloader.read_table("players")
player_df.head()

Unnamed: 0,index,status,display_name,first_name,last_name,esb_id,gsis_id,suffix,birth_date,college_name,...,status_description_abbr,status_short_description,gsis_it_id,short_name,smart_id,headshot,draft_number,uniform_number,draft_round,season
0,0,RET,'Omar Ellison,'Omar,Ellison,ELL711319,00-0004866,,,,...,,,,,3200454c-4c71-1319-728e-d49d3d236f8f,,,,,
1,1,RES,A'Shawn Robinson,A'Shawn,Robinson,ROB367960,00-0032889,,1995-03-21,Alabama,...,R01,R/Injured,43335.0,A.Robinson,3200524f-4236-7960-bf20-bc060ac0f49c,https://static.www.nfl.com/image/private/f_aut...,46.0,94.0,,
2,2,ACT,A.J. Arcuri,A.J.,Arcuri,ARC716900,00-0037845,,,,...,A01,Active,54726.0,A.Arcuri,32004152-4371-6900-5185-8cdd66b2ad11,,261.0,61.0,,
3,3,RES,A.J. Bouye,Arlandus,Bouye,BOU651714,00-0030228,,1991-08-16,Central Florida,...,R01,R/Injured,40688.0,A.Bouye,3200424f-5565-1714-cb38-07c822111a12,https://static.www.nfl.com/image/private/f_aut...,,24.0,,
4,4,ACT,A.J. Brown,Arthur,Brown,BRO413223,00-0035676,,1997-06-30,Mississippi,...,A01,Active,47834.0,A.Brown,32004252-4f41-3223-e4c5-1e30dffa87f8,https://static.www.nfl.com/image/private/f_aut...,51.0,11.0,,


In [7]:
# Apply the project's imputation helper for the players table (the helper is
# imported under the name `impute_payers`). This rebinds player_df to the result.
player_df = impute_payers(player_df)

In [8]:
# gsis_id is the join key used against player_stats below, so it must have no nulls.
gsis_id_null_count = player_df['gsis_id'].isna().sum()
assert_and_alert(gsis_id_null_count == 0, msg="player_df.gsis_id has unexpected nulls")

True

## Player Stats + Players

<font color="teal">First, check that the player_stats index key is unique by looking for duplicates.</font>

In [27]:
# The 'index' column of player_stats is expected to be unique; any rows returned by
# get_duplicates_by_key mean the key is duplicated and the merge below would be unreliable.
duplicate_stats_rows = get_duplicates_by_key(stats_df, 'index')
assert_and_alert(len(duplicate_stats_rows) == 0, msg="found unexpected duplicate player_stats index's")

True

<font color="teal">Perform the merge</font>

In [28]:
# Outer join so unmatched rows from either side are kept; indicator=True adds the
# `_merge` column recording whether each row came from the left, right, or both frames.
merged_df = stats_df.merge(
    player_df,
    how='outer',
    left_on='player_id',
    right_on='gsis_id',
    indicator=True,
)
check_merge(merged_df, stats_df)

The stats dataset has 124081 records
The merged dataset has 139458 records
percent of stats_without_players 0.0
percent of players_without_stats - this is common 0.11026258801933199
percent of matched players and stats 0.889737411980668
percent of stats that were consumed in the join 1.0


<font color="teal">Check for duplicate stats index keys</font>
It's OK to have duplicate player keys because one player will have many stats.

In [30]:
# 'index_x' is the player_stats 'index' column after the merge (both frames carry an
# 'index' column, so pandas suffixes the left one with _x). It must still be unique.
duplicated_after_merge = get_duplicates_by_key(merged_df, 'index_x')
assert_and_alert(len(duplicated_after_merge) == 0, msg="merging player_stats and players resulted in duplicate player_stats")

True

## Boneyard

In [31]:
# Scratch experiment: given one player's carries, estimate how likely the player is
# to rush for more than a given number of yards.

# facts: yards gained on each carry (toy sample)
yards_per_carry = np.array([5, 2, 10, 20, 1, 2])
n = len(yards_per_carry)
percentages = np.array([.10, .5, .75, .95])

# Yardage thresholds are the sample quantiles at the levels above (10th, 50th, 75th
# and 95th percentile — not strictly quartiles, but the name is kept for continuity).
quartiles = np.quantile(yards_per_carry, percentages)

# For each threshold, the share of carries that gained strictly more than it.
# Broadcasting compares every carry against every threshold in one step.
share_above = (yards_per_carry[np.newaxis, :] > quartiles[:, np.newaxis]).sum(axis=1) / n

# Column 0 = threshold in yards, column 1 = empirical probability of exceeding it.
potentials = np.column_stack((quartiles, share_above))

print(potentials)

pf = pd.DataFrame({
    'minimum_yards': potentials[:, 0],
    'probability': potentials[:, 1],
})
pf

[[ 1.5         0.83333333]
 [ 3.5         0.5       ]
 [ 8.75        0.33333333]
 [17.5         0.16666667]]


Unnamed: 0,minimum_yards,probability
0,1.5,0.833333
1,3.5,0.5
2,8.75,0.333333
3,17.5,0.166667


In [32]:
# One-row summary of the sample carries. The std is numpy's default (population,
# ddof=0), and the 'quartiles' cell holds the 10th/75th/95th percentile values
# together as a single array.
summary_stats = {
    'min': yards_per_carry.min(),
    'mean': yards_per_carry.mean(),
    'median': np.median(yards_per_carry),
    'max': yards_per_carry.max(),
    'std': yards_per_carry.std(),
    'quartiles': np.quantile(yards_per_carry, [.10, .75, .95]),
}
ps = pd.DataFrame([summary_stats])
ps


Unnamed: 0,min,mean,median,max,std,quartiles
0,1,6.666667,3.5,20,6.674995,"[1.5, 8.75, 17.5]"
