In [1]:
import pandas as pd
from NFLVersReader.src.nflverse_clean.utils import assert_not_null, assert_and_alert


In [2]:
# Load the 2021 injury reports; `date_modified` is parsed as a datetime column.
injuries_df = pd.read_csv(
    "../../output/injuries_2021.csv",
    parse_dates=['date_modified'],
    low_memory=False,
)


In [3]:
# Inspect the column dtypes after loading (e.g. to confirm `date_modified` was parsed as a datetime).
injuries_df.dtypes

season                                     int64
game_type                                 object
team                                      object
week                                       int64
gsis_id                                   object
position                                  object
full_name                                 object
first_name                                object
last_name                                 object
report_primary_injury                     object
report_secondary_injury                   object
report_status                             object
practice_primary_injury                   object
practice_secondary_injury                 object
practice_status                           object
date_modified                datetime64[ns, UTC]
dtype: object

In [4]:
# Preview the first rows of the loaded injury report data.
injuries_df.head()

Unnamed: 0,season,game_type,team,week,gsis_id,position,full_name,first_name,last_name,report_primary_injury,report_secondary_injury,report_status,practice_primary_injury,practice_secondary_injury,practice_status,date_modified
0,2021,REG,ARI,1,00-0033258,TE,Darrell Daniels,Darrell,Daniels,,,,Toe,,Full Participation in Practice,2021-09-10 19:35:39+00:00
1,2021,REG,ARI,1,00-0034473,LB,Dennis Gardeck,Dennis,Gardeck,Knee,Hand,Out,Knee,Hand,Did Not Participate In Practice,2021-09-10 19:36:51+00:00
2,2021,REG,ARI,1,00-0035126,WR,Antoine Wesley,Antoine,Wesley,Illness,,Out,Illness,,Did Not Participate In Practice,2021-09-10 19:37:05+00:00
3,2021,REG,ATL,1,00-0031583,DT,Grady Jarrett,Grady,Jarrett,,,,Not injury related - personal matter,,Did Not Participate In Practice,2021-09-10 09:29:27+00:00
4,2021,REG,ATL,1,00-0030010,LB,Brandon Copeland,Brandon,Copeland,,,,Hamstring,,Full Participation in Practice,2021-09-10 18:05:56+00:00


In [5]:
def check_keys(df):
    """Run the project null check on every key column of an injuries frame.

    Each column is passed, in order, to ``assert_not_null`` (imported from the
    project utils; presumably it flags null values -- confirm there).

    The frame must already expose ``player_id``, so call this only after
    ``gsis_id`` has been renamed.

    Args:
        df: DataFrame holding the columns listed in ``key_columns``.
    """
    key_columns = ('season', 'week', 'player_id', 'team', 'position', 'report_status')
    for key_column in key_columns:
        assert_not_null(df, key_column)


In [6]:
# Conform names: expose `gsis_id` as `player_id`, the key name used by check_keys.
# Rebinding the result (rather than inplace=True) avoids mutating the loaded
# frame in place across cells; re-running this cell remains harmless.
injuries_df = injuries_df.rename(columns={'gsis_id': 'player_id'})


In [7]:
# Prefer the injury listed on the game report; where that is missing,
# fall back to the injury listed on the practice report.
injuries_df['primary_injury'] = injuries_df['report_primary_injury'].where(
    injuries_df['report_primary_injury'].notna(),
    injuries_df['practice_primary_injury'],
)

In [8]:
# Distribution of report_status before back-filling, with nulls counted.
injuries_df['report_status'].value_counts(dropna=False)


NaN             3020
Questionable    1513
Out              888
Doubtful         166
Name: report_status, dtype: int64

In [9]:
# Back-fill report_status for rows that have no official game designation.
# Rules are applied in priority order, and each rule only touches rows that are
# still null, so an earlier rule always wins over a later one.
status_rules = (
    # (column searched, lower-case substring, status assigned)
    ('primary_injury', 'resting', 'Resting'),
    ('primary_injury', 'personal', 'Personal'),
    ('practice_status', 'full participation', 'Optimistic'),
    ('practice_status', 'did not participate', 'Doubtful'),
    ('practice_status', 'limited participation', 'Questionable'),
)
for source_column, keyword, inferred_status in status_rules:
    # na=False keeps the mask strictly boolean when the searched value is null
    # (otherwise the mask carries NaN and relies on implicit coercion by `&`);
    # regex=False matches the keyword as literal text.
    keyword_hit = injuries_df[source_column].str.lower().str.contains(keyword, na=False, regex=False)
    injuries_df.loc[injuries_df['report_status'].isna() & keyword_hit, 'report_status'] = inferred_status

# Anything still unresolved gets an explicit placeholder so the column has no nulls.
injuries_df.loc[injuries_df['report_status'].isna(), 'report_status'] = 'Uncertain'

In [10]:
# Sanity check: injuries of rows whose report_status is still null (expected to be empty).
injuries_df.loc[injuries_df['report_status'].isna(), 'primary_injury']

Series([], Name: primary_injury, dtype: object)

In [11]:
# Validate the key columns via check_keys; this must run after `gsis_id` has been
# renamed to `player_id`, because check_keys looks up `player_id`.
check_keys(injuries_df)

In [12]:
# Distribution of report_status after back-filling.
injuries_df['report_status'].value_counts()

Optimistic      1943
Questionable    1853
Out              888
Resting          501
Doubtful         331
Personal          71
Name: report_status, dtype: int64

In [13]:
# Rows that fell through every back-fill rule and received the 'Uncertain' placeholder.
injuries_df.loc[injuries_df['report_status'] == 'Uncertain']

Unnamed: 0,season,game_type,team,week,player_id,position,full_name,first_name,last_name,report_primary_injury,report_secondary_injury,report_status,practice_primary_injury,practice_secondary_injury,practice_status,date_modified,primary_injury


In [15]:
# Load the position reference table and collect its abbreviations into a set.
positions = pd.read_csv("../../../nfl_capstone/data/raw/positions.csv")
index = set(positions['Pos'])
index

{'B',
 'BB',
 'C',
 'CB',
 'DB',
 'DE',
 'DG',
 'DL',
 'DT',
 'E',
 'FB',
 'FL',
 'FS',
 'G',
 'HB',
 'ILB',
 'K',
 'KR',
 'LB',
 'LCB',
 'LDE',
 'LDH',
 'LDT',
 'LE',
 'LG',
 'LH',
 'LILB',
 'LLB',
 'LOLB',
 'LOT',
 'LS',
 'LT',
 'MG',
 'MIKE',
 'MLB',
 'NT',
 'OG',
 'OL',
 'OLB',
 'OT',
 'P',
 'PR',
 'QB',
 'RB',
 'RCB',
 'RDE',
 'RDH',
 'RDT',
 'RE',
 'RET',
 'RG',
 'RH',
 'RILB',
 'RLB',
 'ROLB',
 'ROT',
 'RS',
 'RT',
 'RUSH',
 'S',
 'SAM',
 'SE',
 'SLB',
 'SS',
 'T',
 'TB',
 'TE',
 'WB',
 'WILL',
 'WLB',
 'WR'}

In [26]:

# Reference set of known position abbreviations.
positions = pd.read_csv("../../../nfl_capstone/data/raw/positions.csv")
index = set(positions['Pos'])

df = injuries_df
column_name = 'position'

# Collect every distinct value of the column that is absent from the reference set,
# then alert if any were found.
unknown_mask = ~df[column_name].isin(index)
bad_set = set(df.loc[unknown_mask, column_name].to_list())
assert_and_alert(not bad_set, msg=f"Unknown player positions: {bad_set}")


True

In [27]:
# Preview the position reference table (`Pos` abbreviation and its `Meaning`).
positions.head()

Unnamed: 0,Pos,Meaning
0,B,Back
1,BB,Blocking Back
2,C,Center
3,CB,Cornerback
4,DB,Defensive Back
