In [1]:
import pandas as pd
# Never truncate the column list when a frame is rendered
pd.options.display.max_columns = None

# Load the raw table from disk
df0 = pd.read_csv('master.txt')

# Dimensions first, then a preview of the leading rows
print(df0.shape)
df0.head()

(10076, 19)


Unnamed: 0,player,date,team,home,opp,game,week,day,completions,passatt,passyards,passtds,ints,sacks,sackyards,rushatt,rushyards,rushtds,fumbles
0,Geno Smith,2013-12-01,NYJ,1,MIA,12,13,Sun,4,10,29,0,1,1.0,8.0,1,2,0,0
1,Ryan Tannehill,2013-12-01,MIA,0,NYJ,12,13,Sun,28,43,331,2,1,1.0,3.0,3,22,0,0
2,Brandon Weeden,2013-12-01,CLE,1,JAX,12,13,Sun,24,40,370,3,2,3.0,28.0,2,5,0,2
3,Joe Flacco,2013-11-28,BAL,1,PIT,12,13,Thu,24,35,251,1,0,2.0,14.0,4,7,0,1
4,Matt Flynn,2013-11-28,GNB,0,DET,12,13,Thu,10,20,139,0,1,7.0,37.0,2,4,0,2


In [5]:
# sanitize & engineer
import numpy as np

def wrangle(df, base_season=2004):
    """Clean raw per-game stat rows and derive efficiency features.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``player``, ``date``, ``team``, ``home``,
        ``opp``, ``game``, ``completions``, ``passatt``, ``passyards``,
        ``passtds``, ``ints``, ``sacks``, ``sackyards``, ``rushatt``,
        ``rushyards``, ``rushtds`` and ``fumbles``. Any other column is
        ignored.
    base_season : int, default 2004
        Season whose first bin is numbered 0 in the sequential ``quint``
        feature. The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        One row per retained input row, with the columns ``player``,
        ``quint``, ``home``, ``team``, ``opp``, ``net%``, ``netatt``, ``ny``,
        ``ny/a``, ``car``, ``ry``, ``ypc``, ``tds``, ``tos`` (in that order).
        ``net%`` / ``ny/a`` are NaN where the 0/0 division occurs (no net
        pass attempts), and ``ypc`` is NaN for rows with no carries.

    Raises
    ------
    KeyError
        If a required column is missing.

    Notes
    -----
    ``game`` values outside ``(0, 20]`` fall outside every bin, and the
    integer cast of the binned value then fails.
    """
    df = df.copy()

    # fix missing `sacks`/`sackyards`: a missing value is treated as zero
    df['sacks'] = df['sacks'].fillna(0)
    df['sackyards'] = df['sackyards'].fillna(0)

    has_passes = df['passatt'] > 0
    has_sacks = df['sacks'] > 0
    has_carries = df['rushatt'] > 0

    # keep only rows with at least one attempt, sack, or carry
    active = has_passes | has_sacks | has_carries

    # inconsistent rows: pass yards without any pass play,
    # or rush yards without any carry
    weird_pass = ~has_passes & ~has_sacks & (df['passyards'] != 0)
    weird_rush = ~has_carries & (df['rushyards'] != 0)

    # All masks share the unfiltered index, so one combined filter is
    # equivalent to filtering twice and avoids boolean-index realignment.
    # The explicit copy keeps the assignments below off a slice view.
    df = df[active & ~(weird_pass | weird_rush)].copy()

    # some players' names have a trailing "*"; `regex=True` is required
    # because newer pandas treats the pattern literally by default
    df['player'] = df['player'].str.replace(r'\*$', '', regex=True)

    # parse `date` (the deprecated `infer_datetime_format` flag is not needed)
    dates = pd.to_datetime(df['date'])

    # season year: games played in January-March belong to the prior year
    year = dates.dt.year
    season = year.where(dates.dt.month > 3, year - 1).astype(int)

    # bin the `game` feature into five parts (from early to postseason)
    part = pd.cut(
        df['game'],
        bins=np.linspace(0, 20, 6),
        labels=range(5)
    ).astype(int)

    # fold season and part into a single sequential bin so that, eg, the
    # first part of one season comes right after the previous postseason
    df['quint'] = part + (season - base_season) * 5

    # net passing: sacks count as attempts, sack yardage is subtracted
    df['netpassyards'] = df['passyards'] - df['sackyards']
    df['netpassatt'] = df['passatt'] + df['sacks']
    df['netcmp%'] = df['completions'] / df['netpassatt']
    df['ny/a'] = df['netpassyards'] / df['netpassatt']

    # rushing efficiency; rows with no carries are left as NaN
    df['ypc'] = df['rushyards'] / df['rushatt']

    # touchdowns & turnovers
    df['tds'] = df['passtds'] + df['rushtds']
    df['tos'] = df['ints'] + df['fumbles']

    # keep only the engineered features, in presentation order;
    # every raw stat column is dropped by this selection
    order = ['player', 'quint', 'home', 'team', 'opp', 'netcmp%',
             'netpassatt', 'netpassyards', 'ny/a', 'rushatt', 'rushyards',
             'ypc', 'tds', 'tos']
    changem = {
        'netcmp%': 'net%',
        'netpassatt': 'netatt',
        'netpassyards': 'ny',
        'rushatt': 'car',
        'rushyards': 'ry'
    }
    return df[order].rename(columns=changem)

df1 = wrangle(df0)
print(df1.shape)
df1.head()

(9693, 14)




Unnamed: 0,player,quint,home,team,opp,net%,netatt,ny,ny/a,car,ry,ypc,tds,tos
0,Geno Smith,47,1,NYJ,MIA,0.363636,11.0,21.0,1.909091,1,2,2.0,0,1
1,Ryan Tannehill,47,0,MIA,NYJ,0.636364,44.0,328.0,7.454545,3,22,7.333333,2,1
2,Brandon Weeden,47,1,CLE,JAX,0.55814,43.0,342.0,7.953488,2,5,2.5,3,4
3,Joe Flacco,47,1,BAL,PIT,0.648649,37.0,237.0,6.405405,4,7,1.75,1,1
4,Matt Flynn,47,0,GNB,DET,0.37037,27.0,102.0,3.777778,2,4,2.0,0,3


In [6]:
# Count of missing values in each column of the wrangled frame
df1.isna().sum()

player       0
quint        0
home         0
team         0
opp          0
net%       306
netatt       0
ny           0
ny/a       306
car          0
ry           0
ypc       1758
tds          0
tos          0
dtype: int64

In [7]:
# Identifying columns shared by all three output tables
metacols = ['player', 'quint', 'home', 'team', 'opp']
passcols = [*metacols, 'net%', 'netatt', 'ny', 'ny/a']
rushcols = [*metacols, 'car', 'ry', 'ypc']
scorcols = [*metacols, 'tds', 'tos']

# A row belongs in a table only if its efficiency metric is defined
passcond = df1['net%'].notnull()
rushcond = df1['ypc'].notnull()

# Row filter and column selection in a single indexing step
df_pass = df1.loc[passcond, passcols]
df_rush = df1.loc[rushcond, rushcols]
df_scor = df1[scorcols]

print(df_pass.shape)
df_pass.head()

(9387, 9)


Unnamed: 0,player,quint,home,team,opp,net%,netatt,ny,ny/a
0,Geno Smith,47,1,NYJ,MIA,0.363636,11.0,21.0,1.909091
1,Ryan Tannehill,47,0,MIA,NYJ,0.636364,44.0,328.0,7.454545
2,Brandon Weeden,47,1,CLE,JAX,0.55814,43.0,342.0,7.953488
3,Joe Flacco,47,1,BAL,PIT,0.648649,37.0,237.0,6.405405
4,Matt Flynn,47,0,GNB,DET,0.37037,27.0,102.0,3.777778


In [8]:
# Rushing table: print its dimensions, then display the first rows
print(df_rush.shape)
df_rush.head()

(7935, 8)


Unnamed: 0,player,quint,home,team,opp,car,ry,ypc
0,Geno Smith,47,1,NYJ,MIA,1,2,2.0
1,Ryan Tannehill,47,0,MIA,NYJ,3,22,7.333333
2,Brandon Weeden,47,1,CLE,JAX,2,5,2.5
3,Joe Flacco,47,1,BAL,PIT,4,7,1.75
4,Matt Flynn,47,0,GNB,DET,2,4,2.0


In [9]:
# Scoring table: print its dimensions, then display the first rows
print(df_scor.shape)
df_scor.head()

(9693, 7)


Unnamed: 0,player,quint,home,team,opp,tds,tos
0,Geno Smith,47,1,NYJ,MIA,0,1
1,Ryan Tannehill,47,0,MIA,NYJ,2,1
2,Brandon Weeden,47,1,CLE,JAX,3,4
3,Joe Flacco,47,1,BAL,PIT,1,1
4,Matt Flynn,47,0,GNB,DET,0,3


In [65]:
# Write each cleaned table to its own CSV file, leaving out the row index
for _frame, _path in (
    (df_pass, 'clean-pass.txt'),
    (df_rush, 'clean-rush.txt'),
    (df_scor, 'clean-score.txt'),
):
    _frame.to_csv(_path, index=False)