In [42]:
import warnings
#warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
import collections

In [43]:
# Column groups shared by the two lists below (grouped by column-name prefix).
_PASSING_COLUMNS = ['cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int']
_RUSHING_COLUMNS = ['rush_att', 'rush_yds', 'rush_tds']
_RECEIVING_COLUMNS = ['rec', 'rec_yds', 'rec_tds']

# Statistic columns. Note that 'sk' is not part of this list.
STAT_COLUMNS = (
    ['st', 'carav', 'g']
    + _PASSING_COLUMNS
    + _RUSHING_COLUMNS
    + _RECEIVING_COLUMNS
    + ['tkl', 'def_int']
)

# Statistics that are only recorded for some positions; the notebook
# zero-fills their missing values further down.
POSITION_DEPENDENT_FEATURES = (
    _PASSING_COLUMNS
    + _RUSHING_COLUMNS
    + _RECEIVING_COLUMNS
    + ['tkl', 'def_int', 'sk']
)


In [44]:
# Load the draft dataset from nfl_draft.csv (the path is relative to the
# notebook's working directory) and preview the first rows.
data = pd.read_csv("nfl_draft.csv")
data.head()

Unnamed: 0,column_a,player_id,year,rnd,pick,tm,player,hof,pos,position_standard,...,rush_att,rush_yds,rush_tds,rec,rec_yds,rec_tds,tkl,def_int,sk,college_univ
0,2015Jameis Winston,WinsJa00,2015,1,1,TAM,Jameis Winston,No,QB,QB,...,86.0,311.0,7.0,,,,,,,Florida St.
1,2015Marcus Mariota,MariMa01,2015,1,2,TEN,Marcus Mariota,No,QB,QB,...,79.0,516.0,4.0,1.0,41.0,1.0,,,,Oregon
2,2015Dante Fowler,FowlDa00,2015,1,3,JAX,Dante Fowler,No,OLB,LB,...,,,,,,,16.0,,2.5,Florida
3,2015Amari Cooper,CoopAm00,2015,1,4,OAK,Amari Cooper,No,WR,WR,...,3.0,-3.0,0.0,134.0,1970.0,9.0,,,,Alabama
4,2015Brandon Scherff,ScheBr00,2015,1,5,WAS,Brandon Scherff,No,T,T,...,,,,,,,,,,Iowa


In [45]:
# Number of missing (null) entries in each column.
data.isnull().sum()

column_a                0
player_id            1219
year                    0
rnd                     0
pick                    0
tm                      0
player                  0
hof                     0
pos                     0
position_standard       0
first4av                0
age                  1245
to                   1382
ap1                     0
pb                      0
st                      0
carav                1382
drav                 2165
g                    1415
cmp                  7841
pass_att             7841
pass_yds             7841
pass_td              7841
pass_int             7841
rush_att             6789
rush_yds             6789
rush_tds             6789
rec                  6264
rec_yds              6264
rec_tds              6264
tkl                  4324
def_int              6819
sk                   4388
college_univ         3430
dtype: int64

In [46]:
# Several statistic columns are mostly empty. Our hypothesis: a position only
# accumulates the statistics it actually produces -- e.g. a linebacker will
# likely never throw a football, so `cmp` (completions) is null for them.
#
# To check this, collect every row with a null `cmp`; the next cell compares
# how many of those rows are quarterbacks against the total.
null_cmps = data.loc[data['cmp'].isnull()]
null_cmps


Unnamed: 0,column_a,player_id,year,rnd,pick,tm,player,hof,pos,position_standard,...,rush_att,rush_yds,rush_tds,rec,rec_yds,rec_tds,tkl,def_int,sk,college_univ
2,2015Dante Fowler,FowlDa00,2015,1,3,JAX,Dante Fowler,No,OLB,LB,...,,,,,,,16.0,,2.5,Florida
3,2015Amari Cooper,CoopAm00,2015,1,4,OAK,Amari Cooper,No,WR,WR,...,3.0,-3.0,0.0,134.0,1970.0,9.0,,,,Alabama
4,2015Brandon Scherff,ScheBr00,2015,1,5,WAS,Brandon Scherff,No,T,T,...,,,,,,,,,,Iowa
5,2015Leonard Williams,WillLe02,2015,1,6,NYJ,Leonard Williams,No,DE,DE,...,,,,,,,56.0,,9,USC
6,2015Kevin White,WhitKe00,2015,1,7,CHI,Kevin White,No,WR,WR,...,1.0,9.0,0.0,19.0,187.0,0.0,,,,West Virginia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8430,1985Raymond Polk,,1985,12,332,RAI,Raymond Polk,No,DB,DB,...,,,,,,,,,Oklahoma St.,
8431,1985Paul Woodside,,1985,12,333,BUF,Paul Woodside,No,K,K,...,,,,,,,,,West Virginia,
8432,1985Dan Lynch,,1985,12,334,DEN,Dan Lynch,No,G,G,...,,,,,,,,,Washington St.,
8433,1985Ray Noble,,1985,12,335,MIA,Ray Noble,No,DB,DB,...,,,,,,,,,California,


In [47]:
# Quarterbacks among the rows with a null `cmp`. The notebook's reading of this
# result: very few quarterbacks lack `cmp` while many non-quarterbacks do, so a
# null is taken to mean "this position does not record the statistic" and the
# position-dependent features (completions, rush attempts, touchdowns, etc.)
# are imputed with 0.
null_cmps[null_cmps['pos'] == 'QB']

Unnamed: 0,column_a,player_id,year,rnd,pick,tm,player,hof,pos,position_standard,...,rush_att,rush_yds,rush_tds,rec,rec_yds,rec_tds,tkl,def_int,sk,college_univ
74,2015Garrett Grayson,GrayGa00,2015,3,75,NOR,Garrett Grayson,No,QB,QB,...,,,,,,,,,,Colorado St.
418,2014Aaron Murray,MurrAa00,2014,5,163,KAN,Aaron Murray,No,QB,QB,...,,,,,,,,,,Georgia
438,2014David Fales,FaleDa00,2014,6,183,CHI,David Fales,No,QB,QB,...,,,,,,,,,,San Jose St.
449,2014Keith Wenning,WennKe00,2014,6,194,BAL,Keith Wenning,No,QB,QB,...,,,,,,,,,,Ball St.
468,2014Tajh Boyd,BoydTa00,2014,6,213,NYJ,Tajh Boyd,No,QB,QB,...,,,,,,,,,,Clemson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8330,1985Steve Calabria,,1985,9,232,TAM,Steve Calabria,No,QB,QB,...,,,,,,,,,Colgate,
8332,1985Paul Berner,,1985,9,234,SDG,Paul Berner,No,QB,QB,...,,,,,,,,,Pacific,
8367,1985Buddy Funck,,1985,10,269,DEN,Buddy Funck,No,QB,QB,...,,,,,,,,,New Mexico,
8375,1985John Conner,,1985,10,277,SEA,John Conner,No,QB,QB,...,,,,,,,,,Arizona,


In [48]:
# Zero-fill the position-dependent statistics, then re-count the nulls per
# column to confirm those columns no longer contain any.
# NOTE(review): the 1985 rows previewed earlier show college names in the `sk`
# column and an empty `college_univ`, so `sk` may hold misaligned text for some
# rows -- verify against the CSV before treating it as numeric.
data[POSITION_DEPENDENT_FEATURES] = data.loc[:, POSITION_DEPENDENT_FEATURES].fillna(value=0)
data.isnull().sum()

column_a                0
player_id            1219
year                    0
rnd                     0
pick                    0
tm                      0
player                  0
hof                     0
pos                     0
position_standard       0
first4av                0
age                  1245
to                   1382
ap1                     0
pb                      0
st                      0
carav                1382
drav                 2165
g                    1415
cmp                     0
pass_att                0
pass_yds                0
pass_td                 0
pass_int                0
rush_att                0
rush_yds                0
rush_tds                0
rec                     0
rec_yds                 0
rec_tds                 0
tkl                     0
def_int                 0
sk                      0
college_univ         3430
dtype: int64

In [49]:
# Row count per standardized position. LS is a specialized version of C, so
# the two can be combined into the same position.
data['position_standard'].value_counts()


DB    1569
LB    1124
WR    1049
RB     809
DE     731
DT     640
T      619
G      542
TE     496
QB     398
C      225
K       78
FB      77
P       77
LS       1
Name: position_standard, dtype: int64

In [50]:
# Fold LS into C, then re-count to confirm that LS no longer appears.
data['position_standard'] = data['position_standard'].replace({'LS': 'C'})
data['position_standard'].value_counts()

DB    1569
LB    1124
WR    1049
RB     809
DE     731
DT     640
T      619
G      542
TE     496
QB     398
C      226
K       78
FB      77
P       77
Name: position_standard, dtype: int64

In [51]:
# Pairwise correlations, printed in this order: rnd/drav, rnd/carav, drav/carav.
for _left, _right in (('rnd', 'drav'), ('rnd', 'carav'), ('drav', 'carav')):
    print(data[_left].corr(data[_right]))

# drav and carav are strongly correlated with each other (third value above),
# so they describe largely the same thing. The stated decision is to keep carav
# and drop drav, because drav's correlation with rnd is the weaker of the two.
# NOTE(review): the earlier wording expanded these names as "career average"
# and "draft average"; they look like Approximate Value columns (whole career
# vs. with the drafting team) -- confirm against the data source.
# NOTE(review): no cell visible in this notebook actually drops drav -- confirm
# it is excluded before modelling.
data.loc[:, [
    'st', 'g',
    'cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int',
    'rush_att', 'rush_yds', 'rush_tds',
    'rec', 'rec_yds', 'rec_tds',
    'tkl', 'def_int', 'sk',
]].head()



-0.36528440711228094
-0.3756977577406549
0.8715160708472572


Unnamed: 0,st,g,cmp,pass_att,pass_yds,pass_td,pass_int,rush_att,rush_yds,rush_tds,rec,rec_yds,rec_tds,tkl,def_int,sk
0,2,26.0,540.0,913.0,6722.0,42.0,25.0,86.0,311.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,23.0,458.0,725.0,5590.0,42.0,18.0,79.0,516.0,4.0,1.0,41.0,1.0,0.0,0.0,0.0
2,0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,2.5
3,1,26.0,0.0,0.0,0.0,0.0,0.0,3.0,-3.0,0.0,134.0,1970.0,9.0,0.0,0.0,0.0
4,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Remove designations that are future data (received after the player was
# drafted) and therefore cannot be used in prediction: ap1 (All-Pro
# designation), pb (Pro Bowler designation) and hof (Hall of Fame designation).
data.drop(columns=['ap1', 'pb', 'hof'], inplace=True)

# Remove identifying fields (the column_a key, player id, team, college) that
# do not help with the prediction.
data.drop(columns=['column_a', 'player_id', 'tm', 'college_univ'], inplace=True)

In [64]:
def transform(x, cutoff_year=1993, picks_per_round=32):
    """Recompute the round of an older draft pick using a fixed round size.

    Rows whose ``year`` is at or before ``cutoff_year`` get ``rnd`` replaced by
    the round their overall ``pick`` number falls into when every round holds
    ``picks_per_round`` picks (picks 1-32 -> round 1, 33-64 -> round 2, ...).
    Rows from later years are returned untouched.

    Args:
        x: A row-like mapping (e.g. a pandas Series or a dict) with ``year``,
            ``pick`` and ``rnd`` entries. It must provide ``copy()``.
        cutoff_year: Last draft year whose rounds are re-derived from ``pick``.
        picks_per_round: Number of picks assumed per standardized round.

    Returns:
        ``x`` itself when no change is needed, otherwise a copy of ``x`` with
        ``rnd`` updated. The argument is never modified, because functions
        handed to ``DataFrame.apply`` should not mutate the object they get.
    """
    if x['year'] > cutoff_year:
        return x

    x = x.copy()
    # Picks are 1-based, so shift to 0-based before the integer division;
    # otherwise the last pick of each round (32, 64, ...) spills into the
    # following round.
    x['rnd'] = 1 + (int(x['pick']) - 1) // picks_per_round
    return x

# Re-derive the rounds row by row with `transform`; 'broadcast' keeps the
# frame's original index and columns.
data = data.apply(transform, axis='columns', result_type='broadcast')

# The round is the prediction target.
# (Disabled experiment kept for reference: labels[labels > 3] = 4.)
labels = data['rnd']

# pick directly correlates with round, so keeping it as a feature would be
# data leakage.
data.drop(columns=['pick'], inplace=True)

print(data.head())

   year rnd           player  pos position_standard first4av   age      to st  \
0  2015   1   Jameis Winston   QB                QB       13  21.0  2016.0  2   
1  2015   1   Marcus Mariota   QB                QB        9  21.0  2016.0  2   
2  2015   1     Dante Fowler  OLB                LB        0  21.0  2016.0  0   
3  2015   1     Amari Cooper   WR                WR        9  21.0  2016.0  1   
4  2015   1  Brandon Scherff    T                 T        7  23.0  2016.0  2   

  carav  ... pass_int rush_att rush_yds rush_tds    rec rec_yds rec_tds   tkl  \
0  13.0  ...     25.0     86.0    311.0      7.0    0.0     0.0     0.0   0.0   
1   9.0  ...     18.0     79.0    516.0      4.0    1.0    41.0     1.0   0.0   
2   0.0  ...      0.0      0.0      0.0      0.0    0.0     0.0     0.0  16.0   
3   9.0  ...      0.0      3.0     -3.0      0.0  134.0  1970.0     9.0   0.0   
4   7.0  ...      0.0      0.0      0.0      0.0    0.0     0.0     0.0   0.0   

  def_int   sk  
0     0.0

In [None]:
# Show first4av next to the round label.
data[['first4av', 'rnd']]

In [None]:
# Tally how many rows carry each round label, then print the tally as a plain
# dict to expose any class imbalance between the rounds.
frequency = collections.Counter(labels)
print({round_label: count for round_label, count in frequency.items()})


In [None]:
# Imports used by this cell. accuracy_score was previously called below
# without ever being imported, which raised a NameError.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the split and the fitted tree are the same on every run.
RANDOM_STATE = 42

# NOTE(review): `features` is not defined in any cell visible here -- confirm
# it is built from `data` (without the `rnd` label) before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.20, random_state=RANDOM_STATE
)

# Labels are cast to int before being handed to the classifiers.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Decision tree: fitted on the training split, scored on the held-out split.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)
dtc.fit(X_train, y_train)

# Gaussian naive Bayes: 10-fold cross-validation over the full data set (not
# the split above), reported as the mean of the fold accuracies.
accuracies = cross_val_score(GaussianNB(), X=features, y=labels.astype('int'), cv=10)

print('Accuracy of model:', accuracy_score(y_test, dtc.predict(X_test)))
print('Accuracy of model (gaussianNB):', sum(accuracies)/len(accuracies))
