In [None]:
import warnings
#warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
import collections

In [None]:
# Names of the per-player statistic columns, in the order used by this notebook.
STAT_COLUMNS = [
    'st',
    'carav',
    'g',
    # passing
    'cmp',
    'pass_att',
    'pass_yds',
    'pass_td',
    'pass_int',
    # rushing
    'rush_att',
    'rush_yds',
    'rush_tds',
    # receiving
    'rec',
    'rec_yds',
    'rec_tds',
    # tackles / interceptions
    'tkl',
    'def_int',
]

# Statistics that only some positions accumulate; their missing values are
# filled with 0 further down in the notebook.
POSITION_DEPENDENT_FEATURES = [
    # passing
    'cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int',
    # rushing
    'rush_att', 'rush_yds', 'rush_tds',
    # receiving
    'rec', 'rec_yds', 'rec_tds',
    # remaining position-specific columns
    'tkl', 'def_int', 'sk',
]


In [None]:
# Read the draft dataset from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="nfl_draft.csv")
data.head(n=5)

In [None]:
# Number of missing entries in each column.
data.isna().sum(axis=0)

In [None]:
# Many features contain nulls. Our hypothesis is that certain positions simply
# never record certain statistics: a linebacker, for example, will likely never
# throw a football, so 'cmp' (completions) is null for them.
#
# To check this, collect every row with a null 'cmp'; the next cell looks at how
# many of those rows are QBs compared with the total.
missing_cmp_mask = data['cmp'].isna()
null_cmps = data[missing_cmp_mask]
null_cmps


In [None]:
# Very few QB rows have a null 'cmp', while many non-QB rows do. Because of
# this, we impute 0 for the position-dependent features (completions, rush
# attempts, touchdowns, etc.).
null_cmps[null_cmps['pos'] == 'QB']

In [None]:
# Replace missing position-dependent statistics with 0, then re-check the
# remaining null counts per column.
filled_stats = data[POSITION_DEPENDENT_FEATURES].fillna(value=0)
data[POSITION_DEPENDENT_FEATURES] = filled_stats
data.isna().sum()

In [None]:
# Tally players per standardized position. LS is a specialized version of C,
# so the two can be combined into a single position.
position_counts = data['position_standard'].value_counts()
position_counts


In [None]:
# Fold LS into C and show the updated tallies.
merged_positions = data['position_standard'].replace(to_replace=['LS'], value='C')
data['position_standard'] = merged_positions
data['position_standard'].value_counts()

In [None]:
def transform(x, picks_per_round=32, last_legacy_year=1993):
    """Recompute the draft round from the overall pick for older drafts.

    Drafts up to and including ``last_legacy_year`` did not use
    ``picks_per_round`` picks per round, so their round is re-derived from the
    overall pick number to match today's layout. Later drafts are left as-is.

    Args:
        x: A row-like mapping (e.g. a DataFrame row) with 'year', 'pick' and
            'rnd' entries. It is modified in place.
        picks_per_round: Number of picks that make up one standardized round.
        last_legacy_year: Last draft year whose rounds are re-derived.

    Returns:
        The same ``x`` object, with 'rnd' possibly overwritten.
    """
    if x['year'] <= last_legacy_year:
        # Assumes overall pick numbers start at 1: picks 1..32 -> round 1,
        # 33..64 -> round 2, and so on. Subtracting 1 before the floor division
        # keeps the last pick of a round (32, 64, ...) in its own round rather
        # than pushing it into the next one.
        x['rnd'] = (int(x['pick']) - 1) // picks_per_round + 1
    return x

# Run the round standardization on every row. result_type='broadcast' keeps the
# frame's original index and columns in the result.
# NOTE(review): row-wise apply is slow on large frames and may change column
# dtypes — confirm the dtypes of `data` after this cell.
data = data.apply(transform, axis='columns', result_type='broadcast')



In [None]:
# Bucket the numeric draft round into three coarse classes.
#
# The bucketing is done in a single pd.cut call on a numeric copy of the
# column. Combining the two range conditions with Python's `and` raises
# "truth value of a Series is ambiguous", and comparing data['rnd'] itself
# after string labels have been written into it mixes str and int.
rounds_as_ints = pd.to_numeric(data['rnd'])
data['rnd'] = pd.cut(
    rounds_as_ints,
    bins=[-np.inf, 3, 6, np.inf],   # intervals (-inf, 3], (3, 6], (6, inf)
    labels=['1-3', '4-6', '>7'],    # note: the '>7' label covers round 7 and later
).astype(object)                    # plain string labels; missing rounds stay NaN
data

In [None]:
# Target variable: the draft-round column.
labels = data['rnd']

# 'pick' directly correlates with the round, so keeping it as a feature would
# be data leakage.
features = data.drop(columns=['pick'])
print(features.head())

In [None]:
# Correlation between the draft round and the two value metrics.
#
# data['rnd'] holds string bucket labels by this point, which Series.corr cannot
# convert to float. Encode each bucket by its rank first; values that are not
# bucket labels (e.g. rounds that are still numeric) pass through unchanged.
round_bucket_rank = {'1-3': 0, '4-6': 1, '>7': 2}
round_rank = pd.to_numeric(data['rnd'].map(lambda r: round_bucket_rank.get(r, r)))
drav = pd.to_numeric(data['drav'])
carav = pd.to_numeric(data['carav'])

print(round_rank.corr(drav))
print(round_rank.corr(carav))
print(drav.corr(carav))

# drav and carav describe nearly the same thing, as their mutual correlation
# shows (presumably career value vs. value for the drafting team — confirm
# against the dataset's data dictionary).
# We decided to drop drav because its correlation with rnd is lower than
# carav's and the two are so similar.
# NOTE(review): no visible cell actually removes 'drav' from the features —
# confirm whether it should be added to the dropped columns.
data[[
    'st', 'g',
    'cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int',
    'rush_att', 'rush_yds', 'rush_tds',
    'rec', 'rec_yds', 'rec_tds',
    'tkl', 'def_int', 'sk',
]].head()



In [None]:
# Build the feature frame from `data` by removing every column that must not
# (or cannot) be used for prediction. All drops happen in one call so that
# re-running the cell always gives the same frame.
features = data.drop(columns=[
    # Awarded or known only after the draft, so unusable for prediction:
    # ap1 (all-pro designation), pb (Pro Bowler designation),
    # hof (hall of fame designation), and 'to' (presumably the final season —
    # confirm).
    'ap1', 'pb', 'hof', 'to',
    # Overall pick directly determines the round; keeping it would be data
    # leakage. It has to be dropped here because this frame is derived from
    # `data`, which still contains it.
    'pick',
    # Identifiers and bookkeeping (name / player id, team, raw position, ...)
    # that do not help the prediction.
    'column_a', 'player_id', 'tm', 'player', 'pos',
    # Too many nulls to impute.
    'college_univ',
])

print(features.columns)

In [None]:
# QB rows among the rows captured with a null 'cmp'.
qb_mask = null_cmps['pos'] == 'QB'
null_cmps[qb_mask]

In [None]:
# Scatter 'first4av' against the draft-round label. The bare column selection
# that used to precede the plot produced no output and has been removed.
fig, ax = plt.subplots()
ax.scatter(x=features['first4av'], y=features['rnd'], marker='x')
ax.set_xlabel('first4av')
ax.set_ylabel('rnd')
ax.set_title('first4av by draft round')
plt.show()

In [None]:
# One-hot encode the standardized position and finish the feature matrix.
one_hot = pd.get_dummies(features['position_standard'])

# 'rnd' is the prediction target (kept separately as the labels), so it must
# not stay among the features: it would leak the answer and, being a string
# column, cannot be fed to the classifiers. errors='ignore' keeps the cell
# working if 'rnd' has already been removed.
features = (
    features
    .drop(columns=['position_standard', 'rnd'], errors='ignore')
    .join(one_hot)
)
features

In [None]:
# Count how often each round label occurs so that any class imbalance between
# the rounds is visible.
frequency = collections.Counter()
frequency.update(labels)

print(dict(frequency))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: the tree permutes candidate features at each split, so without
# a random_state the reported accuracy can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)

# To tune hyper-parameters with nested cross-validation, build the grid search
# below (inner loop) and pass `grid_search` instead of `clf` to cross_val_score:
# params = {"max_depth": [5,10,15,20], "min_samples_leaf": [5,10,15,20]}
# grid_search = GridSearchCV(clf, params, cv=5, scoring='accuracy') #inner loop

# 5-fold cross-validated accuracy (outer loop).
nested_score = cross_val_score(clf, features, labels, cv=5)
print("Accuracy:", nested_score.mean()*100)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 10-fold cross-validated accuracy of a Gaussian naive Bayes classifier.
clf = GaussianNB()
scores = cross_val_score(estimator=clf, X=features, y=labels, cv=10)
print("Accuracy:", scores.mean()*100)

# Alternative: collect the out-of-fold predictions with cross_val_predict
# (instead of cross_val_score) to analyze the results in more detail.
clf = GaussianNB()
predicts = cross_val_predict(estimator=clf, X=features, y=labels, cv=10)
print("Predictions:", predicts)
print("Confusion Matrix:\n", confusion_matrix(y_true=labels, y_pred=predicts))
print("Report:\n", classification_report(y_true=labels, y_pred=predicts))