## Prepare Data

Get seasons wise kickers and pro bowl data

In [1]:
import mysql.connector 
import pandas as pd
import numpy as np
from pandas import DataFrame
import matplotlib.mlab as mlab
from mysql.connector import errorcode
import matplotlib.pyplot as plt
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display, clear_output
from IPython.html.widgets import *
import plotly as py
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
import os
import json
import time
import pickle
import requests
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

%matplotlib inline



In [2]:
CONFIG = {
    'user': 'db_gtown_2018',
    'password': '***',
    'port': '3306',
    'host': 'nflnumbers.czuayagz62va.us-east-1.rds.amazonaws.com',
    'database': 'db_nfl',
    'raise_on_warnings': True,
}

QUERY = """SELECT 
	PBP.FKICKER AS Player_ID,            
    PBP.GID,
    GAME.SEAS AS Season,    
    CASE WHEN LOWER(COND) in ('rain','showers','snow','foggy','hazy','thunderstorms','cold', 'windy','overcast') THEN 1 ELSE 0 END AS Is_Weather_Extreme,
    CASE WHEN LOWER(SURF) LIKE '%turf%' THEN 1 ELSE 0 END AS Is_Turf_Surface,                
    CASE WHEN GOOD = 'Y' THEN 1 ELSE 0 END AS Success,
	CASE WHEN UPPER(DETAIL) LIKE '%BLOCKED%' THEN 1 ELSE 0 END AS Blocked,
	CASE WHEN QTR IN ('2' , '4') AND MIN <= 2 AND (PTSO - PTSD) >= - 3 THEN 1 ELSE 0 END AS Is_High_Pressure,
	CASE WHEN QTR IN ('2' , '4') AND MIN <= 2 AND (PTSO - PTSD) >= - 3 AND GOOD = 'Y' THEN 1 ELSE 0	END AS High_Pressure_Success,
	CASE WHEN DIST >= 50 THEN 1 ELSE 0 END AS Is_Long_Distance,
	CASE WHEN DIST >= 50 AND GOOD = 'Y' THEN 1 ELSE 0 END AS Long_Distance_Success,
	DIST - (100 - CASE WHEN YFOG = '' THEN '98' ELSE YFOG END) AS YDS_Behind_LOS,
    DIST as Distance,
	(GAME.SEAS - PLAYER.start) AS Years_Played,
    TEMP,HUMD,WSPD,
	height,
	weight,
	PLAYER.forty,
	PLAYER.bench,
	PLAYER.vertical,
	PLAYER.broad,
	PLAYER.shuttle,
	PLAYER.cone,
	PLAYER.arm,
	PLAYER.hand,
	PLAYER.dcp,
	DPOS AS Draft_Position,
    CASE WHEN PRO_BOWL.ProBowl_Level IS NULL THEN 0 ELSE 1 END AS In_ProBowlTeam
FROM db_nfl.PBP
LEFT OUTER JOIN db_nfl.PLAYER PLAYER ON PBP.FKICKER = PLAYER.PLAYER
LEFT OUTER JOIN db_nfl.GAME ON PBP.GID = GAME.GID
LEFT OUTER JOIN db_nfl.PRO_BOWL ON PRO_BOWL.PLAYER_ID = PLAYER.PLAYER AND GAME.SEAS = ProBowl_Year 
WHERE POS1 = 'K' 
        ;"""

FEATURES = ['Player_ID','GID','Season','Is_Weather_Extreme','Is_Turf_Surface','Success','Blocked','Is_High_Pressure',
        'High_Pressure_Success','Is_Long_Distance','Long_Distance_Success','YDS_Behind_LOS','Distance',
        'YEARS_PLAYED','TEMP','HUMD','WSPD', 
        'height','weight','forty','bench',
        'vertical','broad','shuttle','cone',
        'arm','hand','dcp','Draft_Position','In_ProBowlTeam']

def fetch_data():   
    try:
        cnx = mysql.connector.connect(**CONFIG)
        cursor = cnx.cursor()
        #Let's read all the rows in the table
        cursor.execute(QUERY)
        #specify the attributes that you want to display
        df = DataFrame(cursor.fetchall())    
        df.columns = FEATURES
        cnx.commit()
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)
    else:
        
        return df
        cursor.close()
        cnx.close()
DATA = fetch_data()     

In [3]:
DATA.head(3)

Unnamed: 0,Player_ID,GID,Season,Is_Weather_Extreme,Is_Turf_Surface,Success,Blocked,Is_High_Pressure,High_Pressure_Success,Is_Long_Distance,...,bench,vertical,broad,shuttle,cone,arm,hand,dcp,Draft_Position,In_ProBowlTeam
0,WR-0500,1,2000,0,1,1,0,0,0,0,...,0,0.0,0,0.0,0.0,0,0.0,0,0,0
1,MA-0700,1,2000,0,1,1,0,0,0,0,...,0,0.0,0,0.0,0.0,0,0.0,0,86,0
2,MA-0700,1,2000,0,1,1,0,0,0,0,...,0,0.0,0,0.0,0.0,0,0.0,0,86,0


### Missing value treatement

In [4]:
#missing value check
DATA.isnull().sum()

Player_ID                0
GID                      0
Season                   0
Is_Weather_Extreme       0
Is_Turf_Surface          0
Success                  0
Blocked                  0
Is_High_Pressure         0
High_Pressure_Success    0
Is_Long_Distance         0
Long_Distance_Success    0
YDS_Behind_LOS           0
Distance                 0
YEARS_PLAYED             0
TEMP                     0
HUMD                     0
WSPD                     0
height                   0
weight                   0
forty                    0
bench                    0
vertical                 0
broad                    0
shuttle                  0
cone                     0
arm                      0
hand                     0
dcp                      0
Draft_Position           0
In_ProBowlTeam           0
dtype: int64

## Feature analysis & selection

### Check if it is separable classficiation problem

This looks like good separaable classification problem, we can also see the success percentage and counts have more impact on 1s.

In [5]:
# check correlation among quantitaive features and pick only one feature out of two too correlated features
corr_matrix = DATA.corr().abs()
high_corr_var=np.where(corr_matrix>0.8)
high_corr_var=[(corr_matrix.index[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
high_corr_var

[('Is_High_Pressure', 'High_Pressure_Success'),
 ('shuttle', 'cone'),
 ('arm', 'hand')]

Select features after correaltion check and remove 'TEMP', 'HUMD', 'WSPD' becuase we will use COND column which has same data

In [6]:
DATA.columns

Index(['Player_ID', 'GID', 'Season', 'Is_Weather_Extreme', 'Is_Turf_Surface',
       'Success', 'Blocked', 'Is_High_Pressure', 'High_Pressure_Success',
       'Is_Long_Distance', 'Long_Distance_Success', 'YDS_Behind_LOS',
       'Distance', 'YEARS_PLAYED', 'TEMP', 'HUMD', 'WSPD', 'height', 'weight',
       'forty', 'bench', 'vertical', 'broad', 'shuttle', 'cone', 'arm', 'hand',
       'dcp', 'Draft_Position', 'In_ProBowlTeam'],
      dtype='object')

FEATURES_SELECTED =['Player_ID', 'GID', 'Season', 'Is_Weather_Extreme', 'Is_Turf_Surface',
       'Success', 'Blocked', 'Is_High_Pressure', 'High_Pressure_Success',
       'Is_Long_Distance', 'Long_Distance_Success', 'YDS_Behind_LOS',
       'Distance', 'YEARS_PLAYED', 'TEMP', 'HUMD', 'WSPD', 'height', 'weight',
       'forty', 'bench', 'vertical', 'broad', 'shuttle', 'arm',
       'dcp', 'Draft_Position', 'In_ProBowlTeam']
DATA=DATA[FEATURES_SELECTED]

In [7]:
FEATURES_SELECTED =['Player_ID', 'GID', 'Season', 
       'Success', 'Blocked', 'Is_High_Pressure', 'High_Pressure_Success',
       'Is_Long_Distance', 'Long_Distance_Success', 'YDS_Behind_LOS',
       'Distance', 'YEARS_PLAYED', 'In_ProBowlTeam']
DATA=DATA[FEATURES_SELECTED]

In [8]:
DATA.head(10)

Unnamed: 0,Player_ID,GID,Season,Success,Blocked,Is_High_Pressure,High_Pressure_Success,Is_Long_Distance,Long_Distance_Success,YDS_Behind_LOS,Distance,YEARS_PLAYED,In_ProBowlTeam
0,WR-0500,1,2000,1,0,0,0,0,0,18.0,20,2,0
1,MA-0700,1,2000,1,0,0,0,0,0,18.0,43,18,0
2,MA-0700,1,2000,1,0,0,0,0,0,17.0,44,18,0
3,MA-0700,1,2000,1,0,0,0,0,0,18.0,24,18,0
4,MA-0700,1,2000,1,0,0,0,0,0,18.0,44,18,0
5,MA-0700,1,2000,1,0,1,1,0,0,18.0,20,18,0
6,WR-0500,1,2000,1,0,0,0,0,0,18.0,20,2,0
7,MA-0700,1,2000,1,0,1,1,0,0,17.0,48,18,0
8,MA-0700,1,2000,1,0,0,0,0,0,18.0,20,18,0
9,WR-0500,1,2000,1,0,0,0,0,0,18.0,20,2,0


## Prepare data for modeling

In [10]:
# get quantitaive features
dt = DATA
dt.apply(pd.to_numeric, errors='ignore')
import sklearn
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

#data for modeling , before 2016 data
dt_modeling = dt[dt['Season'] <2017].iloc[:,3:]
#train_data=scaler.fit_transform(dt_modeling.iloc[:, 0:-1])
train_data=dt_modeling.iloc[:, 0:-1]
#dt_modeling_data = pd.DataFrame(train_data,index=train_data[:,0])
dt_modeling_data=dt_modeling.iloc[:, 0:-1]
target_modeling =dt_modeling.iloc[:, -1]

#data for out of bag testing of model - 2016/2017 dta
dt_predict = dt[dt['Season']==2017].iloc[:,3:]
dt_predict_data = dt_predict.iloc[:, 0:-1]
target_predict = dt_predict.iloc[:, -1]

In [11]:
import time 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import model_selection
from sklearn.model_selection import train_test_split as tts 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Identify our data and target 
X = dt_modeling_data
y = target_modeling

# Create random train and test splits to avoid bais and overfitting
splits = tts(X, y, test_size=1)
X_train, X_test, y_train, y_test = splits

In [12]:
results=[]
names=[]

def fit_model(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    start = time.time()
    model.fit(X_train, y_train)
    duration = time.time() - start 
    score = model.score(X_test, y_test)
    
    print("{} fit in {:0.2f} seconds score: {:0.4f}".format(model.__class__.__name__, duration, score))
    print(model.feature_importances_)
    #print(model.get_params()) 
    kfold = model_selection.KFold(n_splits=10, random_state=7)
    scoring = 'accuracy'
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    #print(cv_results)
    names.append(name)
    #msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    #print(msg)


In [18]:
# prepare models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from yellowbrick.classifier import ClassificationReport


# Training classifiers
CART = DecisionTreeClassifier(max_depth=6)
KNN = KNeighborsClassifier(n_neighbors=12)
SV = SVC(kernel='rbf', probability=True)
VC = VotingClassifier(estimators=[('dt', CART), ('knn', KNN), ('svc', SV)], voting='soft', weights=[3,3,3])

ABC = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                         algorithm="SAMME",
                         n_estimators=500)

models = []
#models.append(('KNN', KNN))
models.append(('CART', CART))
models.append(('SVM', SV))
#models.append(('VC', VC))
models.append(('ABC', ABC))

    
for name,model in models:
    fit_model(model)    
print(FEATURES_SELECTED)    
    # boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

DecisionTreeClassifier fit in 0.10 seconds score: 1.0000
[3.39464009e-02 0.00000000e+00 5.30162385e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.37377976e-01 2.94881563e-02
 7.98657305e-01]
SVC fit in 97.06 seconds score: 1.0000


AttributeError: 'SVC' object has no attribute 'feature_importances_'

In [None]:
seas=2017
dt_predict = dt[dt['Season']==seas].iloc[:,3:]
dt_predict_data = dt_predict.iloc[:, 0:-1]
target_predict = dt_predict.iloc[:, -1]

X = dt_predict_data 
y = target_predict

# 2016 - 36 0:33 1:3
# 2017 - 43 0:40 1:3

def predict_model(model):
    yhat = model.predict(dt_predict_data)
    print(classification_report(y, yhat))
      # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(model, classes=['No', 'Yes'])    
    visualizer.poof() 

    d= DATA.loc[(DATA["Season"]==seas)]
    d['yhat'] = yhat
    #print("Season: {}, 0s: {}, 1s:{}".format(seas,len(X[X['INPROBOWLTEAM'] ==0]),len(X[X['INPROBOWLTEAM'] ==1])))
    unmatched_results = d.loc[( d['yhat'] != d['In_ProBowlTeam'])]
    matched_results = d.loc[( d['yhat'] == d['In_ProBowlTeam'])]
    print("unmatched: count {}, ".format(len(unmatched_results), unmatched_results))
    print("Matched: count {},".format(len(matched_results), matched_results))
    
    #print(d.loc[( d['yhat'] != d['INPROBOWLTEAM']),['FKICKER','GID','INPROBOWLTEAM','yhat']])
    print('Predicted players'.format(d.loc[( d['yhat'] == 1),'Player_ID'].unique()))
    print('Actual playerss'.format(d.loc[( d['In_ProBowlTeam'] == 1),'Player_ID'].unique()))

    
    #print(len(DATA.loc[(DATA["SEASON"]==seas) & (DATA["INPROBOWLTEAM"] == yhat), "INPROBOWLTEAM"]))
    

for name, model in models:
    print('MODEL-'.format(name))
    predict_model(model)  

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

for name, model in models:
    logit_roc_auc = roc_auc_score(y, model.predict(X))
    fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:,1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(name + ' - Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()