In [1]:
import pandas as pd
import numpy as np
from kick_cleaner import clean_kick_data, replace_kicker_with_num, kick_dictionary, model_comparison

In [2]:
kick_df = clean_kick_data('field_goals.csv')
kick_df.head()

Unnamed: 0,FieldGoalDistance,HomeTeam,Month,KickerName,FieldGoalResult
0,25.0,DEN,9,J.Tucker,1.0
1,30.0,DEN,9,J.Tucker,1.0
2,48.0,BUF,9,S.Gostkowski,1.0
3,33.0,BUF,9,S.Gostkowski,1.0
4,35.0,BUF,9,S.Gostkowski,1.0


**turns kickers and stadiums into numerical values, change them in the dataframe**

In [3]:
kickers_dict = kick_dictionary(kick_df, 'KickerName')
stadium_dict = kick_dictionary(kick_df, 'HomeTeam')


In [4]:
kick_df1 = replace_kicker_with_num(kick_df, 'KickerName')
kick_df2 = replace_kicker_with_num(kick_df1, 'HomeTeam')

## for log loss kickers/stadius have to be all dummy variables...

In [5]:
kick_df2.head()

Unnamed: 0,FieldGoalDistance,HomeTeam,Month,KickerName,FieldGoalResult
0,25.0,1,9,1,1.0
1,30.0,1,9,1,1.0
2,48.0,2,9,2,1.0
3,33.0,2,9,2,1.0
4,35.0,2,9,2,1.0


### splitting data and testing models 

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_log_error, mean_squared_error

In [7]:
test_df = kick_df2.sample(frac=0.200077, random_state=30)
final_test_x = test_df.iloc[:, :-1]
final_test_y = test_df.iloc[:, -1]

In [8]:
train_df = kick_df2.drop(test_df.index)
y = train_df.iloc[:, -1]
data = train_df.iloc[:, :-1]

### use stage predict for the gradient boost

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data,y)

logistic_model = LogisticRegression()
gradient_boost_model = GradientBoostingClassifier(learning_rate=0.005, max_depth=6, max_features='log2', min_samples_leaf=4, n_estimators=500, subsample=0.25)
random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=3, verbose=1)

logistic_model.fit(X_train, y_train)
gradient_boost_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

p_random_forest = random_forest_model.predict_proba(X_test)
p_gradient_boost =  gradient_boost_model.predict_proba(X_test)
p_logistic = logistic_model.predict_proba(X_test)

ensemble_p = (p_random_forest[:,1] + p_gradient_boost[:,1] + p_logistic[:,1])/3

random_forest_ll = log_loss(y_test, p_random_forest )
gradient_boost_ll = log_loss(y_test, p_gradient_boost )
logistic_ll = log_loss(y_test, p_logistic )
ensemble_ll = log_loss(y_test, ensemble_p )

# fig, axs = plot_partial_dependence(gradient_boost_model, X = X_train, features = [0,1,2,3,(1,4)],
#                                        feature_names=list(X_train.columns),
#                                        n_jobs=1, grid_resolution=100, figsize = (20, 20))
# plt.show()                                       


print("Ensemble Log Loss " + str(ensemble_ll))
print("Gradient Boost Log Loss " + str(gradient_boost_ll))
print("Random Forest Log Loss " + str(random_forest_ll))
print("Logistic Log Loss " + str(logistic_ll))

Ensemble Log Loss 0.34886391130202193
Gradient Boost Log Loss 0.35082180764459214
Random Forest Log Loss 0.35575320244857045
Logistic Log Loss 0.345752704359777


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.0s finished


** by log loss, logistic regression is the best **

### using the logistic regression model to predict kick outcomes

predicting kicker probabilities by inputting kicker name, stadium, month

In [11]:
from kick_cleaner import kicker_dataframe

In [12]:
#  stadium_dict
kickers_dict

{'J.Tucker': 1,
 'S.Gostkowski': 2,
 'R.Bironas': 3,
 'M.Bryant': 4,
 'G.Hartley': 5,
 'N.Folk': 6,
 'R.Lindell': 7,
 'S.Hauschka': 8,
 'R.Gould': 9,
 'C.Sturgis': 10,
 'B.Cundiff': 11,
 'B.Walsh': 12,
 'S.Janikowski': 13,
 'P.Dawson': 14,
 'G.Zuerlein': 15,
 'J.Feely': 16,
 'D.Bailey': 17,
 'J.Brown': 18,
 'A.Henery': 19,
 'K.Forbath': 20,
 'R.Bullock': 21,
 'N.Novak': 22,
 'R.Succop': 23,
 'A.Vinatieri': 24,
 'M.Crosby': 25,
 'D.Carpenter': 26,
 'G.Gano': 27,
 'J.Scobee': 28,
 'M.Prater': 29,
 'S.Suisham': 30,
 'M.Nugent': 31,
 'S.Graham': 32,
 'C.Parkey': 33,
 'C.Santos': 34,
 'B.McManus': 35,
 'C.Catanzaro': 36,
 'P.Murray': 37,
 'C.Barth': 38,
 'T.Coons': 39,
 'A.Franks': 40,
 'J.Myers': 41,
 'J.Lambo': 42,
 'D.Hopkins': 43,
 'C.Boswell': 44,
 'R.Aguayo': 45,
 'W.Lutz': 46,
 'Z.Gonzalez': 47,
 'A.Rosas': 48,
 'K.Fairbairn': 49,
 'J.Elliott': 50,
 'H.Butker': 51}

In [13]:
hausch_kicks = kicker_dataframe(kickers_dict, stadium_dict, 'S.Hauschka', 'SEA', 10)
bailey_kicks = kicker_dataframe(kickers_dict, stadium_dict, 'D.Bailey', 'NYG', 10)
santos_kicks = kicker_dataframe(kickers_dict, stadium_dict, 'C.Santos', 'KC', 10)
aguayo_kicks = kicker_dataframe(kickers_dict, stadium_dict, 'R.Aguayo', 'TB', 10)

In [14]:
#essentially makes this but from 25 - 62 yards:

possible_kicks = pd.DataFrame({'FieldGoalDistance': [25, 30, 35, 40, 45, 50, 55, 60],
                              'HomeTeam': [15, 15, 15, 15, 15, 15, 15, 15],
                              'Month': [10, 10, 10, 10, 10, 10, 10, 10],
                              'KickerName': [45, 45, 45, 45, 45, 45, 45, 45]})

In [15]:
#over 90% from 25-39 yards
#over 80% up to 47 yards
#over 70% up to 51 yards
#logistic_model.predict_proba(hausch_kicks)
gradient_boost_model.predict_proba(hausch_kicks)

array([[0.04327739, 0.95672261],
       [0.04507867, 0.95492133],
       [0.05460161, 0.94539839],
       [0.05657557, 0.94342443],
       [0.05532101, 0.94467899],
       [0.0524514 , 0.9475486 ],
       [0.0526989 , 0.9473011 ],
       [0.05734106, 0.94265894],
       [0.0601449 , 0.9398551 ],
       [0.07510225, 0.92489775],
       [0.08327564, 0.91672436],
       [0.09473039, 0.90526961],
       [0.1191399 , 0.8808601 ],
       [0.11208806, 0.88791194],
       [0.1018241 , 0.8981759 ],
       [0.10573096, 0.89426904],
       [0.09698304, 0.90301696],
       [0.10210972, 0.89789028],
       [0.12811124, 0.87188876],
       [0.15526863, 0.84473137],
       [0.16352468, 0.83647532],
       [0.23546601, 0.76453399],
       [0.2181852 , 0.7818148 ],
       [0.22069823, 0.77930177],
       [0.21945052, 0.78054948],
       [0.25055035, 0.74944965],
       [0.25948748, 0.74051252],
       [0.33475134, 0.66524866],
       [0.33667381, 0.66332619],
       [0.39438286, 0.60561714],
       [0.

In [16]:
logistic_model.predict_proba(bailey_kicks)

array([[0.03366066, 0.96633934],
       [0.03709501, 0.96290499],
       [0.04086494, 0.95913506],
       [0.0450001 , 0.9549999 ],
       [0.04953209, 0.95046791],
       [0.05449445, 0.94550555],
       [0.05992263, 0.94007737],
       [0.06585384, 0.93414616],
       [0.07232697, 0.92767303],
       [0.0793823 , 0.9206177 ],
       [0.08706127, 0.91293873],
       [0.09540609, 0.90459391],
       [0.10445923, 0.89554077],
       [0.11426292, 0.88573708],
       [0.12485843, 0.87514157],
       [0.13628527, 0.86371473],
       [0.14858032, 0.85141968],
       [0.16177683, 0.83822317],
       [0.17590326, 0.82409674],
       [0.19098217, 0.80901783],
       [0.20702896, 0.79297104],
       [0.22405064, 0.77594936],
       [0.24204464, 0.75795536],
       [0.26099769, 0.73900231],
       [0.28088487, 0.71911513],
       [0.3016688 , 0.6983312 ],
       [0.32329923, 0.67670077],
       [0.3457128 , 0.6542872 ],
       [0.36883331, 0.63116669],
       [0.39257233, 0.60742767],
       [0.

In [17]:
#over 90% up to 39 yards
#over 80% up to 47 yards
#over 70% up to 51 yards
gradient_boost_model.predict_proba(bailey_kicks)

array([[0.03786365, 0.96213635],
       [0.03862579, 0.96137421],
       [0.04479773, 0.95520227],
       [0.04425974, 0.95574026],
       [0.04344256, 0.95655744],
       [0.04200136, 0.95799864],
       [0.04213916, 0.95786084],
       [0.04871018, 0.95128982],
       [0.04762818, 0.95237182],
       [0.06232319, 0.93767681],
       [0.08151108, 0.91848892],
       [0.08766557, 0.91233443],
       [0.116252  , 0.883748  ],
       [0.12468131, 0.87531869],
       [0.11571477, 0.88428523],
       [0.12692958, 0.87307042],
       [0.1109281 , 0.8890719 ],
       [0.12387214, 0.87612786],
       [0.14077331, 0.85922669],
       [0.15814861, 0.84185139],
       [0.18329837, 0.81670163],
       [0.23325923, 0.76674077],
       [0.22027205, 0.77972795],
       [0.24601172, 0.75398828],
       [0.23158805, 0.76841195],
       [0.29946599, 0.70053401],
       [0.28436499, 0.71563501],
       [0.34445478, 0.65554522],
       [0.33876545, 0.66123455],
       [0.37054635, 0.62945365],
       [0.

In [32]:
# #over 90% up to 38 yards
# #over 80% up to 45 yards
# #over 70% up to 50 yards
# logistic_model.predict_proba(santos_kicks)
# gradient_boost_model.predict_proba(santos_kicks)
# # random_forest_model.predict_proba(santos_kicks)

In [31]:
# #over 90% up to 37 yards
# #over 80% up to 43 yards
# #over 70% up to 49 yards
# logistic_model.predict_proba(aguayo_kicks)

Find a way to compare the kickers more?

In [None]:
gbc2 = GradientBoostingClassifier()

# run more gradient boost models with last 30 percent

is there a python package for scraping NFL data