In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import nflFunctions as f

%matplotlib qt5

In [2]:
# Import data
games_df = pd.read_csv(Path.cwd() / 'data' /'games.csv')
players_df = pd.read_csv(Path.cwd() / 'data' /'players.csv')
plays_df = pd.read_csv(Path.cwd() / 'data' /'plays.csv')
tracking_df = pd.read_csv(Path.cwd() / 'data' /'week1.csv')
coverage_df = pd.read_csv(Path.cwd() / 'data' /'coverages_week1.csv')

# Create coverage mapping dict
mapping_dict = {"Cover 0 Man": 0, "Cover 1 Man": 1,  # Boolean replacements for no/yes
                "Cover 2 Man":2, "Cover 2 Zone":3,
                "Cover 3 Zone":4, "Cover 4 Zone":5,
                "Cover 6 Zone":6, "Prevent Zone":7}  # Boolean replacements for abnormal/normal

# Just Zone or Man
# mapping_dict = {"Cover 0 Man": 0, "Cover 1 Man": 0,  # Boolean replacements for no/yes
#                 "Cover 2 Man":0, "Cover 2 Zone":1,
#                 "Cover 3 Zone":1, "Cover 4 Zone":1,
#                 "Cover 6 Zone":1, "Prevent Zone":1}  #

# Apply the mapping dict to the data
coverage_df.replace(mapping_dict, inplace=True)

# Merge coverage, games and plays df
total_df = pd.merge(pd.merge(coverage_df, games_df, on=['gameId']), plays_df, on=['playId','gameId'])
total_df['scrimmageLine'] = total_df.absoluteYardlineNumber
total_df = total_df[total_df.playType != 'play_type_unknown']

# Important vectors to use
np.random.seed(0)
defList = ['CB','DB','DE','DT','FS', 'ILB', 'LB','LS','MLB','NT','OLB', 'S', 'SS']
xBin = [0,2,4,6,8,10,12,14,16,18,20,25,30,50,100]
yBin = [0,8,14,20,26,32,38,44,53]
# labels = ['tight', 'close', 'medium', 'far', 'deep', 'extreme_deep']

# Create train/test split
split = np.random.rand(len(total_df)) < 0.7
train_df = total_df[split]
test_df = total_df[~split]

# Get tracking data
tracking_train = pd.merge(tracking_df, train_df[['gameId','playId']])
tracking_test = pd.merge(tracking_df, test_df[['gameId','playId']])

# train data preprocessing
total_train_df = pd.merge(train_df, tracking_train, on=['playId','gameId'])
total_train_df.drop(columns=['playDescription'], inplace=True)
total_train_df = total_train_df[(total_train_df.event == 'ball_snap') & (total_train_df.position.isin(defList))]
total_train_df['absXdiff'] = abs(total_train_df.scrimmageLine - total_train_df.x)
total_train_df['defenderY'] = (pd.cut(total_train_df.y, yBin, labels=False, retbins=True, right=False))[0]
total_train_df = pd.get_dummies(total_train_df, columns=['defenderY']) 
total_train_df['defenderX'] = (pd.cut(total_train_df.absXdiff, xBin, labels=False, retbins=True, right=False))[0]
total_train_df = pd.get_dummies(total_train_df, columns=['defenderX']) 

# test data preprocessing
total_test_df = pd.merge(test_df, tracking_test, on=['playId','gameId'])
total_test_df.drop(columns=['playDescription'], inplace=True)
total_test_df = total_test_df[(total_test_df.event == 'ball_snap') & (total_test_df.position.isin(defList))]
total_test_df['absXdiff'] = abs(total_test_df.scrimmageLine - total_test_df.x)
total_test_df['defenderY'] = (pd.cut(total_test_df.y, yBin, labels=False, retbins=True, right=False))[0]
total_test_df = pd.get_dummies(total_test_df, columns=['defenderY']) 
total_test_df['defenderX'] = (pd.cut(total_test_df.absXdiff, xBin, labels=False, retbins=True, right=False))[0]
total_test_df = pd.get_dummies(total_test_df, columns=['defenderX']) 


# Delete the smaller dfs
df_list = [games_df, plays_df, tracking_df, coverage_df, total_df, tracking_test, split, tracking_train, ]
del games_df, plays_df, tracking_df, coverage_df, total_df, tracking_test, split,tracking_train, 
del df_list

In [3]:
cols = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','x11','x12','x13','y0','y1','y2','y3','y4','y5','y6','y7']

# Get training defender counts
summary_train = total_train_df.groupby(['gameId', 'playId']).sum()
summary_train = summary_train.iloc[:,27:49]
summary_train.columns = cols
train_df = pd.merge(train_df, summary_train, on=['gameId','playId'])

# Get testing defender counts
summary_test = total_test_df.groupby(['gameId', 'playId']).sum()
summary_test = summary_test.iloc[:,27:49]
summary_test['defenderX_13'] = np.zeros(summary_test.shape[0])
summary_test.columns = cols
test_df = pd.merge(test_df, summary_test, on=['gameId','playId'])

In [5]:
# train_df['pointdiff'] = abs(train_df.preSnapHomeScore-train_df.preSnapVisitorScore)
train_df = pd.get_dummies(train_df,columns=['personnelD'])
# test_df['pointdiff'] = abs(test_df.preSnapHomeScore-test_df.preSnapVisitorScore)
test_df = pd.get_dummies(test_df,columns=['personnelD'])

# fill in missing cols for test_df
missing_dummy_columns = train_df.columns[55:].difference(test_df.columns[55:])
for i in range(missing_dummy_columns.shape[0]):
    test_df[missing_dummy_columns[i]] = 0

In [21]:
X_train = train_df.iloc[:,33:]
y_train = train_df.iloc[:,2]
X_test = test_df.iloc[:,33:]
y_test = test_df.iloc[:,2]

In [22]:
X_train.drop(columns=['y0','y1','y2','y3','y4','y5','y6','y7'], inplace=True)
X_test.drop(columns=['y0','y1','y2','y3','y4','y5','y6','y7'], inplace=True)

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.svm import LinearSVC

In [24]:
cart_model = DecisionTreeClassifier(max_depth=10)
train_score, test_score = f.fit_and_score_model(cart_model, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.816
	 testing data is 0.35


In [25]:
rf_model = RandomForestClassifier(n_estimators = 250, max_samples = 0.2)
train_score, test_score = f.fit_and_score_model(rf_model, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.726
	 testing data is 0.465


In [26]:
logregcv = LogisticRegression(max_iter=9000, class_weight=None, solver='liblinear', penalty='l2')
train_score, test_score = f.fit_and_score_model(logregcv, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.52
	 testing data is 0.493


In [27]:
ada = AdaBoostClassifier(n_estimators=250, learning_rate=0.28)
train_score, test_score = f.fit_and_score_model(ada, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.327
	 testing data is 0.315


In [28]:
grad = GradientBoostingClassifier(n_estimators=100, learning_rate=0.30, criterion='mse')
train_score, test_score = f.fit_and_score_model(grad, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.971
	 testing data is 0.374


In [29]:
# mdl = svm.SVC(decision_function_shape='ovo')
mdl = LinearSVC(max_iter=10000, tol=1e-8)
train_score, test_score = f.fit_and_score_model(mdl, X_train, X_test, y_train, y_test)

the accuracy on the: 
	 training data is 0.53
	 testing data is 0.448


In [12]:
# bayes = CategoricalNB()
# train_score, test_score = f.fit_and_score_model(bayes, X_train, X_test, y_train, y_test)

In [14]:
# gid = 2018090901
# pid = 2055
# 
# # Just for test purposes
# play = total_train_df[(total_train_df.gameId == gid) & (total_train_df.playId == pid)]
# fig, ax = f.create_football_field(highlight_line=True,  highlight_line_number=play.scrimmageLine.values[0])                                  
# play.plot(x='x', y='y', kind='scatter', ax=ax, color='blue', s=30, legend='Defense')
# plt.legend()
# plt.show()

No handles with labels found to put in legend.
