In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Initialize the logistic regression model with max iterations increased to ensure convergence.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Generate the confusion matrix to evaluate the performance of the model.
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
# Perform 5-fold cross-validation to assess the stability and performance of the model.
from sklearn.model_selection import cross_val_score
# Recall is chosen as the evaluation metric because we want to minimize false negatives. In this case, a false negative would be anticipating a running play when it's actually a pass.
from sklearn.metrics import recall_score

In [74]:
# I wanted to be able to see all columns

pd.set_option('display.max_columns', None)

In [75]:
# Read in data to a Pandas DataFrame

plays = pd.read_csv('/content/drive/MyDrive/DataBowl/plays.csv')
plays.head()

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,playNullifiedByPenalty,absoluteYardlineNumber,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,expectedPoints,offenseFormation,receiverAlignment,playClockAtSnap,passResult,passLength,targetX,targetY,playAction,dropbackType,dropbackDistance,passLocationType,timeToThrow,timeInTackleBox,timeToSack,passTippedAtLine,unblockedPressure,qbSpike,qbKneel,qbSneak,rushLocationType,penaltyYards,prePenaltyYardsGained,yardsGained,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPointsAdded,isDropback,pff_runConceptPrimary,pff_runConceptSecondary,pff_runPassOption,pff_passCoverage,pff_manZone
0,2022102302,2655,(1:54) (Shotgun) J.Burrow pass short middle to...,3,1,10,CIN,ATL,CIN,21,01:54,35,17,N,31,0.982017,0.017983,0.719313,EMPTY,3x2,10.0,C,6.0,36.69,16.51,False,TRADITIONAL,2.4,INSIDE_BOX,2.99,2.99,,False,False,False,0,,,,9,9,0.004634,-0.004634,0.702717,True,,,0,Cover-3,Zone
1,2022091809,3698,(2:13) (Shotgun) J.Burrow pass short right to ...,4,1,10,CIN,DAL,CIN,8,02:13,17,17,N,18,0.424356,0.575644,0.607746,EMPTY,3x2,9.0,C,4.0,20.83,20.49,False,TRADITIONAL,1.14,INSIDE_BOX,1.836,1.836,,False,False,False,0,,,,4,4,0.002847,-0.002847,-0.240509,True,,,0,Quarters,Zone
2,2022103004,3146,(2:00) (Shotgun) D.Mills pass short right to D...,4,3,12,HOU,TEN,HOU,20,02:00,3,17,N,30,0.006291,0.993709,-0.291485,SHOTGUN,2x2,12.0,C,-4.0,26.02,17.56,False,TRADITIONAL,3.2,INSIDE_BOX,2.236,2.236,,False,False,False,0,,,,6,6,0.000205,-0.000205,-0.21848,True,,,0,Quarters,Zone
3,2022110610,348,(9:28) (Shotgun) P.Mahomes pass short left to ...,1,2,10,KC,TEN,TEN,23,09:28,0,0,N,33,0.884223,0.115777,4.249382,SHOTGUN,2x2,11.0,C,-6.0,38.95,14.19,False,TRADITIONAL,3.02,INSIDE_BOX,2.202,2.202,,False,False,False,0,,,,4,4,-0.001308,0.001308,-0.427749,True,,,0,Quarters,Zone
4,2022102700,2799,(2:16) (Shotgun) L.Jackson up the middle to TB...,3,2,8,BAL,TB,TB,27,02:16,10,10,N,37,0.410371,0.589629,3.928413,PISTOL,3x1,8.0,,,,,True,DESIGNED_RUN,2.03,,,,,,,,0,False,INSIDE_LEFT,,-1,-1,0.027141,-0.027141,-0.638912,False,MAN,READ OPTION,0,Cover-1,Man


In [76]:
# Clean our data, transform useful features into usable formats

plays['isNearOwnSide'] = plays.apply(lambda col: 1 if col['possessionTeam'] == col['yardlineSide'] else 0, axis=1)
plays['time_left_in_game'] = plays.apply(lambda row: (4 - row['quarter']) * 900 + int(row['gameClock'].split(':')[0]) * 60 + int(row['gameClock'].split(':')[1]), axis=1)
plays['isPass'] = plays['passResult'].apply(lambda x: 1 if pd.notna(x) else 0)

In [77]:
# Drop irrelevant features, one hot encode catagorical ones

plays.drop(['gameId', 'playId', 'playDescription', 'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'passResult', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'playNullifiedByPenalty', 'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability', 'expectedPoints', 'passResult', 'passLength', 'targetX', 'targetY','playAction', 'dropbackType','dropbackDistance', 'passLocationType', 'timeToThrow', 'timeInTackleBox', 'timeToSack', 'passTippedAtLine', 'unblockedPressure', 'qbSpike', 'qbKneel', 'qbSneak', 'rushLocationType', 'penaltyYards', 'prePenaltyYardsGained', 'yardsGained', 'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded', 'expectedPointsAdded', 'isDropback', 'pff_runConceptPrimary', 'pff_runConceptSecondary', 'pff_runPassOption'], axis=1, inplace=True)
plays = pd.get_dummies(plays, columns=['offenseFormation','receiverAlignment', 'pff_passCoverage', 'pff_manZone'])
plays.replace({True: 1, False: 0}, inplace=True)

  plays.replace({True: 1, False: 0}, inplace=True)


In [78]:
# Compute correlation matrix
corr_matrix = plays.corr().abs()

# Select the upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with a correlation greater than 0.8
high_corr_features = [column for column in upper.columns if any(upper[column] > 0.8)]

# Drop highly correlated features
plays.drop(columns=high_corr_features, inplace=True)

print(f"Features removed due to high correlation: {high_corr_features}")

Features removed due to high correlation: ['time_left_in_game', 'receiverAlignment_3x2', 'pff_manZone_Man', 'pff_manZone_Other', 'pff_manZone_Zone']


In [79]:
# Drop any row with NA values before we train

plays.dropna(inplace=True)

In [84]:
# Seperate our features from what we want to predict

X = plays.drop(columns=['isPass'])
y = plays['isPass']

In [85]:
# Split data into training and testing buckets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32605)

In [86]:
# Standardize features by removing the mean and scaling to unit variance for better model performance.
scaler = StandardScaler()
# Fit and transform the training data using the scaler.
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test data using the previously fitted scaler.
X_test_scaled = scaler.transform(X_test)

# Initialize the logistic regression model to predict binary value of pass (1) or not pass (0).
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Make predictions on the test set.
y_pred = log_reg.predict(X_test_scaled)

# Generate the confusion matrix to evaluate the performance of the model.
conf_matrix = confusion_matrix(y_test, y_pred)
# Display the confusion matrix to see the counts of true positives, true negatives, false positives, and false negatives.
print('Confusion Matrix:')
print(conf_matrix)

# Calculate recall score
recall = recall_score(y_test, y_pred)

# Display the recall score
print(f'Recall: {recall:.2f}')

Confusion Matrix:
[[ 820  458]
 [ 350 1597]]
Recall: 0.82


In [87]:
# Perform 5-fold cross-validation to assess the stability and performance of the model.
cv_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=5)
print(f'Cross-validation scores: {cv_scores}')
# Output the cross-validation scores and their average to evaluate model performance.
print(f'Average cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.73682171 0.74263566 0.74728682 0.74137263 0.73167895]
Average cross-validation score: 0.7399591512845253
