In [1]:
import pandas as pd
import numpy as np
import pymc as pm

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/cnickol26/BigDataBowl2023/main/cluster_data.csv')

In [4]:
data

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed
0,202109090097,0.0,0,0.578947,0.421053,13.31,3.191316
1,202109090097,1.0,0,0.710526,0.052632,11.74,3.073684
2,202109120163,0.0,0,0.592593,0.111111,8.49,3.167407
3,202109120163,1.0,0,0.555556,0.296296,7.51,2.728889
4,202109120288,0.0,0,0.739130,0.086957,6.77,2.684783
...,...,...,...,...,...,...,...
9040,20211025003684,1.0,0,0.740000,0.400000,14.49,2.835600
9041,20211025003904,0.0,0,1.000000,0.612903,9.05,2.853548
9042,20211025003904,1.0,0,0.677419,0.290323,9.33,2.933226
9043,20211025003926,0.0,0,0.906977,0.162791,6.47,1.487442


In [5]:
data_features = data.iloc[:,-4:]
data_features

Unnamed: 0,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed
0,0.578947,0.421053,13.31,3.191316
1,0.710526,0.052632,11.74,3.073684
2,0.592593,0.111111,8.49,3.167407
3,0.555556,0.296296,7.51,2.728889
4,0.739130,0.086957,6.77,2.684783
...,...,...,...,...
9040,0.740000,0.400000,14.49,2.835600
9041,1.000000,0.612903,9.05,2.853548
9042,0.677419,0.290323,9.33,2.933226
9043,0.906977,0.162791,6.47,1.487442


In [6]:
data_scaled = data
scaler = StandardScaler()
data_scaled[['perc_tackle_lt_1.5',
             'perc_tackl_end_facing',
             'End_dist_travel',
             'Mean_end_speed']] = scaler.fit_transform(data_features)

In [7]:
data_scaled

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed
0,202109090097,0.0,0,-0.237047,0.052963,1.452527,0.556893
1,202109090097,1.0,0,0.453498,-1.514970,0.905550,0.394273
2,202109120163,0.0,0,-0.165435,-1.266091,-0.226726,0.523841
3,202109120163,1.0,0,-0.359810,-0.477977,-0.568151,-0.082390
4,202109120288,0.0,0,0.603617,-1.368889,-0.825962,-0.143364
...,...,...,...,...,...,...,...
9040,20211025003684,1.0,0,0.608180,-0.036633,1.863630,0.065133
9041,20211025003904,0.0,0,1.972697,0.869444,-0.031627,0.089946
9042,20211025003904,1.0,0,0.279748,-0.503400,0.065923,0.200096
9043,20211025003926,0.0,0,1.484498,-1.046153,-0.930480,-1.798628


In [8]:
kmeans = KMeans(
    init="random",
    n_clusters=2,
    n_init=10,
    max_iter=300,
    random_state=1029)

In [9]:
kmeans.fit(data_scaled.iloc[:,-4:])

KMeans(init='random', n_clusters=2, random_state=1029)

In [10]:
data_scaled['Cluster'] = kmeans.labels_
data_scaled

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed,Cluster
0,202109090097,0.0,0,-0.237047,0.052963,1.452527,0.556893,0
1,202109090097,1.0,0,0.453498,-1.514970,0.905550,0.394273,0
2,202109120163,0.0,0,-0.165435,-1.266091,-0.226726,0.523841,0
3,202109120163,1.0,0,-0.359810,-0.477977,-0.568151,-0.082390,1
4,202109120288,0.0,0,0.603617,-1.368889,-0.825962,-0.143364,1
...,...,...,...,...,...,...,...,...
9040,20211025003684,1.0,0,0.608180,-0.036633,1.863630,0.065133,0
9041,20211025003904,0.0,0,1.972697,0.869444,-0.031627,0.089946,1
9042,20211025003904,1.0,0,0.279748,-0.503400,0.065923,0.200096,0
9043,20211025003926,0.0,0,1.484498,-1.046153,-0.930480,-1.798628,1


In [11]:
data_scaled['Cluster'].value_counts()

0    4727
1    4318
Name: Cluster, dtype: int64

In [12]:
data_scaled[data_scaled['uniqueplayId'] == 20211017063573]

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed,Cluster
7727,20211017063573,0.0,0,-0.505592,1.098252,0.999616,0.796235,0
7728,20211017063573,1.0,0,0.223317,-1.266091,2.393187,2.32653,0


In [13]:
test_plays = ['20210926112818','2021092602494','20211010112817','20211017071173','2021101010874','20211017024110','20211003081723','2021091901500','20211021002175','20210926051887']

In [16]:
results = data_scaled[data_scaled['uniqueplayId'].astype(str).isin(test_plays)]

In [19]:
actual = [1,0,1,0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,1,1]
results['actual'] = actual

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['actual'] = actual


In [20]:
results

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,perc_tackle_lt_1.5,perc_tackl_end_facing,End_dist_travel,Mean_end_speed,Cluster,actual
474,2021091901500,0.0,0,1.972697,1.098252,-1.564555,-1.247872,1,1
475,2021091901500,1.0,0,-1.088719,0.388949,-0.014207,1.387424,0,0
746,2021092602494,0.0,0,0.837964,1.02157,0.132118,-0.433939,1,1
747,2021092602494,1.0,0,0.837964,-1.623938,0.940389,0.44672,0,0
1483,2021101010874,0.0,0,0.223317,-0.320354,0.295863,-0.152664,0,1
1484,2021101010874,1.0,0,0.806443,-1.73896,1.358461,1.002066,0,0
4484,20210926051887,0.0,0,-1.844133,-0.578282,-0.885189,0.169873,0,1
4485,20210926051887,1.0,0,-0.651373,-0.19139,-1.177839,-0.267484,1,1
4953,20210926112818,0.0,0,-0.013086,-1.73896,0.933421,0.612987,0,0
4954,20210926112818,1.0,0,-0.722294,0.561482,0.424768,0.007324,0,1


In [21]:
plays = pd.read_csv('https://media.githubusercontent.com/media/cnickol26/BigDataBowl2023/main/nfl-big-data-bowl-2023/plays.csv')

In [22]:
# create unique play ID for each play
plays['uniqueplayId'] = plays['gameId'].astype(str) + plays['playId'].astype(str)

In [23]:
plays[plays['uniqueplayId'].isin(test_plays)]

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType,uniqueplayId
1329,2021091901,500,(6:30) (Shotgun) J.Burrow pass incomplete shor...,1,3,5,CIN,CHI,CIN,25,...,35.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",TRADITIONAL,0,Cover-6,Zone,2021091901500
2454,2021092602,494,(8:37) (Shotgun) T.Colon reported in as eligib...,1,1,10,BAL,DET,DET,49,...,61.0,PISTOL,"6 OL, 2 RB, 0 TE, 2 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,1,Quarters,Zone,2021092602494
2689,2021092605,1887,(1:44) (Shotgun) M.Jones pass short middle to ...,2,1,10,NE,NO,NE,25,...,35.0,EMPTY,"1 RB, 1 TE, 3 WR",5.0,"3 DL, 2 LB, 6 DB",TRADITIONAL,0,Cover-1,Man,20210926051887
3156,2021092611,2818,(7:23) (Shotgun) T.Brady pass short middle to ...,3,1,10,TB,LA,TB,25,...,85.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 4 LB, 5 DB",TRADITIONAL,0,Cover-6,Zone,20210926112818
4001,2021100308,1723,(5:05) (Shotgun) J.Hurts pass short left to M....,2,1,10,PHI,KC,KC,12,...,22.0,SHOTGUN,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Red Zone,Other,20211003081723
5266,2021101010,874,(2:50) (Shotgun) J.Fields pass short middle to...,1,1,10,CHI,LV,CHI,35,...,45.0,SHOTGUN,"1 RB, 3 TE, 1 WR",5.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone,2021101010874
5356,2021101011,2817,"(:34) (No Huddle, Shotgun) K.Murray pass short...",3,2,6,ARI,SF,ARI,40,...,70.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Quarters,Zone,20211010112817
5877,2021101702,4110,"(6:32) (No Huddle, Shotgun) S.Darnold pass sho...",4,1,10,CAR,MIN,MIN,30,...,40.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone,20211017024110
6196,2021101707,1173,(12:12) (Shotgun) T.Heinicke pass short middle...,2,1,10,WAS,KC,WAS,48,...,58.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,20211017071173
6638,2021102100,2175,(9:09) (Shotgun) T.Bridgewater pass short left...,3,2,8,DEN,CLE,CLE,17,...,93.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone,20211021002175


Actual rush type from watching the film (In order of test_plays) 
1) Speed Rush (Inside)
2) Power
3) Speed
4) Power
5) Power
6) Power (Inside)
7) Power
8) Speed
9) Power
10) Power (Cut Block)

In [24]:
## Speed is 0 and power is 1
## Correctly got 8/10 

In [25]:
plays[plays['uniqueplayId'] == '20211017063573']

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType,uniqueplayId
6154,2021101706,3573,"(8:40) (No Huddle, Shotgun) D.Jones pass incom...",4,1,10,NYG,LA,LA,42,...,68.0,SHOTGUN,"1 RB, 2 TE, 2 WR",5.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone,20211017063573
