In [921]:
import pandas as pd
import numpy as np
import pymc as pm
from sklearn.cluster import KMeans
import random
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

# Data Processing for Clustering

In [922]:
flipped = pd.read_csv("https://github.com/cnickol26/BigDataBowl2023/blob/main/data_flipped.csv?raw=true")
flipped

  flipped = pd.read_csv("https://github.com/cnickol26/BigDataBowl2023/blob/main/data_flipped.csv?raw=true")


Unnamed: 0,uniqueplayId,football_x,football_y,gameId,playId,nflId,frameId,time,jerseyNumber,team,...,dropBackType,pff_playAction,ball_snap_frame,new_x,new_y,chip_side,end_frame,Left_end,Right_end,Flip
0,202109090097,41.56,23.92,2021090900,97,53441.0,6,2021-09-10 00:26:31,11.0,DAL,...,TRADITIONAL,0,6.0,2.12,1.99,none,40.0,53441.0,42403.0,1
1,202109090097,41.56,23.92,2021090900,97,53441.0,7,2021-09-10 00:26:31,11.0,DAL,...,TRADITIONAL,0,6.0,2.06,1.96,none,40.0,53441.0,42403.0,1
2,202109090097,41.56,23.92,2021090900,97,53441.0,8,2021-09-10 00:26:31,11.0,DAL,...,TRADITIONAL,0,6.0,1.96,1.91,none,40.0,53441.0,42403.0,1
3,202109090097,41.56,23.92,2021090900,97,53441.0,9,2021-09-10 00:26:31,11.0,DAL,...,TRADITIONAL,0,6.0,1.80,1.85,none,40.0,53441.0,42403.0,1
4,202109090097,41.56,23.92,2021090900,97,53441.0,10,2021-09-10 00:26:32,11.0,DAL,...,TRADITIONAL,0,6.0,1.56,1.72,none,40.0,53441.0,42403.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373830,20211025003926,85.95,23.68,2021102500,3926,,45,2021-10-26 03:14:08,,football,...,TRADITIONAL,0,7.0,-7.18,-3.67,none,44.0,37097.0,46083.0,0
1373831,20211025003926,85.95,23.68,2021102500,3926,,46,2021-10-26 03:14:08,,football,...,TRADITIONAL,0,7.0,-7.22,-3.89,none,44.0,37097.0,46083.0,0
1373832,20211025003926,85.95,23.68,2021102500,3926,,47,2021-10-26 03:14:09,,football,...,TRADITIONAL,0,7.0,-7.27,-4.08,none,44.0,37097.0,46083.0,0
1373833,20211025003926,85.95,23.68,2021102500,3926,,48,2021-10-26 03:14:09,,football,...,TRADITIONAL,0,7.0,-7.31,-4.26,none,44.0,37097.0,46083.0,0


In [1059]:
flipped['event'].unique()

array(['ball_snap', 'None', 'autoevent_passforward', 'pass_forward',
       'play_action', 'pass_arrived', 'fumble',
       'fumble_offense_recovered', 'autoevent_ballsnap',
       'pass_outcome_caught', 'pass_outcome_incomplete', 'pass_tipped',
       'qb_sack', 'man_in_motion', 'line_set',
       'autoevent_passinterrupted', 'qb_strip_sack',
       'huddle_break_offense', 'first_contact', 'shift', 'handoff', 'run',
       'dropped_pass'], dtype=object)

In [1063]:
def play_end(df):
    new_df = df.sort_values('frameId')
    if 'pass_forward' in new_df['event']:
        return new_df[new_df['frameId'] <= new_df.loc[new_df['event'] == 'pass_forward', 'frameId']]
    elif 'run' in new_df['event']: 
        return new_df[new_df['frameId'] <= new_df.loc[new_df['event'] == 'run', 'frameId']]
    elif 'qb_sack' in new_df['event']: 
        return new_df[new_df['frameId'] <= new_df.loc[new_df['event'] == 'qb_sack', 'frameId']]
    elif 'qb_strip_sack' in new_df['event']: 
        return new_df[new_df['frameId'] <= new_df.loc[new_df['event'] == 'qb_strip_sack', 'frameId']]

flipped.groupby(['uniqueplayId', 'Flip']).apply(play_end)

In [923]:
frames_per_play = flipped[flipped['pff_positionLinedUp']=='QB'].groupby('uniqueplayId', as_index=False).size().rename(columns={'size':'play_length_frames'})

In [924]:
flipped = flipped.merge(frames_per_play, on='uniqueplayId', how='left')

In [925]:
flipped['play_length_frames'].min()

16.0

In [926]:
flipped.columns

Index(['uniqueplayId', 'football_x', 'football_y', 'gameId', 'playId', 'nflId',
       'frameId', 'time', 'jerseyNumber', 'team', 'playDirection', 'x', 'y',
       's', 'a', 'dis', 'o', 'dir', 'event', 'pff_role', 'pff_positionLinedUp',
       'pff_nflIdBlockedPlayer', 'pff_blockType', 'dropBackType',
       'pff_playAction', 'ball_snap_frame', 'new_x', 'new_y', 'chip_side',
       'end_frame', 'Left_end', 'Right_end', 'Flip', 'play_length_frames'],
      dtype='object')

In [927]:
def name_func(pos):
    temp = {'nflId':pos+'_nflId',
     'jerseyNumber':pos+'_jerseyNumber',
     'team':pos+'_team', 
     'x':pos+'_x', 
     'y':pos+'_y',
     's':pos+'_s', 
     'a':pos+'_a', 
     'dis':pos+'_dis', 
     'o':pos+'_o', 
     'dir':pos+'_dir', 
     'pff_role':pos+'_pff_role', 
     'pff_positionLinedUp':pos+'_pff_positionLinedUp',
     'pff_nflIdBlockedPlayer':pos+'_pff_nflIdBlockedPlayer', 
     'pff_blockType':pos+'_pff_blockType', 
     'new_x':pos+'_new_x', 
     'new_y':pos+'_new_y', 
     'Flip':pos+'_Flip'}
    return temp

In [928]:
qb = flipped[(flipped['pff_positionLinedUp']=='QB')]
qb = qb.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('QB')).reset_index(drop=True)

In [929]:
qb_flip = flipped[(flipped['pff_positionLinedUp']=='QB_flip')]
qb_flip = qb_flip.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('QB')).reset_index(drop=True)

In [930]:
ball = flipped[(flipped['pff_positionLinedUp']=='ball')]
ball = ball.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('ball')).reset_index(drop=True)

In [931]:
ball_flip = flipped[(flipped['pff_positionLinedUp']=='ball_flip')]
ball_flip = ball_flip.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('ball')).reset_index(drop=True)


In [932]:
rt = flipped[(flipped['pff_positionLinedUp']=='RT')]
rt = rt.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('Tackle')).reset_index(drop=True)

In [933]:
lt = flipped[(flipped['pff_positionLinedUp']=='LT')]
lt = lt.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('Tackle')).reset_index(drop=True)

In [934]:
le = flipped[((flipped['pff_role']=='Pass Rush')|(flipped['pff_role']=='Coverage'))&
             (flipped['Flip']==1)]
le = le.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('End')).reset_index(drop=True)

In [935]:
re = flipped[((flipped['pff_role']=='Pass Rush')|(flipped['pff_role']=='Coverage'))&
             (flipped['Flip']==0)]
re = re.drop(['football_x', 'football_y', 'gameId', 'playId',
       'time', 'playDirection', 'event', 'ball_snap_frame', 'end_frame', 
       'Right_end', 'Left_end', 'dropBackType', 'pff_playAction', 'play_length_frames',
       'chip_side'], axis=1).rename(columns=name_func('End')).reset_index(drop=True)

In [936]:
play = flipped[(flipped['pff_positionLinedUp']=='QB')][['uniqueplayId', 'football_x', 'football_y', 
                                                        'gameId', 'playId', 'frameId', 'time', 'playDirection', 
                                                        'event', 'ball_snap_frame', 'end_frame', 'play_length_frames',
                                                        'Right_end', 'Left_end']].reset_index(drop=True)

In [937]:
final_left = play.merge(qb, on=['uniqueplayId', 'frameId'], how='left').merge(ball, on=['uniqueplayId', 'frameId'], how='left')

In [938]:
final_left = final_left.merge(lt, on=['uniqueplayId', 'frameId'], how='left').merge(re, on=['uniqueplayId', 'frameId'], how='left')

In [939]:
final_right = play.merge(qb_flip, on=['uniqueplayId', 'frameId'], how='left').merge(ball_flip, on=['uniqueplayId', 'frameId'], how='left')

In [940]:
final_right = final_right.merge(rt, on=['uniqueplayId', 'frameId'], how='left').merge(le, on=['uniqueplayId', 'frameId'], how='left')

In [941]:
final = pd.concat([final_left, final_right])

In [942]:
play_ex = final[final['uniqueplayId']==20210919023392]

Before classifying:<br>
> - Remove frames before ball is snapped and after it is thrown<br>
> - Remove tackle and end where the end's pff_role is Coverage<br>
> - First check: Remove plays where end moves into the inside (stunt)<br>
> - Remove plays where the distance between end and tackle is large directly after ball snap (not against each other)<br>
<br>

### Classifying
##### Clustering into two groups<br>
Predictor variables:<br>
> - Percentage that tackle/end are looking at each other <br>
> - Percentage that Euclidian distance is less than 0.5 (0.75)<br>
> - Total distance traveled of defensive end<br>
> - Max speed during rush<br>

### Create vars for clustering

In [943]:
def dist(x_1, y_1, x_2, y_2):
    return np.sqrt(np.sum([(x_1-x_2)**2, (y_1-y_2)**2], axis=0))

In [944]:
final['tackle_end_dist'] = dist(final['Tackle_new_x'], final['Tackle_new_y'], final['End_new_x'], final['End_new_y'])

In [945]:
#angle_range = 20
def check_angle(x, y):
    diff = abs(abs(x-y)-180)
    #if diff <= angle_range:
        #return 1
    #else:
        #return 0
    return diff

In [946]:
final['tackle_end_facing'] = final.apply(lambda row: check_angle(row['Tackle_o'], row['End_o']),  axis=1)

In [947]:
final

Unnamed: 0,uniqueplayId,football_x,football_y,gameId,playId,frameId,time,playDirection,event,ball_snap_frame,...,End_dir,End_pff_role,End_pff_positionLinedUp,End_pff_nflIdBlockedPlayer,End_pff_blockType,End_new_x,End_new_y,End_Flip,tackle_end_dist,tackle_end_facing
0,202109090097,41.56,23.92,2021090900,97,6,2021-09-10 00:26:31,right,ball_snap,6.0,...,247.75,Pass Rush,ROLB,,,2.34,8.71,0.0,6.241354,46.42
1,202109090097,41.56,23.92,2021090900,97,7,2021-09-10 00:26:31,right,,6.0,...,252.79,Pass Rush,ROLB,,,2.27,8.70,0.0,6.217025,46.23
2,202109090097,41.56,23.92,2021090900,97,8,2021-09-10 00:26:31,right,,6.0,...,248.14,Pass Rush,ROLB,,,2.12,8.62,0.0,6.085433,45.79
3,202109090097,41.56,23.92,2021090900,97,9,2021-09-10 00:26:31,right,,6.0,...,246.73,Pass Rush,ROLB,,,1.93,8.54,0.0,5.926635,51.35
4,202109090097,41.56,23.92,2021090900,97,10,2021-09-10 00:26:32,right,,6.0,...,243.55,Pass Rush,ROLB,,,1.55,8.32,0.0,5.577966,57.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166320,20211025003926,85.95,23.68,2021102500,3926,45,2021-10-26 03:14:08,left,,7.0,...,26.59,Pass Rush,LOLB,,,-4.83,3.77,1.0,1.314002,35.51
166321,20211025003926,85.95,23.68,2021102500,3926,46,2021-10-26 03:14:08,left,,7.0,...,23.83,Pass Rush,LOLB,,,-4.91,3.94,1.0,1.338096,46.50
166322,20211025003926,85.95,23.68,2021102500,3926,47,2021-10-26 03:14:09,left,,7.0,...,21.57,Pass Rush,LOLB,,,-4.98,4.11,1.0,1.362424,48.72
166323,20211025003926,85.95,23.68,2021102500,3926,48,2021-10-26 03:14:09,left,,7.0,...,18.94,Pass Rush,LOLB,,,-5.05,4.28,1.0,1.373062,48.08


In [948]:
final['end_too_far'] = (final['frameId']>final['ball_snap_frame']+10) &(final['tackle_end_dist']>4)

In [949]:
def full_agg(grouped_df):
    return pd.DataFrame({
    'end_too_far': [1 if grouped_df['end_too_far'].sum()>=1 else 0],
    'Mean_tackle_end_dist': [grouped_df[grouped_df['frameId']>grouped_df['ball_snap_frame']+10]['tackle_end_dist'].mean()],
    'Mean_tackle_end_facing': [grouped_df['tackle_end_facing'].mean()],
    'End_dist_travel': [grouped_df['End_dis'].sum()],
    'Mean_end_speed': [grouped_df['End_s'].mean()]
})

cluster_data = final.groupby(['uniqueplayId', 'QB_Flip'], group_keys=True).apply(full_agg).reset_index().drop(['level_2'], axis=1)

In [950]:
cluster_data

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed
0,202109090097,0.0,0,1.124898,41.940789,13.31,3.191316
1,202109090097,1.0,0,0.928589,73.850263,11.74,3.073684
2,202109120163,0.0,0,0.881869,31.188148,8.49,3.167407
3,202109120163,1.0,0,0.758866,41.957407,7.51,2.728889
4,202109120288,0.0,0,1.174814,114.386087,6.77,2.684783
...,...,...,...,...,...,...,...
9602,20211025003684,1.0,0,1.156339,65.342400,14.49,2.835600
9603,20211025003904,0.0,0,0.660331,22.898710,9.05,2.853548
9604,20211025003904,1.0,0,0.882441,37.075484,9.33,2.933226
9605,20211025003926,0.0,0,0.823808,37.409535,6.47,1.487442


In [951]:
# cluster_data = final.groupby(['uniqueplayId', 'QB_Flip'], as_index=False).agg({
#     'end_too_far': lambda x: 1 if x.sum()>=1 else 0,
#     'tackle_end_dist': 'mean',
#     'tackle_end_facing': 'mean',
#     'End_dis': 'sum',
#     'End_s':'mean'
# }).rename(columns={
#      'tackle_end_dist':'Mean_tackle_end_dist',
#      'tackle_end_facing':'Mean_tackle_end_facing',
#      'End_dis':'End_dist_travel',
#      'End_s':'Mean_end_speed'
#  })
# cluster_data

In [952]:
power = final[(final['uniqueplayId']==2021091902389)&(final['QB_Flip']==0)]
speed = final[(final['uniqueplayId']==20210919023392)&(final['QB_Flip']==0)]
power_clust = cluster_data[(cluster_data['uniqueplayId']==2021091902389)&(cluster_data['QB_Flip']==0)]
speed_clust = cluster_data[(cluster_data['uniqueplayId']==20210919023392)&(cluster_data['QB_Flip']==0)]

In [953]:
power_clust

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed
517,2021091902389,0.0,0,0.813461,13.812619,9.65,2.212143


In [954]:
speed_clust

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed
3547,20210919023392,0.0,0,0.576881,79.314516,12.76,4.015161


In [955]:
cluster_final = cluster_data[cluster_data['end_too_far']==0]

# K-Means Clustering

In [1020]:
test_plays = [20210926112818,2021092602494,20211010112817,20211017071173,2021101010874,20211017024110, 20211003081723, 2021091901500, 20211021002175, 20210926051887]

cluster_final_test = cluster_final[cluster_final['uniqueplayId'].isin(test_plays)]

cluster_final_train = cluster_final[~cluster_final['uniqueplayId'].isin(test_plays)]

cluster_X = cluster_final_train.drop(
    ['uniqueplayId', 'QB_Flip', 'end_too_far'], axis=1)

scaler = StandardScaler().fit(cluster_X)
cluster_X_scale_train = scaler.transform(cluster_X)
cluster_X_scale_test = scaler.transform(cluster_final_test.drop(
    ['uniqueplayId', 'QB_Flip', 'end_too_far'], axis=1))

In [1021]:
kmeans = KMeans(n_clusters=2, random_state=2023).fit(cluster_X_scale_train)

In [1022]:
cluster_final_test['Cluster'] = kmeans.predict(cluster_X_scale_test)
cluster_final_test['actual'] = [(i-1)*-1 for i in [1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,1,1]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_final_test['Cluster'] = kmeans.predict(cluster_X_scale_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_final_test['actual'] = [(i-1)*-1 for i in [1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,1,1]]


In [1023]:
cluster_final_test

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,Cluster,actual
499,2021091901500,0.0,0,1.126498,19.039167,4.65,1.885833,0,0
500,2021091901500,1.0,0,1.138747,43.272917,9.1,3.792083,1,1
794,2021092602494,0.0,0,0.963851,14.988919,9.52,2.474595,0,0
795,2021092602494,1.0,0,1.290382,80.818378,11.84,3.111622,1,1
1576,2021101010874,0.0,0,0.878565,30.746389,9.99,2.678056,0,0
1577,2021101010874,1.0,0,0.969142,81.063056,13.04,3.513333,1,1
4780,20210926051887,1.0,0,1.041918,22.306818,5.76,2.595,0,0
5286,20210926112818,0.0,0,0.925383,79.243514,11.82,3.231892,1,1
5287,20210926112818,1.0,0,1.345928,49.224865,10.36,2.793784,1,0
6121,20211003081723,0.0,0,1.191883,16.275517,8.66,2.869655,0,1


In [1024]:
# Accuracy
sum(cluster_final_test['actual']==cluster_final_test['Cluster'])/len(cluster_final_test)

0.7894736842105263

In [1025]:
labs = kmeans.labels_
cluster_final_train['rushType'] = labs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_final_train['rushType'] = labs


In [1026]:
cluster_final_train[cluster_final_train['uniqueplayId']==20211017063573]

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType
8208,20211017063573,0.0,0,1.28506,16.827778,12.01,3.364444,1
8209,20211017063573,1.0,0,1.034063,67.598611,16.01,4.471389,1


In [1027]:
kmeans.cluster_centers_

array([[-0.03374411, -0.36384232, -0.50623255, -0.46356002],
       [ 0.06327697,  0.68227731,  0.94928754,  0.86926799]])

# Gaussian Mixture Model

In [1028]:
gm = GaussianMixture(n_components=2, random_state=6030).fit(cluster_X)

In [1029]:
gmm_preds = gm.predict(cluster_final_test.drop(
    ['uniqueplayId', 'QB_Flip', 'end_too_far', 'actual', 'Cluster'], axis=1))
gmm_preds = [(i-1)*-1 for i in gmm_preds]
cluster_final_test['GMM_Pred'] = kmeans.predict(cluster_X_scale_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_final_test['GMM_Pred'] = kmeans.predict(cluster_X_scale_test)


In [1030]:
cluster_final_test

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,Cluster,actual,GMM_Pred
499,2021091901500,0.0,0,1.126498,19.039167,4.65,1.885833,0,0,0
500,2021091901500,1.0,0,1.138747,43.272917,9.1,3.792083,1,1,1
794,2021092602494,0.0,0,0.963851,14.988919,9.52,2.474595,0,0,0
795,2021092602494,1.0,0,1.290382,80.818378,11.84,3.111622,1,1,1
1576,2021101010874,0.0,0,0.878565,30.746389,9.99,2.678056,0,0,0
1577,2021101010874,1.0,0,0.969142,81.063056,13.04,3.513333,1,1,1
4780,20210926051887,1.0,0,1.041918,22.306818,5.76,2.595,0,0,0
5286,20210926112818,0.0,0,0.925383,79.243514,11.82,3.231892,1,1,1
5287,20210926112818,1.0,0,1.345928,49.224865,10.36,2.793784,1,0,1
6121,20211003081723,0.0,0,1.191883,16.275517,8.66,2.869655,0,1,0


In [1031]:
# Accuracy
sum(cluster_final_test['actual']==cluster_final_test['GMM_Pred'])/len(cluster_final_test)

0.7894736842105263

In [1032]:
gm.means_

array([[ 1.33259235, 59.32382883, 10.31641205,  2.89915102],
       [ 0.99264928, 30.14496133,  8.54741777,  2.73132662]])

In [1033]:
gmm_pred_prob = gm.predict_proba(cluster_final_test.drop(
    ['uniqueplayId', 'QB_Flip', 'end_too_far', 'actual', 'Cluster', 'GMM_Pred'], axis=1))
gmm_pred_prob = pd.DataFrame(gmm_pred_prob).round(5)
gmm_pred_prob.columns = ['Speed_prob', 'Power_prob']

In [1034]:
pd.concat([cluster_final_test, gmm_pred_prob.set_index(cluster_final_test.index)], axis=1)

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,Cluster,actual,GMM_Pred,Speed_prob,Power_prob
499,2021091901500,0.0,0,1.126498,19.039167,4.65,1.885833,0,0,0,0.02279,0.97721
500,2021091901500,1.0,0,1.138747,43.272917,9.1,3.792083,1,1,1,0.10163,0.89837
794,2021092602494,0.0,0,0.963851,14.988919,9.52,2.474595,0,0,0,0.01941,0.98059
795,2021092602494,1.0,0,1.290382,80.818378,11.84,3.111622,1,1,1,0.99632,0.00368
1576,2021101010874,0.0,0,0.878565,30.746389,9.99,2.678056,0,0,0,0.01902,0.98098
1577,2021101010874,1.0,0,0.969142,81.063056,13.04,3.513333,1,1,1,0.98789,0.01211
4780,20210926051887,1.0,0,1.041918,22.306818,5.76,2.595,0,0,0,0.01714,0.98286
5286,20210926112818,0.0,0,0.925383,79.243514,11.82,3.231892,1,1,1,0.97033,0.02967
5287,20210926112818,1.0,0,1.345928,49.224865,10.36,2.793784,1,0,1,0.51647,0.48353
6121,20211003081723,0.0,0,1.191883,16.275517,8.66,2.869655,0,1,0,0.02382,0.97618


In [1035]:
cluster_final_train['GMM_rushType'] = (gm.predict(cluster_X)-1)*-1
gmm_pred_prob_train = gm.predict_proba(cluster_final_train.drop(
    ['uniqueplayId', 'QB_Flip', 'end_too_far', 'rushType', 'GMM_rushType'], axis=1))
gmm_pred_prob_train = pd.DataFrame(gmm_pred_prob_train).round(5)
gmm_pred_prob_train.columns = ['Speed_prob', 'Power_prob']
cluster_final_train = pd.concat([cluster_final_train, gmm_pred_prob_train.set_index(cluster_final_train.index)], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_final_train['GMM_rushType'] = (gm.predict(cluster_X)-1)*-1


In [1036]:
cluster_final_train

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType,GMM_rushType,Speed_prob,Power_prob
0,202109090097,0.0,0,1.124898,41.940789,13.31,3.191316,1,0,0.47445,0.52555
1,202109090097,1.0,0,0.928589,73.850263,11.74,3.073684,1,1,0.91533,0.08467
2,202109120163,0.0,0,0.881869,31.188148,8.49,3.167407,0,0,0.01299,0.98701
3,202109120163,1.0,0,0.758866,41.957407,7.51,2.728889,0,0,0.03215,0.96785
4,202109120288,0.0,0,1.174814,114.386087,6.77,2.684783,1,1,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
9602,20211025003684,1.0,0,1.156339,65.342400,14.49,2.835600,1,1,0.99702,0.00298
9603,20211025003904,0.0,0,0.660331,22.898710,9.05,2.853548,0,0,0.01014,0.98986
9604,20211025003904,1.0,0,0.882441,37.075484,9.33,2.933226,0,0,0.01781,0.98219
9605,20211025003926,0.0,0,0.823808,37.409535,6.47,1.487442,0,0,0.02779,0.97221


In [1037]:
sum(cluster_final_train['GMM_rushType']!=cluster_final_train['rushType'])

2462

In [1038]:
cluster_filt = cluster_final_train[(cluster_final_train['Power_prob']>.55)|(cluster_final_train['Power_prob']<.45)]

In [1039]:
cluster_filt

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType,GMM_rushType,Speed_prob,Power_prob
1,202109090097,1.0,0,0.928589,73.850263,11.74,3.073684,1,1,0.91533,0.08467
2,202109120163,0.0,0,0.881869,31.188148,8.49,3.167407,0,0,0.01299,0.98701
3,202109120163,1.0,0,0.758866,41.957407,7.51,2.728889,0,0,0.03215,0.96785
4,202109120288,0.0,0,1.174814,114.386087,6.77,2.684783,1,1,1.00000,0.00000
5,202109120288,1.0,0,1.136028,35.697826,3.28,1.351739,0,0,0.10940,0.89060
...,...,...,...,...,...,...,...,...,...,...,...
9602,20211025003684,1.0,0,1.156339,65.342400,14.49,2.835600,1,1,0.99702,0.00298
9603,20211025003904,0.0,0,0.660331,22.898710,9.05,2.853548,0,0,0.01014,0.98986
9604,20211025003904,1.0,0,0.882441,37.075484,9.33,2.933226,0,0,0.01781,0.98219
9605,20211025003926,0.0,0,0.823808,37.409535,6.47,1.487442,0,0,0.02779,0.97221


In [1040]:
cluster_filt = cluster_filt.drop('rushType', axis=1).rename(columns={'GMM_rushType':'rushType'})

In [1041]:
cluster_filt['rushType'] = cluster_filt['rushType'].map({0:'Power', 1:'Speed'})

In [1042]:
cluster_filt

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType,Speed_prob,Power_prob
1,202109090097,1.0,0,0.928589,73.850263,11.74,3.073684,Speed,0.91533,0.08467
2,202109120163,0.0,0,0.881869,31.188148,8.49,3.167407,Power,0.01299,0.98701
3,202109120163,1.0,0,0.758866,41.957407,7.51,2.728889,Power,0.03215,0.96785
4,202109120288,0.0,0,1.174814,114.386087,6.77,2.684783,Speed,1.00000,0.00000
5,202109120288,1.0,0,1.136028,35.697826,3.28,1.351739,Power,0.10940,0.89060
...,...,...,...,...,...,...,...,...,...,...
9602,20211025003684,1.0,0,1.156339,65.342400,14.49,2.835600,Speed,0.99702,0.00298
9603,20211025003904,0.0,0,0.660331,22.898710,9.05,2.853548,Power,0.01014,0.98986
9604,20211025003904,1.0,0,0.882441,37.075484,9.33,2.933226,Power,0.01781,0.98219
9605,20211025003926,0.0,0,0.823808,37.409535,6.47,1.487442,Power,0.02779,0.97221


In [1043]:
cluster_final_test = cluster_final_test.drop(['Cluster', 'GMM_Pred'], axis=1).rename(columns={'actual':'rushType'})
cluster_final_test['rushType'] = cluster_final_test['rushType'].map({0:'Power', 1:'Speed'})

In [1044]:
cluster_final_test

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType
499,2021091901500,0.0,0,1.126498,19.039167,4.65,1.885833,Power
500,2021091901500,1.0,0,1.138747,43.272917,9.1,3.792083,Speed
794,2021092602494,0.0,0,0.963851,14.988919,9.52,2.474595,Power
795,2021092602494,1.0,0,1.290382,80.818378,11.84,3.111622,Speed
1576,2021101010874,0.0,0,0.878565,30.746389,9.99,2.678056,Power
1577,2021101010874,1.0,0,0.969142,81.063056,13.04,3.513333,Speed
4780,20210926051887,1.0,0,1.041918,22.306818,5.76,2.595,Power
5286,20210926112818,0.0,0,0.925383,79.243514,11.82,3.231892,Speed
5287,20210926112818,1.0,0,1.345928,49.224865,10.36,2.793784,Power
6121,20211003081723,0.0,0,1.191883,16.275517,8.66,2.869655,Speed


In [1045]:
cluster_filt = pd.concat([cluster_filt, cluster_final_test], axis=0)

In [1046]:
cluster_filt

Unnamed: 0,uniqueplayId,QB_Flip,end_too_far,Mean_tackle_end_dist,Mean_tackle_end_facing,End_dist_travel,Mean_end_speed,rushType,Speed_prob,Power_prob
1,202109090097,1.0,0,0.928589,73.850263,11.74,3.073684,Speed,0.91533,0.08467
2,202109120163,0.0,0,0.881869,31.188148,8.49,3.167407,Power,0.01299,0.98701
3,202109120163,1.0,0,0.758866,41.957407,7.51,2.728889,Power,0.03215,0.96785
4,202109120288,0.0,0,1.174814,114.386087,6.77,2.684783,Speed,1.00000,0.00000
5,202109120288,1.0,0,1.136028,35.697826,3.28,1.351739,Power,0.10940,0.89060
...,...,...,...,...,...,...,...,...,...,...
7944,20211017024110,1.0,0,0.653314,26.291212,9.32,2.681818,Power,,
8238,20211017071173,0.0,0,2.032436,23.281200,6.30,2.558400,Speed,,
8239,20211017071173,1.0,0,1.311321,24.206400,3.93,1.528400,Power,,
8697,20211021002175,0.0,0,1.260126,22.765217,6.12,2.606957,Power,,


In [1047]:
cluster_filt = cluster_filt[['uniqueplayId', 'QB_Flip', 'rushType', 'Power_prob', 'Speed_prob']].merge(
    final.drop('end_too_far', axis=1), how='left', on=['uniqueplayId', 'QB_Flip']
)

In [1048]:
# cluster_filt.to_csv('clustered.csv', index = False)