In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

 
# Read dataset
df = pd.read_csv("Police_Use_of_Force_onehot.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,X,Y,PoliceUseOfForceID,CaseNumber,ResponseDate,ForceReportNumber,SubjectRoleNumber,ForceType,EventAge,...,Precinct_2,Precinct_3,Precinct_4,Precinct_5,Neighborhood_m_Downtown West,Neighborhood_m_Hawthorne,Neighborhood_m_Jordan,Neighborhood_m_Near - North,Neighborhood_m_Other,Neighborhood_m_Willard - Hay
0,1,-93.286438,45.010118,15631493,08-018973,2008/01/19 22:53:35+00,1,2,Bodily Force,16.0,...,0,0,1,0,0,1,0,0,0,0
1,2,-93.266112,44.974295,15631494,08-019237,2008/01/20 03:47:57+00,1,1,Bodily Force,40.0,...,0,0,0,0,1,0,0,0,0,0
2,3,-93.271963,44.981655,15631495,08-031728,2008/02/02 01:33:16+00,1,1,Chemical Irritant,23.0,...,0,0,0,0,1,0,0,0,0,0
3,4,-93.269013,44.982137,15631496,08-032796,2008/02/03 00:50:46+00,1,1,Bodily Force,23.0,...,0,0,0,0,1,0,0,0,0,0
4,5,-93.24865,44.952851,15631497,08-043457,2008/02/14 19:53:39+00,1,1,Bodily Force,34.0,...,0,1,0,0,0,0,0,0,1,0


In [2]:
train_proportion = 0.8
n = df.shape[0]
print("Size of dataset: ", n)

ntrain = int(train_proportion * n)
print(ntrain)

target = df.filter(items=['ForceType'])

f = [col for col in df if col != "ForceType" and col != "X" and col != "Y" and col != "PoliceUseOfForceID" 
            and col != "CaseNumber" and col != "ResponseDate"  and col != "DateAdded"  and col != "OBJECTID"
            and col != "ForceTypeAction_m_Body Weight to Pin" and col != "ForceTypeAction_m_Crowd Control Mace"
            and col != "ForceTypeAction_m_Firing Darts" and col != "ForceTypeAction_m_Joint Lock"
            and col != "ForceTypeAction_m_Kicks" and col != "ForceTypeAction_m_Knees"
            and col != "ForceTypeAction_m_Other" and col != "ForceTypeAction_m_Personal Mace" 
            and col != "ForceTypeAction_m_Punches" and col != "ForceTypeAction_m_Push Away" 
            and col != "ForceTypeAction_m_Takedown" and col != "Column1"]
data = df[f]

# the following variable records the features of examples in the training set
train_x = data.loc[0:ntrain,:]
# the following variable records the features of examples in the test set
test_x = (data.loc[ntrain+1:n,:]).reset_index()
force_types = ["Bodily Force", "Chemical Irritant", "Taser", "Police K9 Bite", "Firearm", "Baton", "Improvised Weapon", 
    "Gun Point Display", "Maximal Restraint Technique", "Less Lethal", "Less Lethal Projectile"]
y = []
for i in range(n):
    y.append(force_types.index(target.iloc[i,0]))
    
# the following variable records the labels of examples in the training set
train_y = y[0:ntrain+1]
# the following variable records the labels of examples in the test set
test_y = y[ntrain+1:n]

Size of dataset:  29251
23400


In [3]:
d = train_x.shape[1]
d2 = test_x.shape[1]

labels_string = ["EventAge"]
train_vals_from_string = train_x.filter(items=labels_string).applymap(lambda x: float(x) if not np.isnan(x) else 0.0)
train_x_no_string = pd.concat([train_x.iloc[:,1:3], train_vals_from_string, train_x.iloc[:, 4:d]], axis=1)
X_train = pd.concat([train_x_no_string, pd.DataFrame(np.ones(train_x.shape[0]))], axis=1)

test_vals_from_string = test_x.filter(items=labels_string).applymap(lambda x: float(x) if not np.isnan(x) else 0.0)
test_x_no_string = pd.concat([test_x.iloc[:,2:4], test_vals_from_string, test_x.iloc[:, 5:d2]], axis=1)
X_test = pd.concat([test_x_no_string, pd.DataFrame(np.ones(test_x.shape[0]))], axis=1)

In [4]:
for col in X_train.columns:
    print(X_train[col].isnull().values.any())

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [5]:
d = data.shape[1]

vals_from_string = data.filter(items=labels_string).applymap(lambda x: float(x) if not np.isnan(x) else 0.0)
x_no_string = pd.concat([data.iloc[:,1:3], vals_from_string, data.iloc[:, 4:d]], axis=1)
Xoffset =  pd.concat([x_no_string, pd.DataFrame(np.ones(n))], axis=1)

In [6]:
strategy = {0:17849, 1:4800, 2:4800, 3:4800, 4:4800, 5:4800, 6:4800, 7:4800, 8:4800, 9:4800, 10:4800}
oversample = SMOTE(sampling_strategy=strategy)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, train_y)
print(y_train_SMOTE)
# m = length(train_y)
# ova_loss = OvALoss(11, bin_loss = LogisticLoss())
# mul!(ova_loss, 1/m)
# w_ova = proxgrad(ova_loss, ZeroReg(), X_train, train_y, stepsize=1/norm(X_train)^2, maxiters=200)

  n_samples_majority,


[0, 0, 1, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 3, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 1, 0, 1, 3, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 2, 0, 0, 2, 2, 1, 1, 0, 2, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 2, 0, 3, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 6, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 6, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 5, 0, 1, 2, 1, 2, 0, 2, 0, 0, 0, 2, 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 

In [6]:
num_labels = Counter(train_y)
print(num_labels)

over = SMOTE(sampling_strategy={1:num_labels[1]+480, 2:num_labels[2]+480, 3:num_labels[3]+480, 4:num_labels[4]+480, 5:num_labels[5]+480, 6:num_labels[6]+480, 7:num_labels[7]+480, 8:num_labels[8]+480, 9:num_labels[9]+480, 10:num_labels[10]+480})
under = RandomUnderSampler(sampling_strategy={0:num_labels[0]-100})
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_SMOTE, y_SMOTE = pipeline.fit_resample(X_train, train_y)

Counter({0: 16849, 1: 3258, 2: 2228, 7: 357, 6: 272, 3: 238, 8: 88, 5: 49, 4: 30, 9: 19, 10: 13})


In [84]:
clf = OneVsRestClassifier(LogisticRegression(max_iter=10000, penalty='none', solver='saga')).fit(X_SMOTE, y_SMOTE)

In [89]:
rf = RandomForestClassifier(n_estimators=1000).fit(X_SMOTE, y_SMOTE)

In [85]:
x = clf.predict(X_test)

In [90]:
x_rf = rf.predict(X_test)

In [86]:
for i in range(len(x)):
    print(x[i], test_y[i])

0 1
0 0
0 1
0 0
0 0
0 0
0 0
0 1
0 0
0 2
0 0
0 0
0 0
0 0
0 1
0 0
9 0
0 0
9 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 2
0 1
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 2
0 9
0 0
0 2
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 8
0 0
0 0
0 0
0 0
0 0
0 6
0 0
0 0
0 0
0 1
0 2
0 0
0 0
0 0
0 0
0 2
0 0
0 2
0 1
0 0
9 0
0 0
0 1
0 0
0 2
0 6
0 0
0 6
0 0
0 1
0 1
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 1
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 1
0 7
0 0
0 0
0 0
0 0
0 0
0 0
0 8
0 0
0 0
0 0
0 0
0 0
9 1
0 0
0 0
0 0
0 0
0 0
0 0
9 9
9 0
0 1
0 0
0 2
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 2
0 0
0 0
0 0
0 0
0 1
9 2
0 0
9 0
0 0
0 1
0 0
0 1
0 2
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 1
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 8
0 0
0 1
0 0
0 0
9 0
0 0
0 0
0 1
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 1
0 0
0 0
0 0
0 9
0 0
0 0
9 9
0 9
0 0
0 0
0 0
0 2
9 9
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
9 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 1
0 2
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 1


0 0
0 0
9 1
9 1
0 0
9 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 1
0 0
0 0
0 0
0 0
0 0
0 3
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 2
0 2
0 0
0 0
0 0
9 0
0 3
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 1
0 0
0 0
0 6
0 0
0 0
0 0
0 1
0 0
0 0
0 1
0 1
0 0
0 0
0 0
0 1
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
9 0
0 0
0 0
0 0
0 0
9 1
0 0
0 0
0 2
0 0
0 1
0 2
0 2
0 2
0 10
0 0
0 2
0 0
0 0
0 0
0 2
0 0
0 0
0 1
0 0
0 0
0 1
0 0
0 0
0 0
0 2
0 0
0 1
0 0
0 0
0 0
0 0
0 1
0 2
0 0
0 8
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 7
0 0
0 1
0 2
0 0
0 3
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 2
0 2
0 0
0 1
0 0
0 0
0 0
0 0
0 6
0 0
0 0
0 0
0 2
0 0
0 0
0 1
0 0
0 2
0 1
0 0
0 2
0 6
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 2
0 0
0 2
0 0
0 1
0 0
0 8
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 8
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1

0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 2
0 0
0 0
0 2
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 2
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 8
0 0
0 0
0 0
0 0
0 0
0 8
0 2
0 1
0 0
0 0
0 1
0 0
0 1
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 1
0 1
0 0
0 0
0 0
0 2
0 0
0 2
0 2
0 0
0 0
0 8
0 0
0 0
0 2
0 0
0 1
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
9 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 3
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
9 9
0 0
0 2
0 1
0 0
0 0
0 3
0 0
0 0
0 8
0 0
9 0
0 0
9 0
0 1
0 0
0 0
0 2
0 0
0 0
0 0
0 0
9 0
0 1
0 0
9 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
9 0
0 0
0 0
0 3
0 0
0 0
0 0
0 0
0 0
0 1
9 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
9 1
0 1
0 0
9 2
0 2
9 6
0 1
0 2
0 6
0 2
0 0
0 0
0 0
9 0
9 0
9 1
0 0
0 0
9 0
0 0
0 0
0 2
9 0
0 0
0 0
0 0
0 0
0 0
0 7
0 0
0 0
0 0
0 0
0 0
0 0
0 7
0 0
0 0
0 0
0 0
0 0
9 2
0 0
0 0


In [87]:
# MSE
sum = 0
for i in range(len(x)):
    sum += (x[i]-test_y[i])**2
print(sum/len(x))

# misclassification
sum = 0
for i in range(len(x)):
    if x[i] != test_y[i]:
        sum +=1
print(len(x) - sum)       
print(sum/len(x))

3.9613675213675212
4185
0.2846153846153846


In [91]:
# MSE
sum = 0
for i in range(len(x_rf)):
    sum += (x_rf[i]-test_y[i])**2
print(sum/len(x_rf))

# misclassification
sum = 0
for i in range(len(x_rf)):
    if x_rf[i] != test_y[i]:
        sum +=1
print(len(x_rf) - sum)       
print(sum/len(x_rf))

3.2112820512820512
4572
0.21846153846153846


In [79]:
ebm = ExplainableBoostingClassifier()
ebm.fit(X_SMOTE, y_SMOTE)


Multiclass is still experimental. Subject to change per release.



KeyboardInterrupt: 

In [46]:
ebm_local = ebm.explain_local(X_test, test_y)
show(ebm_local)

In [50]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [65]:
pred = ebm.predict(X_test)

In [70]:
for i in range(len(pred)):
    if pred[i] == 9:
        print(i)

79
132
211
217
582
658
659
1106
2473
2566
3336
3432
3537
3737
3830
4004
4166
5794


In [74]:
probs = ebm.predict_proba(X_test)
print(max(probs[79]))

0.658225926856028


In [75]:
# misclassification
sum = 0
for i in range(len(pred)):
    if pred[i] != test_y[i]:
        sum +=1
print(len(pred) - sum)       
print(sum/len(pred))

4681
0.19982905982905982


# EBM Cross Validation

In [9]:
# misclassification
def misclassification(pred, test_y):
    sum = 0
    for i in range(len(pred)):
        if pred[i] != test_y[i]:
            sum +=1  
    return sum/len(pred)

In [10]:
from sklearn.model_selection import train_test_split

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(Xoffset, y, test_size=0.2, random_state=1)
    num_labels = Counter(y_train)
    over = SMOTE(sampling_strategy={1:num_labels[1]+480, 2:num_labels[2]+480, 3:num_labels[3]+480, 4:num_labels[4]+480, 5:num_labels[5]+480, 6:num_labels[6]+480, 7:num_labels[7]+480, 8:num_labels[8]+480, 9:num_labels[9]+480, 10:num_labels[10]+480})
    under = RandomUnderSampler(sampling_strategy={0:num_labels[0]-100})
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X_SMOTE, y_SMOTE = pipeline.fit_resample(X_train, y_train)
    ebm = ExplainableBoostingClassifier()
    ebm.fit(X_SMOTE, y_SMOTE)
    pred = ebm.predict(X_test)
    print(misclassification(pred, y_test))

  warn("Multiclass is still experimental. Subject to change per release.")


0.21141685182020167


  warn("Multiclass is still experimental. Subject to change per release.")


0.21090411895402494


  warn("Multiclass is still experimental. Subject to change per release.")


0.21175867373098617


  warn("Multiclass is still experimental. Subject to change per release.")


0.21175867373098617


  warn("Multiclass is still experimental. Subject to change per release.")


0.21056229704324048


In [11]:
pred = ebm.predict(X_train)
print(misclassification(pred, y_train))

0.21068376068376068


In [52]:
from interpret.glassbox import ClassificationTree

ebm_tree = ClassificationTree()
ebm_tree.fit(X_train, train_y)

<interpret.glassbox.decisiontree.ClassificationTree at 0x122c94d30>

In [55]:
ebm_global_tree = ebm_tree.explain_global(name='Classification Tree')
show(ebm_global_tree)

In [77]:
pred_tree = ebm_tree.predict(X_test)
sum = 0
for i in range(len(pred_tree)):
    if pred_tree[i] != test_y[i]:
        sum +=1
print(len(pred_tree) - sum)       
print(sum/len(pred_tree))

4575
0.21794871794871795


In [None]:
def cross_val(X, y)