In [1]:
import pandas as pd
import numpy as np

# Loading the necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN



cust = pd.read_csv('custody_ML.csv')
shoot = pd.read_csv('shootings_ML.csv')



In [2]:
cust.head()

Unnamed: 0.1,Unnamed: 0,dept,custody_type,facility,race,sex,death_type,charge_status,age,year,month
0,0,389,2,134,4,1,5,0,58.0,2012,9
1,1,389,2,115,3,1,5,0,76.0,2016,9
2,2,175,1,0,4,1,6,2,30.0,2016,12
3,3,389,2,60,1,1,1,0,39.0,2011,11
4,4,389,2,77,1,1,5,0,31.0,2014,12


In [3]:
shoot.head()

Unnamed: 0.1,Unnamed: 0,age,number_officers,fatality,armed,race,sex,stop_reason,officer_race,dept,year,month
0,103,26.0,1.0,0,2,4,1,23,13,0,2010,9
1,104,16.0,1.0,0,2,2,1,13,40,0,2010,10
2,105,26.0,2.0,0,2,0,1,66,48,0,2010,11
3,106,35.0,1.0,0,0,4,1,86,5,0,2010,12
4,107,30.0,1.0,0,2,2,1,66,13,0,2011,5


In [4]:
cust.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
shoot.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
#Scale cust df
X_col = cust.columns

cust_scaled = pd.DataFrame(StandardScaler().fit_transform(cust), columns= X_col)
cust_scaled.head()

Unnamed: 0,dept,custody_type,facility,race,sex,death_type,charge_status,age,year,month
0,0.555878,0.123491,0.230097,1.156565,0.238506,0.155584,-0.605742,0.590814,0.412088,0.723706
1,0.555878,0.123491,-0.095784,0.406515,0.238506,0.155584,-0.605742,1.798521,1.579194,0.723706
2,-1.378794,-1.037655,-2.068228,1.156565,0.238506,0.943599,1.487142,-1.287842,1.579194,1.595334
3,0.555878,0.123491,-1.039127,-1.093584,0.238506,-2.996475,-0.605742,-0.683988,0.120312,1.304791
4,0.555878,0.123491,-0.747548,-1.093584,0.238506,0.155584,-0.605742,-1.220747,0.995641,1.595334


In [7]:
#Scale shoot df
X_cols = shoot.columns

shoot_scaled = pd.DataFrame(StandardScaler().fit_transform(shoot), columns = X_cols)
shoot_scaled.head()

Unnamed: 0,age,number_officers,fatality,armed,race,sex,stop_reason,officer_race,dept,year,month
0,-0.670348,-0.39433,-1.387608,0.980761,1.548471,-0.543893,-1.792733,-1.051157,-1.900016,-1.560899,0.76152
1,-1.71653,-0.39433,-1.387608,0.980761,-0.096378,-0.543893,-2.118722,0.439038,-1.900016,-1.560899,1.051969
2,-0.670348,0.469955,-1.387608,0.980761,-1.741227,-0.543893,-0.390982,0.880578,-1.900016,-1.560899,1.342418
3,0.271215,-0.39433,-1.387608,-1.040776,1.548471,-0.543893,0.260995,-1.492696,-1.900016,-1.560899,1.632866
4,-0.251876,-0.39433,-1.387608,0.980761,-0.096378,-0.543893,-0.390982,-1.051157,-1.900016,-1.048604,-0.400275


# Deaths in Custody

In [8]:
#create training and testing groups of Custody data
train_cust, test_cust= train_test_split(cust_scaled, test_size = 0.2, random_state = 11)

## KMeans Clustering

In [9]:
#KMeans Clustering of Custody data
k_range = range(2,11)

for i in k_range:
    model = KMeans(n_clusters=i,random_state=11)
    pred =model.fit_predict(train_cust)
    silhouette_avg = silhouette_score(train_cust,pred)
    print('The number of clusters, %d, and silhouette coefficient is %0.2f'% (i,silhouette_avg))

The number of clusters, 2, and silhouette coefficient is 0.28
The number of clusters, 3, and silhouette coefficient is 0.31
The number of clusters, 4, and silhouette coefficient is 0.32
The number of clusters, 5, and silhouette coefficient is 0.18
The number of clusters, 6, and silhouette coefficient is 0.17
The number of clusters, 7, and silhouette coefficient is 0.18
The number of clusters, 8, and silhouette coefficient is 0.18
The number of clusters, 9, and silhouette coefficient is 0.18
The number of clusters, 10, and silhouette coefficient is 0.17


## Affinity Propagation

In [24]:
# Affinity propagation with default parameters
affPro = AffinityPropagation()
affPro.fit_predict(train_cust)
centers = affPro.cluster_centers_indices_
labels = affPro.labels_
print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

Silhouette Coefficient: 0.162


#### Tuning

In [21]:
damp = [0.5, 0.75, .99]
conv = [7, 15, 23]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_cust)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

damping =  0.5 convergence_iter =  7
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  15
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: 0.162
damping =  0.75 convergence_iter =  7
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.99 convergence_iter =  7


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

## Spectral Clustering

In [25]:
# Spectral clustering
for k in range(2,11):
    spect = SpectralClustering(n_clusters=k,random_state=11)
    spect.fit_predict(train_cust)
    spectlabel = spect.labels_
    score= silhouette_score(train_cust, spectlabel, metric='euclidean')
    print("The number of clusters: %d, silhouette coefficient: %0.2f" % (k,score))

The number of clusters, 2, and silhouette coefficient is 0.09
The number of clusters, 3, and silhouette coefficient is -0.06
The number of clusters, 4, and silhouette coefficient is -0.06
The number of clusters, 5, and silhouette coefficient is -0.07
The number of clusters, 6, and silhouette coefficient is -0.09
The number of clusters, 7, and silhouette coefficient is -0.09
The number of clusters, 8, and silhouette coefficient is -0.10
The number of clusters, 9, and silhouette coefficient is -0.10
The number of clusters, 10, and silhouette coefficient is -0.11


#### Tuning

In [None]:
gam = [0.5, 1.0, 1.5]

for g in gam:
    for k in range(2,11):
        spect = SpectralClustering(n_clusters=k, random_state=11, n_jobs =-1, gamma= g)
        spect.fit_predict(train_cust)
        spectlabel = spect.labels_
        score= silhouette_score(train_cust, spectlabel, metric='euclidean')
        print("The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f" % (k,g,score))

gamma =  0.5
The number of clusters: 2, gamma: 0.500000, silhouette coefficient: 0.13
gamma =  0.5
The number of clusters: 3, gamma: 0.500000, silhouette coefficient: 0.14
gamma =  0.5
The number of clusters: 4, gamma: 0.500000, silhouette coefficient: -0.05
gamma =  0.5
The number of clusters: 5, gamma: 0.500000, silhouette coefficient: -0.04
gamma =  0.5
The number of clusters: 6, gamma: 0.500000, silhouette coefficient: -0.07
gamma =  0.5
The number of clusters: 7, gamma: 0.500000, silhouette coefficient: -0.06
gamma =  0.5
The number of clusters: 8, gamma: 0.500000, silhouette coefficient: -0.06
gamma =  0.5
The number of clusters: 9, gamma: 0.500000, silhouette coefficient: -0.08
gamma =  0.5
The number of clusters: 10, gamma: 0.500000, silhouette coefficient: -0.08
gamma =  1.0
The number of clusters: 2, gamma: 1.000000, silhouette coefficient: 0.09
gamma =  1.0
The number of clusters: 3, gamma: 1.000000, silhouette coefficient: -0.06
gamma =  1.0
The number of clusters: 4, gamma

## Agglomerative

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

#### Tuning

In [None]:
''''
damp = [0.5,0.75, 1]
maxit = [100, 200, 300]
conv = [7, 15, 23]

for d in damp:
    for m in maxit:
        for c in conv:
            affPro = AffinityPropagation(damping= d, max_iter= m, convergence_iter= c)
            affPro.fit_predict(train_cust)
            centers = affPro.cluster_centers_indices_ 
            labels = affPro.labels_
            print("damping = ", d, "max_iter = ", m, "convergence_iter = ",c)''''''
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

## DBScan

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN

#### Tuning

In [None]:
'''damp = [0.5,0.75, 1]
maxit = [100, 200, 300]
conv = [7, 15, 23]

for d in damp:
    for m in maxit:
        for c in conv:
            affPro = AffinityPropagation(damping= d, max_iter= m, convergence_iter= c)
            affPro.fit_predict(train_cust)
            centers = affPro.cluster_centers_indices_ 
            labels = affPro.labels_
            print("damping = ", d, "max_iter = ", m, "convergence_iter = ",c)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))'''

# Officer Involved Shootings

In [None]:
#create training and testing groups of Shootings data
train_shoot, test_shoot= train_test_split(shoot_scaled, test_size = 0.2, random_state = 11)

## Kmeans 

In [None]:
#KMeans Clustering of Custody data
k_range = range(2,11)

for i in k_range:
    model = KMeans(n_clusters=i,random_state=11)
    pred =model.fit_predict(train_shoot)
    silhouette_avg = silhouette_score(train_shoot,pred)
    print('The number of clusters, %d, and silhouette coefficient is %0.2f'% (i,silhouette_avg))

## Affinity Propagation

In [None]:
# Affinity propagation with default parameters
affPro = AffinityPropagation()
affPro.fit_predict(train_shoot)
centers = affPro.cluster_centers_indices_
labels = affPro.labels_
print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

#### Tuning 

In [19]:
damp = [0.5,0.75, 1]
conv = [7, 15, 23]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_shoot)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

KeyboardInterrupt: 

## Spectral Clustering

In [None]:
# Spectral clustering of Shooting data
for k in range(2,11):
    spect = SpectralClustering(n_clusters=k,random_state=11)
    spect.fit_predict(train_shoot)
    spectlabel = spect.labels_
    score=silhouette_score(train_shoot, spectlabel, metric='euclidean')
    print("The number of clusters, %d, and silhouette coefficient is %0.2f" % (k,score))

#### Tuning 

In [None]:
gam = [0.5, 1.0, 1.5]

for g in gam:
    for k in range(2,11):
        spect = SpectralClustering(n_clusters=k, random_state=11, n_jobs =-1, gamma= g)
        spect.fit_predict(train_shoot)
        spectlabel = spect.labels_
        score= silhouette_score(train_shoot, spectlabel, metric='euclidean')
        print("The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f" % (k,g,score))

## Agglomerative Clustering

#### Tuning 

In [None]:
'''damp = [0.5,0.75, 1]
maxit = [100, 200, 300]
conv = [7, 15, 23]

for d in damp:
    for m in maxit:
        for c in conv:
            affPro = AffinityPropagation(damping= d, max_iter= m, convergence_iter= c)
            affPro.fit_predict(train_cust)
            centers = affPro.cluster_centers_indices_ 
            labels = affPro.labels_
            print("damping = ", d, "max_iter = ", m, "convergence_iter = ",c)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))'''

## DBScan

#### Tuning

In [None]:
'''damp = [0.5,0.75, 1]
maxit = [100, 200, 300]
conv = [7, 15, 23]

for d in damp:
    for m in maxit:
        for c in conv:
            affPro = AffinityPropagation(damping= d, max_iter= m, convergence_iter= c)
            affPro.fit_predict(train_cust)
            centers = affPro.cluster_centers_indices_ 
            labels = affPro.labels_
            print("damping = ", d, "max_iter = ", m, "convergence_iter = ",c)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))'''