In [4]:
import pandas as pd
import numpy as np

# Loading the necessary libraries
from sklearn.preprocessing import StandardScaler
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` is provided by `sklearn.model_selection`, which this
# notebook already imports from.
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN



# Load the two datasets from CSV files in the current working directory.
cust = pd.read_csv('custody_ML.csv')
shoot = pd.read_csv('shootings_ML.csv')

In [5]:
# Preview the first rows of the custody data as loaded from the CSV.
cust.head()

Unnamed: 0.1,Unnamed: 0,dept,custody_type,race,sex,death_type,age
0,0,389,2,4,1,5,58.0
1,1,389,2,3,1,5,76.0
2,2,175,1,4,1,6,30.0
3,3,389,2,1,1,1,39.0
4,4,389,2,1,1,5,31.0


In [6]:
# Preview the first rows of the shootings data as loaded from the CSV.
shoot.head()

Unnamed: 0.1,Unnamed: 0,age,fatality,armed,race,sex,dept,mult_officers
0,103,26.0,0,2,4,1,0,0
1,104,16.0,0,2,2,1,0,0
2,105,26.0,0,2,0,1,0,1
3,106,35.0,0,0,4,1,0,0
4,107,30.0,0,2,2,1,0,0


In [7]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# export -- verify against the CSV). Reassigning instead of using inplace=True,
# together with errors='ignore', keeps this cell idempotent: re-running it no
# longer raises KeyError once the column is already gone.
cust = cust.drop('Unnamed: 0', axis=1, errors='ignore')

In [8]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# export -- verify against the CSV). Reassigning instead of using inplace=True,
# together with errors='ignore', keeps this cell idempotent: re-running it no
# longer raises KeyError once the column is already gone.
shoot = shoot.drop('Unnamed: 0', axis=1, errors='ignore')

In [9]:
# Standardize every column of the custody frame and wrap the resulting array
# back into a DataFrame that keeps the original column labels.
X_col = cust.columns

cust_values = StandardScaler().fit(cust).transform(cust)
cust_scaled = pd.DataFrame(data=cust_values, columns=X_col)
cust_scaled.head()

Unnamed: 0,dept,custody_type,race,sex,death_type,age
0,0.555878,0.123491,1.156565,0.238506,0.155584,0.590814
1,0.555878,0.123491,0.406515,0.238506,0.155584,1.798521
2,-1.378794,-1.037655,1.156565,0.238506,0.943599,-1.287842
3,0.555878,0.123491,-1.093584,0.238506,-2.996475,-0.683988
4,0.555878,0.123491,-1.093584,0.238506,0.155584,-1.220747


In [10]:
# Standardize every column of the shootings frame and wrap the resulting array
# back into a DataFrame that keeps the original column labels.
X_cols = shoot.columns

shoot_values = StandardScaler().fit(shoot).transform(shoot)
shoot_scaled = pd.DataFrame(data=shoot_values, columns=X_cols)
shoot_scaled.head()

Unnamed: 0,age,fatality,armed,race,sex,dept,mult_officers
0,-0.670348,-1.387608,0.980761,1.548471,-0.543893,-1.900016,-0.591786
1,-1.71653,-1.387608,0.980761,-0.096378,-0.543893,-1.900016,-0.591786
2,-0.670348,-1.387608,0.980761,-1.741227,-0.543893,-1.900016,1.689799
3,0.271215,-1.387608,-1.040776,1.548471,-0.543893,-1.900016,-0.591786
4,-0.251876,-1.387608,0.980761,-0.096378,-0.543893,-1.900016,-0.591786


# Deaths in Custody

## KMeans Clustering

In [11]:
# KMeans on the scaled custody data: fit one model per cluster count in
# k_range and report the mean silhouette coefficient of each partition.
k_range = range(2, 11)

for n_clusters in k_range:
    model = KMeans(random_state=11, n_clusters=n_clusters)
    pred = model.fit_predict(cust_scaled)
    silhouette_avg = silhouette_score(cust_scaled, pred)
    message = 'The number of clusters, %d, and silhouette coefficient is %0.2f'
    print(message % (n_clusters, silhouette_avg))

The number of clusters, 2, and silhouette coefficient is 0.36
The number of clusters, 3, and silhouette coefficient is 0.40
The number of clusters, 4, and silhouette coefficient is 0.43
The number of clusters, 5, and silhouette coefficient is 0.39
The number of clusters, 6, and silhouette coefficient is 0.40
The number of clusters, 7, and silhouette coefficient is 0.40
The number of clusters, 8, and silhouette coefficient is 0.40
The number of clusters, 9, and silhouette coefficient is 0.36
The number of clusters, 10, and silhouette coefficient is 0.34


## Affinity Propagation

In [12]:
# Affinity propagation with default parameters
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt).
affPro = AffinityPropagation()
labels = affPro.fit_predict(cust_scaled)

# silhouette_score is only defined when the labelling has between 2 and
# n_samples - 1 distinct groups. Affinity propagation can return a degenerate
# labelling when it fails to converge, which yields NaN or a ValueError
# depending on the scikit-learn version, so report NaN explicitly instead.
n_found = len(np.unique(labels))
if 2 <= n_found < len(cust_scaled):
    score = silhouette_score(cust_scaled, labels, metric='euclidean')
else:
    score = float('nan')
print("Silhouette Coefficient: %0.3f"% score)

KeyboardInterrupt: 

#### Tuning

In [13]:
# Grid over damping and convergence_iter for affinity propagation on the
# scaled custody data.
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt).
damp = [0.95, 0.85, 0.75, 0.5]
conv = [7, 15, 23, 40]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        labels = affPro.fit_predict(cust_scaled)
        print("damping = ", d, "convergence_iter = ",c)
        # silhouette_score needs 2..n_samples-1 distinct groups; a degenerate
        # labelling (e.g. from non-convergence) is reported as NaN so a single
        # bad parameter combination cannot abort the whole sweep.
        n_found = len(np.unique(labels))
        if 2 <= n_found < len(cust_scaled):
            score = silhouette_score(cust_scaled, labels, metric='euclidean')
        else:
            score = float('nan')
        print("Silhouette Coefficient: %0.3f"% score)

KeyboardInterrupt: 

## Spectral Clustering

In [14]:
# Spectral clustering of the scaled custody data for 2-10 clusters, scored by
# the mean silhouette coefficient (Euclidean distance).
for n_clusters in range(2, 11):
    spect = SpectralClustering(random_state=11, n_clusters=n_clusters)
    spectlabel = spect.fit_predict(cust_scaled)
    score = silhouette_score(cust_scaled, spectlabel, metric='euclidean')
    template = "The number of clusters: %d, silhouette coefficient: %0.2f"
    print(template % (n_clusters, score))

The number of clusters: 2, silhouette coefficient: 0.23
The number of clusters: 3, silhouette coefficient: 0.23
The number of clusters: 4, silhouette coefficient: 0.26
The number of clusters: 5, silhouette coefficient: 0.06
The number of clusters: 6, silhouette coefficient: 0.08
The number of clusters: 7, silhouette coefficient: 0.06
The number of clusters: 8, silhouette coefficient: 0.01
The number of clusters: 9, silhouette coefficient: 0.00
The number of clusters: 10, silhouette coefficient: 0.13


#### Tuning

In [35]:
# Sweep the kernel coefficient gamma together with the cluster count for
# spectral clustering on the scaled custody data.
gam = [0.01, 0.1, 0.25, 0.5, 1.0]

for gamma in gam:
    for n_clusters in range(2, 11):
        spect = SpectralClustering(
            n_clusters=n_clusters,
            gamma=gamma,
            random_state=11,
            n_jobs=-1,
        )
        spectlabel = spect.fit_predict(cust_scaled)
        score = silhouette_score(cust_scaled, spectlabel, metric='euclidean')
        template = "The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f"
        print(template % (n_clusters, gamma, score))

The number of clusters: 2, gamma: 0.010000, silhouette coefficient: 0.35
The number of clusters: 3, gamma: 0.010000, silhouette coefficient: 0.39
The number of clusters: 4, gamma: 0.010000, silhouette coefficient: 0.34
The number of clusters: 5, gamma: 0.010000, silhouette coefficient: 0.38
The number of clusters: 6, gamma: 0.010000, silhouette coefficient: 0.39
The number of clusters: 7, gamma: 0.010000, silhouette coefficient: 0.40
The number of clusters: 8, gamma: 0.010000, silhouette coefficient: 0.39
The number of clusters: 9, gamma: 0.010000, silhouette coefficient: 0.40
The number of clusters: 10, gamma: 0.010000, silhouette coefficient: 0.39
The number of clusters: 2, gamma: 0.100000, silhouette coefficient: 0.35
The number of clusters: 3, gamma: 0.100000, silhouette coefficient: 0.38
The number of clusters: 4, gamma: 0.100000, silhouette coefficient: 0.43
The number of clusters: 5, gamma: 0.100000, silhouette coefficient: 0.39
The number of clusters: 6, gamma: 0.100000, silhou

## Agglomerative

In [16]:
# Agglomerative clustering of the scaled custody data for 2-10 clusters,
# scored by the mean silhouette coefficient (Euclidean distance).
for n_clusters in range(2, 11):
    agg = AgglomerativeClustering(n_clusters=n_clusters)
    labels = agg.fit_predict(cust_scaled)
    score = silhouette_score(cust_scaled, labels, metric='euclidean')
    template = "The number of clusters: %d, silhouette coefficient: %0.2f"
    print(template % (n_clusters, score))

The number of clusters: 2, silhouette coefficient: 0.34
The number of clusters: 3, silhouette coefficient: 0.39
The number of clusters: 4, silhouette coefficient: 0.42
The number of clusters: 5, silhouette coefficient: 0.37
The number of clusters: 6, silhouette coefficient: 0.39
The number of clusters: 7, silhouette coefficient: 0.40
The number of clusters: 8, silhouette coefficient: 0.40
The number of clusters: 9, silhouette coefficient: 0.36
The number of clusters: 10, silhouette coefficient: 0.35


## DBSCAN

In [17]:
# DBSCAN with default parameters on the scaled custody data.
db = DBSCAN()
labels = db.fit_predict(cust_scaled)

# Compute the silhouette once and reuse it (it was previously evaluated twice).
# The score is only defined for 2..n_samples-1 distinct labels, so a degenerate
# labelling is reported as NaN instead of raising.
# NOTE: DBSCAN marks noise points with the label -1; silhouette_score treats
# that label like any other group.
n_found = len(np.unique(labels))
if 2 <= n_found < len(cust_scaled):
    score = silhouette_score(cust_scaled, labels, metric='euclidean')
else:
    score = float('nan')
print("Silhouette Coefficient: %0.3f"% score)

Silhouette Coefficient: 0.079


#### Tuning

In [20]:
# Grid over eps and min_samples for DBSCAN on the scaled custody data.
ep = [0.25, 0.5, 0.75, 1, 1.5, 2, 2.5, 3, 3.5, 3.75, 4] #only clustered into one group at 5
minsamp = [3, 4, 5, 6]

for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        labels = db.fit_predict(cust_scaled)
        print("eps = ", e, "min_samples = ", m)
        # silhouette_score needs 2..n_samples-1 distinct labels. Reporting NaN
        # for a degenerate result (e.g. everything in one group) lets the sweep
        # continue instead of aborting on the first such parameter pair.
        # NOTE: the noise label -1 is scored like any other group.
        n_found = len(np.unique(labels))
        if 2 <= n_found < len(cust_scaled):
            score = silhouette_score(cust_scaled, labels, metric='euclidean')
        else:
            score = float('nan')
        print("Silhouette Coefficient: %0.3f"% score)

eps =  0.25 min_samples =  3
Silhouette Coefficient: -0.023
eps =  0.25 min_samples =  4
Silhouette Coefficient: -0.024
eps =  0.25 min_samples =  5
Silhouette Coefficient: -0.009
eps =  0.25 min_samples =  6
Silhouette Coefficient: 0.123
eps =  0.5 min_samples =  3
Silhouette Coefficient: 0.043
eps =  0.5 min_samples =  4
Silhouette Coefficient: 0.082
eps =  0.5 min_samples =  5
Silhouette Coefficient: 0.079
eps =  0.5 min_samples =  6
Silhouette Coefficient: -0.000
eps =  0.75 min_samples =  3
Silhouette Coefficient: 0.111
eps =  0.75 min_samples =  4
Silhouette Coefficient: 0.145
eps =  0.75 min_samples =  5
Silhouette Coefficient: 0.136
eps =  0.75 min_samples =  6
Silhouette Coefficient: 0.127
eps =  1 min_samples =  3
Silhouette Coefficient: 0.223
eps =  1 min_samples =  4
Silhouette Coefficient: 0.221
eps =  1 min_samples =  5
Silhouette Coefficient: 0.223
eps =  1 min_samples =  6
Silhouette Coefficient: 0.250
eps =  1.5 min_samples =  3
Silhouette Coefficient: 0.254
eps =  1.5

# Officer Involved Shootings

## KMeans

In [22]:
# KMeans on the scaled shootings data: fit one model per cluster count in
# k_range and report the mean silhouette coefficient of each partition.
k_range = range(2, 11)

for n_clusters in k_range:
    model = KMeans(random_state=11, n_clusters=n_clusters)
    pred = model.fit_predict(shoot_scaled)
    silhouette_avg = silhouette_score(shoot_scaled, pred)
    message = 'The number of clusters, %d, and silhouette coefficient is %0.2f'
    print(message % (n_clusters, silhouette_avg))

The number of clusters, 2, and silhouette coefficient is 0.17
The number of clusters, 3, and silhouette coefficient is 0.19
The number of clusters, 4, and silhouette coefficient is 0.20
The number of clusters, 5, and silhouette coefficient is 0.21
The number of clusters, 6, and silhouette coefficient is 0.19
The number of clusters, 7, and silhouette coefficient is 0.22
The number of clusters, 8, and silhouette coefficient is 0.23
The number of clusters, 9, and silhouette coefficient is 0.23
The number of clusters, 10, and silhouette coefficient is 0.24


## Affinity Propagation

In [23]:
# Affinity propagation with default parameters
affPro = AffinityPropagation()
labels = affPro.fit_predict(shoot_scaled)
centers = affPro.cluster_centers_indices_

# The recorded run of this cell printed NaN with a RuntimeWarning, i.e. the
# labelling was not usable for a silhouette score. The score is only defined
# for 2..n_samples-1 distinct groups; depending on the scikit-learn version a
# degenerate labelling yields NaN or a ValueError, so report NaN explicitly.
n_found = len(np.unique(labels))
if 2 <= n_found < len(shoot_scaled):
    score = silhouette_score(shoot_scaled, labels, metric='euclidean')
else:
    score = float('nan')
print("Silhouette Coefficient: %0.3f"% score)

Silhouette Coefficient: nan


  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


#### Tuning 

In [32]:
# Grid over damping and convergence_iter for affinity propagation on the
# scaled shootings data.
damp = [0.5, 0.6,0.7, 0.8, 0.9] #only one cluster over 0.85
conv = [7, 15, 23, 50, 75]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        labels = affPro.fit_predict(shoot_scaled)
        centers = affPro.cluster_centers_indices_
        print("damping = ", d, "convergence_iter = ",c)
        # silhouette_score needs 2..n_samples-1 distinct groups. Some grid
        # points (damping 0.5 in the recorded run) give an unusable labelling;
        # report NaN for those so the sweep neither warns nor aborts.
        n_found = len(np.unique(labels))
        if 2 <= n_found < len(shoot_scaled):
            score = silhouette_score(shoot_scaled, labels, metric='euclidean')
        else:
            score = float('nan')
        print("Silhouette Coefficient: %0.3f"% score)

damping =  0.5 convergence_iter =  7
Silhouette Coefficient: nan


  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


damping =  0.5 convergence_iter =  15
Silhouette Coefficient: nan
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: nan
damping =  0.5 convergence_iter =  50
Silhouette Coefficient: nan
damping =  0.5 convergence_iter =  75
Silhouette Coefficient: nan
damping =  0.6 convergence_iter =  7
Silhouette Coefficient: 0.408
damping =  0.6 convergence_iter =  15
Silhouette Coefficient: 0.408
damping =  0.6 convergence_iter =  23
Silhouette Coefficient: 0.408
damping =  0.6 convergence_iter =  50
Silhouette Coefficient: 0.408
damping =  0.6 convergence_iter =  75
Silhouette Coefficient: 0.408
damping =  0.7 convergence_iter =  7
Silhouette Coefficient: 0.392
damping =  0.7 convergence_iter =  15
Silhouette Coefficient: 0.399
damping =  0.7 convergence_iter =  23
Silhouette Coefficient: 0.399
damping =  0.7 convergence_iter =  50
Silhouette Coefficient: 0.406
damping =  0.7 convergence_iter =  75
Silhouette Coefficient: 0.406
damping =  0.8 convergence_iter =  7
Silhouette Coefficien

## Spectral Clustering

In [25]:
# Spectral clustering of the scaled shootings data for 2-10 clusters, scored
# by the mean silhouette coefficient (Euclidean distance).
for n_clusters in range(2, 11):
    spect = SpectralClustering(random_state=11, n_clusters=n_clusters)
    spectlabel = spect.fit_predict(shoot_scaled)
    score = silhouette_score(shoot_scaled, spectlabel, metric='euclidean')
    template = "The number of clusters, %d, and silhouette coefficient is %0.2f"
    print(template % (n_clusters, score))

The number of clusters, 2, and silhouette coefficient is 0.07
The number of clusters, 3, and silhouette coefficient is 0.08
The number of clusters, 4, and silhouette coefficient is 0.07
The number of clusters, 5, and silhouette coefficient is 0.05
The number of clusters, 6, and silhouette coefficient is 0.09
The number of clusters, 7, and silhouette coefficient is 0.08
The number of clusters, 8, and silhouette coefficient is 0.11
The number of clusters, 9, and silhouette coefficient is 0.08
The number of clusters, 10, and silhouette coefficient is 0.11


#### Tuning 

In [26]:
# Sweep the kernel coefficient gamma together with the cluster count for
# spectral clustering on the scaled shootings data.
gam = [0.5, 1.0, 1.5]

for gamma in gam:
    for n_clusters in range(2, 11):
        spect = SpectralClustering(
            n_clusters=n_clusters,
            gamma=gamma,
            random_state=11,
            n_jobs=-1,
        )
        spectlabel = spect.fit_predict(shoot_scaled)
        score = silhouette_score(shoot_scaled, spectlabel, metric='euclidean')
        template = "The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f"
        print(template % (n_clusters, gamma, score))

The number of clusters: 2, gamma: 0.500000, silhouette coefficient: 0.08
The number of clusters: 3, gamma: 0.500000, silhouette coefficient: 0.09
The number of clusters: 4, gamma: 0.500000, silhouette coefficient: 0.10
The number of clusters: 5, gamma: 0.500000, silhouette coefficient: 0.09
The number of clusters: 6, gamma: 0.500000, silhouette coefficient: 0.11
The number of clusters: 7, gamma: 0.500000, silhouette coefficient: 0.12
The number of clusters: 8, gamma: 0.500000, silhouette coefficient: 0.13
The number of clusters: 9, gamma: 0.500000, silhouette coefficient: 0.14
The number of clusters: 10, gamma: 0.500000, silhouette coefficient: 0.17
The number of clusters: 2, gamma: 1.000000, silhouette coefficient: 0.07
The number of clusters: 3, gamma: 1.000000, silhouette coefficient: 0.08
The number of clusters: 4, gamma: 1.000000, silhouette coefficient: 0.07
The number of clusters: 5, gamma: 1.000000, silhouette coefficient: 0.05
The number of clusters: 6, gamma: 1.000000, silhou

## Agglomerative Clustering

In [27]:
# Agglomerative clustering of the scaled shootings data for 2-10 clusters,
# scored by the mean silhouette coefficient (Euclidean distance).
for n_clusters in range(2, 11):
    agg = AgglomerativeClustering(n_clusters=n_clusters)
    labels = agg.fit_predict(shoot_scaled)
    score = silhouette_score(shoot_scaled, labels, metric='euclidean')
    template = "The number of clusters: %d, silhouette coefficient: %0.2f"
    print(template % (n_clusters, score))

The number of clusters: 2, silhouette coefficient: 0.16
The number of clusters: 3, silhouette coefficient: 0.20
The number of clusters: 4, silhouette coefficient: 0.19
The number of clusters: 5, silhouette coefficient: 0.18
The number of clusters: 6, silhouette coefficient: 0.18
The number of clusters: 7, silhouette coefficient: 0.18
The number of clusters: 8, silhouette coefficient: 0.20
The number of clusters: 9, silhouette coefficient: 0.22
The number of clusters: 10, silhouette coefficient: 0.19


## DBSCAN

#### Tuning

In [34]:
# Grid over eps and min_samples for DBSCAN on the scaled shootings data.
ep = [0.25, 0.5, 0.75, 1, 2] #groups into one cluster at 3
minsamp = [2, 3, 4] #groups into one cluster at 5

for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        labels = db.fit_predict(shoot_scaled)
        print("eps = ", e, "min_samples = ", m)
        # silhouette_score needs 2..n_samples-1 distinct labels. Reporting NaN
        # for a degenerate result (e.g. everything in one group) lets the sweep
        # continue instead of aborting on the first such parameter pair.
        # NOTE: the noise label -1 is scored like any other group.
        n_found = len(np.unique(labels))
        if 2 <= n_found < len(shoot_scaled):
            score = silhouette_score(shoot_scaled, labels, metric='euclidean')
        else:
            score = float('nan')
        print("Silhouette Coefficient: %0.3f"% score)

eps =  0.25 min_samples =  2
Silhouette Coefficient: 0.176
eps =  0.25 min_samples =  3
Silhouette Coefficient: 0.015
eps =  0.25 min_samples =  4
Silhouette Coefficient: -0.071
eps =  0.5 min_samples =  2
Silhouette Coefficient: 0.300
eps =  0.5 min_samples =  3
Silhouette Coefficient: 0.150
eps =  0.5 min_samples =  4
Silhouette Coefficient: 0.051
eps =  0.75 min_samples =  2
Silhouette Coefficient: 0.343
eps =  0.75 min_samples =  3
Silhouette Coefficient: 0.243
eps =  0.75 min_samples =  4
Silhouette Coefficient: 0.147
eps =  1 min_samples =  2
Silhouette Coefficient: 0.224
eps =  1 min_samples =  3
Silhouette Coefficient: 0.212
eps =  1 min_samples =  4
Silhouette Coefficient: 0.195
eps =  2 min_samples =  2
Silhouette Coefficient: 0.077
eps =  2 min_samples =  3
Silhouette Coefficient: 0.084
eps =  2 min_samples =  4
Silhouette Coefficient: 0.091


# Testing / Checking Labels

### Officer Involved Shootings

In [33]:
#damping =  0.8 convergence_iter =  50
#Silhouette Coefficient: 0.412

# Refit the configuration quoted above and inspect its labels and exemplar
# indices. convergence_iter was previously 507, which contradicts the setting
# quoted above and exceeds the estimator's default max_iter (200), so the
# convergence criterion could never be satisfied; use the intended value 50.
affPro = AffinityPropagation(damping= 0.8, convergence_iter=50)
labels = affPro.fit_predict(shoot_scaled)
centers = affPro.cluster_centers_indices_
print(labels)
print(centers)

[ 3  6  7 12  6  0  4 14  4  1  8 12  4  9  2 15 10  1  8 10 15  9  4  4 11
  2  2  3  9  3 16  3  2  3  4  0 12  8 11 16 13  8 16  4  1 13 12 14 34  1
  2  7  8 14  5  1  4 13  9 11 14  4  4  5  4  4 13  6  4 21  9  6  7  2  9
 42  6  4 11 14  4  4  7  6 13  8  4  1  4  6  8  9  9 14  6  4  9 13  4  5
 13 14  9  8 13 11  4  5 15  8 10  4 10 15  9  5 12 11  5  8 14  7 12 15 13
  6  6  9 12  4  8 18 11  5 13 11 34  8 14  7  5  4  5  5  5  7  5  5  6  5
  4  6  8  5  4 18  0 18 12 18  1  4 27 18 15 12 12 11  3 17 11 18 10 16 34
 14 37 18 16 10 32 23 35 18 21 32 35 31 25 15 33 28 19 33 29 33 35 17 26  2
 17 31 26 35 29 17 17 15 35 18 19 21 33 40 15 15 29 18 23 24 39 40 23 30 20
 23 21 20 26 19 21 31 26 23 22 31 23 21 39 28 23 35 31 31 28 19 23 31 28 26
 19 23 17 23 46 24 20 22 18 24 37 39 22 32 23 35 24 36 25 21 35 20 24 26 17
 31 31 35 31 31 29 23 21 31 28 26 20 26 29 21 32 34 29 23 23 19 27 21 26 24
 35 34 28 23 21 25 31 32 19 31 23 29 21 35 18 28 32 23 36 27 22 38 21 20 22
 35 21 19 29

In [31]:
# Refit affinity propagation on the scaled shootings data with damping 0.6 and
# convergence_iter 7, then show the per-sample labels and the exemplar indices.
affPro = AffinityPropagation(convergence_iter=7, damping=0.6)
labels = affPro.fit_predict(shoot_scaled)
centers = affPro.cluster_centers_indices_
for fitted_attribute in (labels, centers):
    print(fitted_attribute)

[ 2  4  5 12  4  6  7 15  7  0  8 12 13  9  1 20 10  0  8  7  9  6 13 13 11
  1  1  2  6  2 16  2  1  2  7  6 12  8 11 16 14  8 16  7  0 14 12 15 37  0
  1  5  8 15  3  0 13 14  6 11 15 13 13  3  7 13 14  4  7  7  6  4  5  1  6
 50  4  7 11 15  7  7  5  4 14  8  7  0 13  4  8  9  9 15  4 13  6 14 13  3
 14 15  6  8 14 11 13  3 20  8 10 13 10 20  6  3 12 11  3  8 15  5 12  9 14
  4  4  6 12 13  8 18 11  3 14 11 37  8 15  5  3 13  3  3  3  5  3  3  4  3
  7  4  8  3  7 18  6 18 12 18  0 13 30 18 20 12 12 11  2 17 11 18 10 16 37
 15 39 18 16 10 35 32 29 34 27 35 29 26 24 20 36 31 19 36 42 36 29 17 28  1
 17 26 28 34 32 17 17 20 29 18 19 25 36 42 20 20 32 18 29 23 41 34 29 33 21
 29 25 21 28 19 27 26 28 32 22 26 27 27 41 31 32 29 26 27 31 19 29 26 31 28
 19 42 17 29 50 23 21 22 18 23 39 41 22 35 29 29 23 38 24 25 29 21 23 28 17
 26 27 29 26 26 33 27 27 26 31 28 21 28 32 27 35 37 32 29 29 19 30 27 28 23
 29 37 31 29 27 24 27 35 19 26 32 32 25 29 18 31 35 29 38 30 22 40 25 21 22
 29 25 19 32

### Deaths in Custody

In [21]:
#Silhouette Coefficient: 0.487
# Refit DBSCAN on the scaled custody data with eps=3 / min_samples=3 and show
# the per-sample labels. The silhouette value quoted above is not recomputed
# in this cell.
dbcust = DBSCAN(min_samples=3, eps=3, n_jobs=-1)
labels = dbcust.fit_predict(cust_scaled)

print(labels)

[0 0 0 ..., 0 0 0]


In [36]:
#The number of clusters: 4, gamma: 0.100000, silhouette coefficient: 0.43
# Refit the spectral configuration quoted above on the scaled custody data and
# show the per-sample labels.
spect = SpectralClustering(
    n_clusters=4,
    gamma=0.1,
    random_state=11,
    n_jobs=-1,
)
labels = spect.fit_predict(cust_scaled)
print(labels)

[0 0 2 ..., 1 0 0]


In [37]:
#The number of clusters, 4, and silhouette coefficient is 0.43
# Refit the 4-cluster KMeans model quoted above on the scaled custody data and
# show the per-sample labels.
model = KMeans(random_state=11, n_clusters=4).fit(cust_scaled)
print(model.labels_)

[1 1 0 ..., 2 1 1]
