In [1]:
import pandas as pd
import numpy as np

# Loading the necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN



cust = pd.read_csv('custody_ML.csv')
shoot = pd.read_csv('shootings_ML.csv')



In [2]:
cust.head()

Unnamed: 0.1,Unnamed: 0,dept,custody_type,facility,race,sex,death_type,charge_status,age,year,month
0,0,389,2,134,4,1,5,0,58.0,2012,9
1,1,389,2,115,3,1,5,0,76.0,2016,9
2,2,175,1,0,4,1,6,2,30.0,2016,12
3,3,389,2,60,1,1,1,0,39.0,2011,11
4,4,389,2,77,1,1,5,0,31.0,2014,12


In [3]:
shoot.head()

Unnamed: 0.1,Unnamed: 0,age,number_officers,fatality,armed,race,sex,stop_reason,officer_race,dept,year,month
0,103,26.0,1.0,0,2,4,1,23,13,0,2010,9
1,104,16.0,1.0,0,2,2,1,13,40,0,2010,10
2,105,26.0,2.0,0,2,0,1,66,48,0,2010,11
3,106,35.0,1.0,0,0,4,1,86,5,0,2010,12
4,107,30.0,1.0,0,2,2,1,66,13,0,2011,5


In [2]:
cust.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
shoot.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
#Scale cust df
X_col = cust.columns

cust_scaled = pd.DataFrame(StandardScaler().fit_transform(cust), columns= X_col)
cust_scaled.head()

Unnamed: 0,dept,custody_type,facility,race,sex,death_type,charge_status,age,year,month
0,0.555878,0.123491,0.230097,1.156565,0.238506,0.155584,-0.605742,0.590814,0.412088,0.723706
1,0.555878,0.123491,-0.095784,0.406515,0.238506,0.155584,-0.605742,1.798521,1.579194,0.723706
2,-1.378794,-1.037655,-2.068228,1.156565,0.238506,0.943599,1.487142,-1.287842,1.579194,1.595334
3,0.555878,0.123491,-1.039127,-1.093584,0.238506,-2.996475,-0.605742,-0.683988,0.120312,1.304791
4,0.555878,0.123491,-0.747548,-1.093584,0.238506,0.155584,-0.605742,-1.220747,0.995641,1.595334


In [5]:
#Scale shoot df
X_cols = shoot.columns

shoot_scaled = pd.DataFrame(StandardScaler().fit_transform(shoot), columns = X_cols)
shoot_scaled.head()

Unnamed: 0,age,number_officers,fatality,armed,race,sex,stop_reason,officer_race,dept,year,month
0,-0.670348,-0.39433,-1.387608,0.980761,1.548471,-0.543893,-1.792733,-1.051157,-1.900016,-1.560899,0.76152
1,-1.71653,-0.39433,-1.387608,0.980761,-0.096378,-0.543893,-2.118722,0.439038,-1.900016,-1.560899,1.051969
2,-0.670348,0.469955,-1.387608,0.980761,-1.741227,-0.543893,-0.390982,0.880578,-1.900016,-1.560899,1.342418
3,0.271215,-0.39433,-1.387608,-1.040776,1.548471,-0.543893,0.260995,-1.492696,-1.900016,-1.560899,1.632866
4,-0.251876,-0.39433,-1.387608,0.980761,-0.096378,-0.543893,-0.390982,-1.051157,-1.900016,-1.048604,-0.400275


# Deaths in Custody

In [6]:
#create training and testing groups of Custody data
train_cust, test_cust= train_test_split(cust_scaled, test_size = 0.2, random_state = 11)

## KMeans Clustering

In [9]:
#KMeans Clustering of Custody data
k_range = range(2,11)

for i in k_range:
    model = KMeans(n_clusters=i,random_state=11)
    pred =model.fit_predict(train_cust)
    silhouette_avg = silhouette_score(train_cust,pred)
    print('The number of clusters, %d, and silhouette coefficient is %0.2f'% (i,silhouette_avg))

The number of clusters, 2, and silhouette coefficient is 0.28
The number of clusters, 3, and silhouette coefficient is 0.31
The number of clusters, 4, and silhouette coefficient is 0.32
The number of clusters, 5, and silhouette coefficient is 0.18
The number of clusters, 6, and silhouette coefficient is 0.17
The number of clusters, 7, and silhouette coefficient is 0.18
The number of clusters, 8, and silhouette coefficient is 0.18
The number of clusters, 9, and silhouette coefficient is 0.18
The number of clusters, 10, and silhouette coefficient is 0.17


## Affinity Propagation

In [24]:
# Affinity propagation with default parameters
affPro = AffinityPropagation()
affPro.fit_predict(train_cust)
centers = affPro.cluster_centers_indices_
labels = affPro.labels_
print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

Silhouette Coefficient: 0.162


#### Tuning

In [21]:
#attempt 1
damp = [0.5, 0.75, .99]
conv = [7, 15, 23]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_cust)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

damping =  0.5 convergence_iter =  7
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  15
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: 0.162
damping =  0.75 convergence_iter =  7
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.99 convergence_iter =  7


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [9]:
#attempt 2
damp = [.95, 0.75, 0.5]
conv = [7, 15, 23]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_cust)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

damping =  0.95 convergence_iter =  7
Silhouette Coefficient: 0.166
damping =  0.95 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.95 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  7
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.5 convergence_iter =  7
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  15
Silhouette Coefficient: 0.162
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: 0.162


In [10]:
#attempt 3
damp = [0.95, 0.85, 0.75]
conv = [15, 23, 40]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_cust)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

damping =  0.95 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.95 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.95 convergence_iter =  40
Silhouette Coefficient: 0.167
damping =  0.85 convergence_iter =  15
Silhouette Coefficient: 0.166
damping =  0.85 convergence_iter =  23
Silhouette Coefficient: 0.166
damping =  0.85 convergence_iter =  40
Silhouette Coefficient: 0.166
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.167
damping =  0.75 convergence_iter =  40
Silhouette Coefficient: 0.167


## Spectral Clustering

In [25]:
# Spectral clustering
for k in range(2,11):
    spect = SpectralClustering(n_clusters=k,random_state=11)
    spect.fit_predict(train_cust)
    spectlabel = spect.labels_
    score= silhouette_score(train_cust, spectlabel, metric='euclidean')
    print("The number of clusters: %d, silhouette coefficient: %0.2f" % (k,score))

The number of clusters, 2, and silhouette coefficient is 0.09
The number of clusters, 3, and silhouette coefficient is -0.06
The number of clusters, 4, and silhouette coefficient is -0.06
The number of clusters, 5, and silhouette coefficient is -0.07
The number of clusters, 6, and silhouette coefficient is -0.09
The number of clusters, 7, and silhouette coefficient is -0.09
The number of clusters, 8, and silhouette coefficient is -0.10
The number of clusters, 9, and silhouette coefficient is -0.10
The number of clusters, 10, and silhouette coefficient is -0.11


#### Tuning

In [11]:
gam = [0.5, 1.0, 2]

for g in gam:
    for k in range(2,11):
        spect = SpectralClustering(n_clusters=k, random_state=11, n_jobs =-1, gamma= g)
        spect.fit_predict(train_cust)
        spectlabel = spect.labels_
        score= silhouette_score(train_cust, spectlabel, metric='euclidean')
        print("The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f" % (k,g,score))

The number of clusters: 2, gamma: 0.500000, silhouette coefficient: 0.13
The number of clusters: 3, gamma: 0.500000, silhouette coefficient: 0.14
The number of clusters: 4, gamma: 0.500000, silhouette coefficient: -0.05
The number of clusters: 5, gamma: 0.500000, silhouette coefficient: -0.04
The number of clusters: 6, gamma: 0.500000, silhouette coefficient: -0.07
The number of clusters: 7, gamma: 0.500000, silhouette coefficient: -0.06
The number of clusters: 8, gamma: 0.500000, silhouette coefficient: -0.06
The number of clusters: 9, gamma: 0.500000, silhouette coefficient: -0.08
The number of clusters: 10, gamma: 0.500000, silhouette coefficient: -0.08
The number of clusters: 2, gamma: 1.000000, silhouette coefficient: 0.09
The number of clusters: 3, gamma: 1.000000, silhouette coefficient: -0.06
The number of clusters: 4, gamma: 1.000000, silhouette coefficient: -0.06
The number of clusters: 5, gamma: 1.000000, silhouette coefficient: -0.07
The number of clusters: 6, gamma: 1.0000

KeyboardInterrupt: 

## Agglomerative

In [12]:
# Agglomerative Clustering with 2-10 clusters 
for k in range(2,11):
    agg = AgglomerativeClustering(n_clusters=k)
    agg.fit_predict(train_cust)
    labels = agg.labels_
    score= silhouette_score(train_cust, labels, metric='euclidean')
    print("The number of clusters: %d, silhouette coefficient: %0.2f" % (k,score))

The number of clusters: 2, silhouette coefficient: 0.26
The number of clusters: 3, silhouette coefficient: 0.30
The number of clusters: 4, silhouette coefficient: 0.30
The number of clusters: 5, silhouette coefficient: 0.17
The number of clusters: 6, silhouette coefficient: 0.19
The number of clusters: 7, silhouette coefficient: 0.19
The number of clusters: 8, silhouette coefficient: 0.17
The number of clusters: 9, silhouette coefficient: 0.17
The number of clusters: 10, silhouette coefficient: 0.16


## DBScan

In [13]:
db = DBSCAN()
db.fit_predict(train_cust)
labels = db.labels_
score= silhouette_score(train_cust, labels, metric='euclidean')
print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

Silhouette Coefficient: -0.398


#### Tuning

In [18]:
ep = [0.25, 0.5, 0.75]
minsamp = [3, 5, 7]
leaf = [15, 30, 45]

for e in ep:
    for m in minsamp:
        for l in leaf:
            db = DBSCAN(eps=e, min_samples=m, leaf_size=l, n_jobs=-1)
            db.fit_predict(train_cust)
            labels = db.labels_
            print("eps = ", e, "min_samples = ", m, "leaf_size = ",l)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

eps =  0.25 min_samples =  3 leaf_size =  15
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  3 leaf_size =  30
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  3 leaf_size =  45
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  5 leaf_size =  15
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  5 leaf_size =  30
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  5 leaf_size =  45
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  7 leaf_size =  15


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [25]:
ep = [0.25, 0.5, 0.75]
minsamp = [6, 5, 4, 3]
leaf = [15, 30, 45]

for e in ep:
    for m in minsamp:
        for l in leaf:
            db = DBSCAN(eps=e, min_samples=m, leaf_size=l, n_jobs=-1)
            db.fit_predict(train_cust)
            labels = db.labels_
            print("eps = ", e, "min_samples = ", m, "leaf_size = ",l)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

eps =  0.25 min_samples =  6 leaf_size =  15
Silhouette Coefficient: -0.244
eps =  0.25 min_samples =  6 leaf_size =  30
Silhouette Coefficient: -0.244
eps =  0.25 min_samples =  6 leaf_size =  45
Silhouette Coefficient: -0.244
eps =  0.25 min_samples =  5 leaf_size =  15
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  5 leaf_size =  30
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  5 leaf_size =  45
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  4 leaf_size =  15
Silhouette Coefficient: -0.437
eps =  0.25 min_samples =  4 leaf_size =  30
Silhouette Coefficient: -0.437
eps =  0.25 min_samples =  4 leaf_size =  45
Silhouette Coefficient: -0.437
eps =  0.25 min_samples =  3 leaf_size =  15
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  3 leaf_size =  30
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  3 leaf_size =  45
Silhouette Coefficient: -0.471
eps =  0.5 min_samples =  6 leaf_size =  15
Silhouette Coefficient: -0.370
eps =  0.5 mi

In [28]:
ep = [0.25, 0.5, 0.75, 1, 1.5, 2, 2.5, 3, 3.5, 3.75, 4, 5]
minsamp = [3, 4, 5, 6]

for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        db.fit_predict(train_cust)
        labels = db.labels_
        print("eps = ", e, "min_samples = ", m)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

eps =  0.25 min_samples =  3
Silhouette Coefficient: -0.471
eps =  0.25 min_samples =  4
Silhouette Coefficient: -0.437
eps =  0.25 min_samples =  5
Silhouette Coefficient: -0.279
eps =  0.25 min_samples =  6
Silhouette Coefficient: -0.244
eps =  0.5 min_samples =  3
Silhouette Coefficient: -0.458
eps =  0.5 min_samples =  4
Silhouette Coefficient: -0.416
eps =  0.5 min_samples =  5
Silhouette Coefficient: -0.398
eps =  0.5 min_samples =  6
Silhouette Coefficient: -0.370
eps =  0.75 min_samples =  3
Silhouette Coefficient: -0.235
eps =  0.75 min_samples =  4
Silhouette Coefficient: -0.206
eps =  0.75 min_samples =  5
Silhouette Coefficient: -0.168
eps =  0.75 min_samples =  6
Silhouette Coefficient: -0.108
eps =  1 min_samples =  3
Silhouette Coefficient: -0.092
eps =  1 min_samples =  4
Silhouette Coefficient: -0.122
eps =  1 min_samples =  5
Silhouette Coefficient: -0.107
eps =  1 min_samples =  6
Silhouette Coefficient: -0.167
eps =  1.5 min_samples =  3
Silhouette Coefficient: 0.13

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [32]:
ep = [3.5,3.6,3.7, 3.75, 3.8,3.9, 4]
minsamp = [3, 4, 5, 6]
for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        db.fit_predict(train_cust)
        labels = db.labels_
        print("eps = ", e, "min_samp=", m)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_cust, labels, metric='euclidean'))

eps =  3.5 min_samp= 3
Silhouette Coefficient: 0.351
eps =  3.5 min_samp= 4
Silhouette Coefficient: 0.351
eps =  3.5 min_samp= 5
Silhouette Coefficient: 0.351
eps =  3.5 min_samp= 6
Silhouette Coefficient: 0.351
eps =  3.6 min_samp= 3
Silhouette Coefficient: 0.352
eps =  3.6 min_samp= 4
Silhouette Coefficient: 0.352
eps =  3.6 min_samp= 5
Silhouette Coefficient: 0.352
eps =  3.6 min_samp= 6
Silhouette Coefficient: 0.352
eps =  3.7 min_samp= 3
Silhouette Coefficient: 0.352
eps =  3.7 min_samp= 4
Silhouette Coefficient: 0.352
eps =  3.7 min_samp= 5
Silhouette Coefficient: 0.352
eps =  3.7 min_samp= 6
Silhouette Coefficient: 0.352
eps =  3.75 min_samp= 3
Silhouette Coefficient: 0.352
eps =  3.75 min_samp= 4
Silhouette Coefficient: 0.352
eps =  3.75 min_samp= 5
Silhouette Coefficient: 0.352
eps =  3.75 min_samp= 6
Silhouette Coefficient: 0.352
eps =  3.8 min_samp= 3
Silhouette Coefficient: 0.352
eps =  3.8 min_samp= 4
Silhouette Coefficient: 0.352
eps =  3.8 min_samp= 5
Silhouette Coeffici

# Officer Involved Shootings

In [33]:
#create training and testing groups of Shootings data
train_shoot, test_shoot= train_test_split(shoot_scaled, test_size = 0.2, random_state = 11)

## Kmeans 

In [34]:
#KMeans Clustering of Custody data
k_range = range(2,11)

for i in k_range:
    model = KMeans(n_clusters=i,random_state=11)
    pred =model.fit_predict(train_shoot)
    silhouette_avg = silhouette_score(train_shoot,pred)
    print('The number of clusters, %d, and silhouette coefficient is %0.2f'% (i,silhouette_avg))

The number of clusters, 2, and silhouette coefficient is 0.15
The number of clusters, 3, and silhouette coefficient is 0.12
The number of clusters, 4, and silhouette coefficient is 0.13
The number of clusters, 5, and silhouette coefficient is 0.13
The number of clusters, 6, and silhouette coefficient is 0.13
The number of clusters, 7, and silhouette coefficient is 0.13
The number of clusters, 8, and silhouette coefficient is 0.13
The number of clusters, 9, and silhouette coefficient is 0.12
The number of clusters, 10, and silhouette coefficient is 0.12


## Affinity Propagation

In [35]:
# Affinity propagation with default parameters
affPro = AffinityPropagation()
affPro.fit_predict(train_shoot)
centers = affPro.cluster_centers_indices_
labels = affPro.labels_
print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

Silhouette Coefficient: 0.157


#### Tuning 

In [37]:
damp = [0.5, 0.75, 0.95]
conv = [7, 15, 23]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_shoot)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

damping =  0.5 convergence_iter =  7
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  15
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: 0.157
damping =  0.75 convergence_iter =  7
Silhouette Coefficient: 0.142
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.158
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.158
damping =  0.95 convergence_iter =  7


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [45]:
damp = [0.5, 0.75, 0.95]
conv = [7, 15, 23, 50, 75]

for d in damp:
    for c in conv:
        affPro = AffinityPropagation(damping= d, convergence_iter= c)
        affPro.fit_predict(train_shoot)
        centers = affPro.cluster_centers_indices_ 
        labels = affPro.labels_
        print("damping = ", d, "convergence_iter = ",c)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

damping =  0.5 convergence_iter =  7
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  15
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  23
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  50
Silhouette Coefficient: 0.157
damping =  0.5 convergence_iter =  75
Silhouette Coefficient: 0.157
damping =  0.75 convergence_iter =  7
Silhouette Coefficient: 0.142
damping =  0.75 convergence_iter =  15
Silhouette Coefficient: 0.158
damping =  0.75 convergence_iter =  23
Silhouette Coefficient: 0.158
damping =  0.75 convergence_iter =  50
Silhouette Coefficient: 0.158
damping =  0.75 convergence_iter =  75
Silhouette Coefficient: 0.158
damping =  0.95 convergence_iter =  7


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

## Spectral Clustering

In [46]:
# Spectral clustering of Shooting data
for k in range(2,11):
    spect = SpectralClustering(n_clusters=k,random_state=11)
    spect.fit_predict(train_shoot)
    spectlabel = spect.labels_
    score=silhouette_score(train_shoot, spectlabel, metric='euclidean')
    print("The number of clusters, %d, and silhouette coefficient is %0.2f" % (k,score))

The number of clusters, 2, and silhouette coefficient is -0.02
The number of clusters, 3, and silhouette coefficient is -0.02
The number of clusters, 4, and silhouette coefficient is -0.05
The number of clusters, 5, and silhouette coefficient is -0.06
The number of clusters, 6, and silhouette coefficient is -0.08
The number of clusters, 7, and silhouette coefficient is -0.09
The number of clusters, 8, and silhouette coefficient is -0.09
The number of clusters, 9, and silhouette coefficient is -0.09
The number of clusters, 10, and silhouette coefficient is -0.10


#### Tuning 

In [47]:
gam = [0.5, 1.0, 1.5]

for g in gam:
    for k in range(2,11):
        spect = SpectralClustering(n_clusters=k, random_state=11, n_jobs =-1, gamma= g)
        spect.fit_predict(train_shoot)
        spectlabel = spect.labels_
        score= silhouette_score(train_shoot, spectlabel, metric='euclidean')
        print("The number of clusters: %d, gamma: %f, silhouette coefficient: %0.2f" % (k,g,score))

The number of clusters: 2, gamma: 0.500000, silhouette coefficient: 0.00
The number of clusters: 3, gamma: 0.500000, silhouette coefficient: 0.00
The number of clusters: 4, gamma: 0.500000, silhouette coefficient: -0.02
The number of clusters: 5, gamma: 0.500000, silhouette coefficient: -0.03
The number of clusters: 6, gamma: 0.500000, silhouette coefficient: -0.06
The number of clusters: 7, gamma: 0.500000, silhouette coefficient: -0.07
The number of clusters: 8, gamma: 0.500000, silhouette coefficient: -0.07
The number of clusters: 9, gamma: 0.500000, silhouette coefficient: -0.06
The number of clusters: 10, gamma: 0.500000, silhouette coefficient: -0.06
The number of clusters: 2, gamma: 1.000000, silhouette coefficient: -0.02
The number of clusters: 3, gamma: 1.000000, silhouette coefficient: -0.02
The number of clusters: 4, gamma: 1.000000, silhouette coefficient: -0.05
The number of clusters: 5, gamma: 1.000000, silhouette coefficient: -0.06
The number of clusters: 6, gamma: 1.000

## Agglomerative Clustering

In [48]:
# Agglomerative Clustering with 2-10 clusters 
for k in range(2,11):
    agg = AgglomerativeClustering(n_clusters=k)
    agg.fit_predict(train_shoot)
    labels = agg.labels_
    score= silhouette_score(train_shoot, labels, metric='euclidean')
    print("The number of clusters: %d, silhouette coefficient: %0.2f" % (k,score))

The number of clusters: 2, silhouette coefficient: 0.09
The number of clusters: 3, silhouette coefficient: 0.09
The number of clusters: 4, silhouette coefficient: 0.10
The number of clusters: 5, silhouette coefficient: 0.11
The number of clusters: 6, silhouette coefficient: 0.11
The number of clusters: 7, silhouette coefficient: 0.11
The number of clusters: 8, silhouette coefficient: 0.11
The number of clusters: 9, silhouette coefficient: 0.11
The number of clusters: 10, silhouette coefficient: 0.10


## DBScan

#### Tuning

In [53]:
ep = [0.25, 0.5, 0.75]
minsamp = [3, 5, 7]
leaf = [15, 30, 45]

for e in ep:
    for m in minsamp:
        for l in leaf:
            db = DBSCAN(eps=e, min_samples=m, leaf_size=l, n_jobs=-1)
            db.fit_predict(train_shoot)
            labels = bdb.labels_
            print("eps = ", e, "min_samples = ", m, "leaf_size = ",l)
            print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

eps =  0.25 min_samples =  3 leaf_size =  15
Silhouette Coefficient: -0.143
eps =  0.25 min_samples =  3 leaf_size =  30
Silhouette Coefficient: -0.143
eps =  0.25 min_samples =  3 leaf_size =  45
Silhouette Coefficient: -0.143
eps =  0.25 min_samples =  5 leaf_size =  15


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [62]:
ep = [0.25, 0.5, 0.75, 1, 2, 3, 4, 5, 6, 7, 8]
minsamp = [2, 3, 4]

for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        db.fit_predict(train_shoot)
        labels = db.labels_
        print("eps = ", e, "min_samples = ", m)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

eps =  0.25 min_samples =  2
Silhouette Coefficient: -0.126
eps =  0.25 min_samples =  3
Silhouette Coefficient: -0.143
eps =  0.25 min_samples =  4
Silhouette Coefficient: -0.083
eps =  0.5 min_samples =  2
Silhouette Coefficient: -0.164
eps =  0.5 min_samples =  3
Silhouette Coefficient: -0.143
eps =  0.5 min_samples =  4
Silhouette Coefficient: -0.083
eps =  0.75 min_samples =  2
Silhouette Coefficient: -0.194
eps =  0.75 min_samples =  3
Silhouette Coefficient: -0.162
eps =  0.75 min_samples =  4
Silhouette Coefficient: -0.143
eps =  1 min_samples =  2
Silhouette Coefficient: -0.188
eps =  1 min_samples =  3
Silhouette Coefficient: -0.191
eps =  1 min_samples =  4
Silhouette Coefficient: -0.155
eps =  2 min_samples =  2
Silhouette Coefficient: -0.096
eps =  2 min_samples =  3
Silhouette Coefficient: -0.099
eps =  2 min_samples =  4
Silhouette Coefficient: -0.041
eps =  3 min_samples =  2
Silhouette Coefficient: 0.421
eps =  3 min_samples =  3
Silhouette Coefficient: 0.421
eps =  3 

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [63]:
ep = [0.25, 0.5, 0.75, 1, 2, 3, 4]
minsamp = [2, 3, 4]

for e in ep:
    for m in minsamp:
        db = DBSCAN(eps=e, min_samples=m, n_jobs=-1)
        db.fit_predict(train_shoot)
        labels = db.labels_
        print("eps = ", e, "min_samples = ", m)
        print("Silhouette Coefficient: %0.3f"% silhouette_score(train_shoot, labels, metric='euclidean'))

eps =  0.25 min_samples =  2
Silhouette Coefficient: -0.126
eps =  0.25 min_samples =  3
Silhouette Coefficient: -0.143
eps =  0.25 min_samples =  4
Silhouette Coefficient: -0.083
eps =  0.5 min_samples =  2
Silhouette Coefficient: -0.164
eps =  0.5 min_samples =  3
Silhouette Coefficient: -0.143
eps =  0.5 min_samples =  4
Silhouette Coefficient: -0.083
eps =  0.75 min_samples =  2
Silhouette Coefficient: -0.194
eps =  0.75 min_samples =  3
Silhouette Coefficient: -0.162
eps =  0.75 min_samples =  4
Silhouette Coefficient: -0.143
eps =  1 min_samples =  2
Silhouette Coefficient: -0.188
eps =  1 min_samples =  3
Silhouette Coefficient: -0.191
eps =  1 min_samples =  4
Silhouette Coefficient: -0.155
eps =  2 min_samples =  2
Silhouette Coefficient: -0.096
eps =  2 min_samples =  3
Silhouette Coefficient: -0.099
eps =  2 min_samples =  4
Silhouette Coefficient: -0.041
eps =  3 min_samples =  2
Silhouette Coefficient: 0.421
eps =  3 min_samples =  3
Silhouette Coefficient: 0.421
eps =  3 