## Trialling clustering algorithms (other than k-means)

In [2]:
# Import all libraries and packages
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from numpy.random import seed
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
import warnings
from sklearn.metrics import homogeneity_score
%matplotlib inline
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the principal-component scores that were exported from R.
# NOTE(review): the preview below lists an "Unnamed: 0" column — confirm the
# exported row index is not being picked up as a clustering feature.
pca = pd.read_csv(filepath_or_buffer='pca_last.csv')

Unnamed: 0,PC1,PC2,PC3,PC4
0,0.553086,0.326666,0.574046,-1.784805
1,1.630674,-0.650065,1.823799,-2.259643
2,0.954668,0.500511,1.714916,-2.23247
3,1.232923,1.12438,1.983232,-1.713557
4,-2.799251,0.054982,1.039514,-0.052945


### 1. Agglomerative clustering

In [12]:
# agglomerative clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot

In [80]:
# Ward-linkage agglomerative clustering into 4 clusters.
# The distance argument is deliberately omitted: Ward linkage only supports
# Euclidean distance, which is also the default. The old `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), so
# leaving it out keeps this cell working on both old and new versions.
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')  # 'ward' is kept to match the linkage used for sch.dendrogram (per the original note).
y_hc = hc.fit_predict(pca)

# Cluster 1: row positions that received label 0 (shown as the cell output)
np.where(y_hc == 0)

(array([   0,    1,    2, ..., 7875, 7880, 7881]),)

In [81]:
# Silhouette coefficient (Euclidean distance) for the agglomerative labels
score = silhouette_score(X=pca, labels=y_hc, metric='euclidean')

In [15]:
score

0.260783091005266

In [82]:
# Calinski-Harabasz index for the agglomerative labels (shown as the cell output)
chs = calinski_harabasz_score(X=pca, labels=y_hc)
chs

2681.5627202291525

### 2. BIRCH

In [57]:
from sklearn.cluster import Birch

# Build the BIRCH estimator (threshold 0.01, 4 final clusters)
model = Birch(n_clusters=4, threshold=0.01)

# Fit on the PCA frame and keep the per-row cluster labels
t = model.fit_predict(X=pca)

# Cluster 1: row positions that received label 0 (shown as the cell output)
np.nonzero(t == 0)

(array([   0,    1,    2, ..., 7874, 7875, 7881]),)

In [58]:
# Silhouette coefficient (Euclidean distance) for the BIRCH labels
score = silhouette_score(X=pca, labels=t, metric="euclidean")

In [59]:
score

0.2593552218547294

In [83]:
# Calinski-Harabasz index for the BIRCH labels (shown as the cell output)
chs = calinski_harabasz_score(X=pca, labels=t)
chs

2677.4207703484803

### 3. DBSCAN

In [61]:
# Trial 4 
# dbscan clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from matplotlib import pyplot

# Build the DBSCAN estimator (eps 0.30, at least 9 samples per neighbourhood)
model = DBSCAN(min_samples=9, eps=0.30)

# Fit on the PCA frame and keep the per-row labels
k = model.fit_predict(X=pca)

In [62]:
# Silhouette coefficient (Euclidean distance) for the DBSCAN labels.
# NOTE(review): DBSCAN marks noise points with label -1; they are scored here
# as if they formed one more cluster — confirm that is intended.
score = silhouette_score(X=pca, labels=k, metric="euclidean")

In [63]:
score

-0.38110368000600997

In [64]:
# Calinski-Harabasz index for the DBSCAN labels (shown as the cell output)
chs = calinski_harabasz_score(X=pca, labels=k)
chs

128.412880440353

### 4. MiniBatchKMeans

In [65]:
from sklearn.cluster import MiniBatchKMeans

# define the model
# MiniBatchKMeans picks its initial centroids and its mini-batches at random,
# so without a fixed seed the labels (and every score computed from them)
# change from run to run. random_state pins them so that a fresh
# Restart & Run All reproduces the same clustering.
model = MiniBatchKMeans(n_clusters=4, random_state=42)

# fit the model and keep the per-row cluster labels
e = model.fit_predict(pca)

In [66]:
# Silhouette coefficient (Euclidean distance) for the MiniBatchKMeans labels
score = silhouette_score(X=pca, labels=e, metric="euclidean")

In [75]:
score

0.2992276048072439

In [68]:
# Calinski-Harabasz index for the MiniBatchKMeans labels (shown as the cell output)
chs = calinski_harabasz_score(X=pca, labels=e)
chs

3212.868391565942

### 5. Spectral clustering

In [31]:
from sklearn.cluster import SpectralClustering

In [76]:
# define the model
# SpectralClustering assigns labels with a randomly initialised k-means step,
# so the result is not repeatable unless the seed is fixed. random_state pins
# it so that a fresh Restart & Run All reproduces the same labels and scores.
model = SpectralClustering(n_clusters=4, random_state=42)

# fit model and predict clusters
b = model.fit_predict(pca)

In [77]:
# Silhouette coefficient (Euclidean distance) for the spectral labels
score = silhouette_score(X=pca, labels=b, metric="euclidean")

In [78]:
score

0.3135524110115014

Spectral clustering obtains the highest silhouette score of all the algorithms, so it is explored further.

In [79]:
# Calinski-Harabasz index for the spectral labels (shown as the cell output)
chs = calinski_harabasz_score(X=pca, labels=b)
chs

1378.4389885633766

In [35]:
# append labels 
pca['Cluster'] = pd.Series(b, index=pca.index)

In [36]:
# get value counts to see how many people are in each cluster
pca['Cluster'].value_counts()

0    7811
2      39
1      34
3       2
Name: Cluster, dtype: int64

The algorithm is disregarded because individuals are spread very unevenly across the clusters (7,811 / 39 / 34 / 2).