In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx

In [157]:
# Read in data
data = pd.read_csv('../data/features/merged.csv')

In [158]:
# Subset on a year to reduce ovrelap/muddiness
data = data[data['YEAR'] == 2018]

In [159]:
# We don't want to use beat and year as features
# We also don't want district or sector
training_data = data.drop(['BEAT','YEAR','ISR_DISTRICT','ISR_SECTOR', 'UOF_DISTRICT'], axis=1)

In [160]:
from sklearn.preprocessing import StandardScaler
# Normalize columns
# This puts all features at same importance. We may want type or shootings to be more important somehow.
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(training_data), columns=training_data.columns)

In [161]:
from sklearn.decomposition import PCA
# Do PCA
pca = PCA()
pca.fit(data_scaled)
data_pc = pd.DataFrame(pca.transform(data_scaled))

In [191]:
import datetime
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
def grid_search_clustering(model, param_grid, metric, data):
    results = pd.DataFrame(columns=["Model", "Params","Score","VRScore","DBScore","SilScore","Time", "Labels"])
    for params in param_grid: 
        # Train the clustering model
        start = datetime.datetime.now()
        print("Training model with:", params)
        model.set_params(**params)
        labels = model.fit_predict(data)
        stop = datetime.datetime.now()
        print("Training Time Elapsed:", stop - start)
        # Compute user-specified score of the clustering quality
        if type(metric) == str:
            score = getattr(model, metric)
        else:
            score = np.nan
        # Compute common cluster quality scores
        try:
            vr_score = calinski_harabasz_score(data, labels)
        except:
            # returns one cluster
            vr_score = np.nan
        try:
            sil_score = silhouette_score(data, labels)
        except:
            # returns one cluster
            sil_score = np.nan
        try:
            db_score = davies_bouldin_score(data, labels)
        except:
            # returns one cluster
            db_score = np.nan
        # Save results
        results = results.append({ \
                        "Model": type(model).__name__, \
                        "Params":str(params), \
                        "Score": score, \
                        "VRScore": vr_score, \
                        "DBScore": db_score, \
                        "SilScore": sil_score, \
                        "Time": stop-start, \
                        "Labels": labels}, \
                        ignore_index=True)

    print("Grid search completed.")
    return results

In [192]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import ParameterGrid

params_km = {'n_clusters':[2,3,4,5,7,9]}
km_result = grid_search_clustering(KMeans(), ParameterGrid(params_km), 'inertia_', data_scaled)        

params_hagg = {'n_clusters':[2,3,4,5,7,9], 'affinity':['euclidean','cosine'], 'linkage':['complete','average','single']}
hagg_result = grid_search_clustering(AgglomerativeClustering(), ParameterGrid(params_hagg), None, data_scaled)        

params_dbscan = {'eps':[.01,.1,.2,.5,.9], 'min_samples':[2, 5, 10, 100], 'metric':['euclidean','cosine']}
dbscan_result = grid_search_clustering(DBSCAN(), ParameterGrid(params_dbscan), None, data_scaled)        

params_gauss = {'n_components':[2,3,4,5,7,9], 'n_init':[1,10]}
gauss_result = grid_search_clustering(GaussianMixture(), ParameterGrid(params_gauss), 'lower_bound_', data_scaled)        

params_spectral = {'n_clusters':[2,3,4,5,7,9], 'affinity':['rbf','nearest_neighbors'], 'n_neighbors':[2, 5, 10, 100]}
spectral_result = grid_search_clustering(SpectralClustering(), ParameterGrid(params_spectral), None, data_scaled)        

results = pd.concat([km_result, hagg_result, dbscan_result, gauss_result, spectral_result])

Training model with: {'n_clusters': 2}
Training Time Elapsed: 0:00:00.031085
Training model with: {'n_clusters': 3}
Training Time Elapsed: 0:00:00.037700
Training model with: {'n_clusters': 4}
Training Time Elapsed: 0:00:00.040435
Training model with: {'n_clusters': 5}
Training Time Elapsed: 0:00:00.038790
Training model with: {'n_clusters': 7}
Training Time Elapsed: 0:00:00.054938
Training model with: {'n_clusters': 9}
Training Time Elapsed: 0:00:00.066297
Grid search completed.
Training model with: {'affinity': 'euclidean', 'linkage': 'complete', 'n_clusters': 2}
Training Time Elapsed: 0:00:00.008048
Training model with: {'affinity': 'euclidean', 'linkage': 'complete', 'n_clusters': 3}
Training Time Elapsed: 0:00:00.006639
Training model with: {'affinity': 'euclidean', 'linkage': 'complete', 'n_clusters': 4}
Training Time Elapsed: 0:00:00.007528
Training model with: {'affinity': 'euclidean', 'linkage': 'complete', 'n_clusters': 5}
Training Time Elapsed: 0:00:00.007146
Training model 

In [210]:
# Apply learned labels to data frame
best_labels_km = km_result.sort_values(by='Score', ascending=False).head(1)['Labels'].to_numpy()[0]
best_labels_gauss = gauss_result.sort_values(by='Score', ascending=False).head(1)['Labels'].to_numpy()[0]
best_labels_dbscore = results.sort_values(by='DBScore', ascending=True).head(1)['Labels'].to_numpy()[0]
best_labels_vrscore = results.sort_values(by='VRScore', ascending=True).head(1)['Labels'].to_numpy()[0]
best_labels_silscore = results.sort_values(by='SilScore', ascending=False).head(1)['Labels'].to_numpy()[0]
clustered = pd.concat([data, \
                        pd.Series(best_labels_km, name='KMeans'), \
                        pd.Series(best_labels_gauss, name='Gauss'), \
                        pd.Series(best_labels_dbscore, name='Davies'), \
                        pd.Series(best_labels_vrscore, name='VarRatio'), \
                        pd.Series(best_labels_silscore, name='Silo'), \
                        ], axis=1)

In [225]:
# Hack for importing sibling modules
### SNIPPET ATTRIBUTION: https://izziswift.com/import-local-function-from-a-module-housed-in-another-directory-with-relative-imports-in-jupyter-notebook-using-python-3/
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
### END SNIPPET 

In [226]:
from util.cat_reshaper import listcolumn_pivot_longer

pie_plt_data = pd.concat([clustered['BEAT'], pd.get_dummies(clustered['KMeans'], prefix='Cluster')], axis=1).groupby(by='BEAT').agg('sum')
pie_plt_data.columns

tmp = listcolumn_pivot_longer(pie_plt_data, 'Cluster', 'Cluster_')

In [227]:
tmp

Unnamed: 0_level_0,Cluster
BEAT,Unnamed: 1_level_1
111.0,[0.0]
112.0,[0.0]
113.0,"[0.0, 1.0]"
114.0,[0.0]
121.0,[0.0]
...,...
2531.0,[]
2532.0,[]
2533.0,[]
2534.0,[]
