# Clustering Comparison Analysis

We run a clustering model on some subset of the data and measure the similarity of the produced clusterings.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gp
%matplotlib inline

In [2]:
# Load the merged features CSV into `data`; later cells derive everything from this frame.
MERGED_FEATURES_CSV = '../data/features/merged.csv'
data = pd.read_csv(MERGED_FEATURES_CSV)

## Create training data

In [None]:
# All filtering from this point is done on `training_data`, an independent copy.
# The original `data` frame is deliberately left untouched so that any columns
# removed during filtering can still be recovered from it later.
training_data = data.copy(deep=True)

### ... by filtering columns

In [3]:
# [ REPLACE THIS CODE TO DROP COLUMNS IF NECESSARY ]
# Placeholder cell: while the two lines below stay commented out, this cell does
# nothing and training_data keeps all of its columns.
# Uncommenting them removes every column named in demo_columns from training_data.

# demo_columns = ['COMPLAINTS_BLACK','COMPLAINTS_HISPANIC','COMPLAINTS_WHITE','ISR_BLACK','ISR_WHITE','ISR_HISPANIC','UOF_HISPANIC','UOF_BLACK','UOF_WHITE','CENSUS_WHITE','CENSUS_BLACK','CENSUS_HISPANIC','CENSUS_MEDIAN INCOME', 'CENSUS_TOTAL POP']
# training_data = training_data.drop(demo_columns, axis=1)

### .. by aggregating rows

For model selection, since we're eventually clustering by beat, we'll aggregate by summing all years together.


In [4]:
# [ REPLACE THIS CODE TO DROP ROWS IF NECESSARY ]
# Placeholder cell: the aggregation below is commented out, so this cell currently
# does nothing and training_data is not aggregated.
# When enabled, the line drops the YEAR column, sums the remaining columns within
# each BEAT group, and then -- because of reset_index(drop=True) -- discards the
# BEAT labels, leaving a default integer index.

# training_data = training_data.drop('YEAR',axis=1).groupby(by='BEAT').agg(np.sum).reset_index(drop=True)

## Preprocess / Transform Data

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the training data and rescale every column with it.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(training_data)

# fit_transform hands back a plain array, so wrap it in a DataFrame again and
# re-attach the original column names (the row index is a fresh default one).
scaled_training_data = pd.DataFrame(scaled_values, columns=training_data.columns)

In [6]:
# Project the scaled data onto its leading principal components.
from util_clustering import generate_pca_data

# Number of components to keep; see Model Selection.ipynb for why we chose this number.
N_TOP_PCA_COMPONENTS = 6

# generate_pca_data is a project helper; it is unpacked into two values here --
# presumably the PCA-transformed data and the fitted PCA object (confirm in util_clustering).
pca_outputs = generate_pca_data(scaled_training_data, N_TOP_PCA_COMPONENTS)
pca_training_data, pca = pca_outputs

## Setup Best Model & Hyperparameters

In [7]:
from sklearn.cluster import KMeans

PARAM_N_CLUSTERS = 7 # See Model Selection.ipynb for why we chose this number.

# KMeans picks its initial centroids at random. Without a fixed seed the cluster
# assignments (and the arbitrary label numbers) change on every Restart & Run All,
# which makes any comparison between clusterings irreproducible.
PARAM_RANDOM_STATE = 0

# Hyperparameters are kept in a dict so the same settings can be re-applied to
# another estimator via set_params(**cluster_model_params).
cluster_model_params = {
    'n_clusters': PARAM_N_CLUSTERS,
    'random_state': PARAM_RANDOM_STATE,
}
cluster_model = KMeans()
# set_params returns the estimator itself, so leaving it as the last expression
# displays the configured model as the cell output.
cluster_model.set_params(**cluster_model_params)

KMeans(n_clusters=7)

## Run the model

In [9]:
# Fit the clustering model on the PCA-transformed data and get one label per row.
cluster_labels = cluster_model.fit_predict(pca_training_data)

# Build the label column on pca_training_data's own index. pd.concat(axis=1) aligns
# rows by index label, not by position, so a Series with a default RangeIndex would
# be misaligned (and introduce NaN rows) whenever pca_training_data carries any
# other index, e.g. after rows have been filtered out upstream.
cluster_column = pd.Series(cluster_labels, index=pca_training_data.index, name='Cluster')

# Final frame: the principal-component columns followed by the 'Cluster' label column.
clustered_data = pd.concat([pca_training_data, cluster_column], axis=1)

In [10]:
# Preview the first five rows: the component columns plus the assigned cluster label.
clustered_data.head(n=5)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,Cluster
0,2.154153,-0.556139,1.204775,-1.169982,2.916138,-0.345154,2
1,-0.712959,-0.397213,0.825376,-0.792141,2.122832,-0.394202,1
2,-0.662858,-0.397982,0.398048,0.343189,1.169731,-0.530594,1
3,-0.783367,-0.285063,0.855373,0.000778,1.045543,-0.769667,1
4,-2.824287,0.650939,0.038874,0.125945,0.573302,-0.183643,3
