# Choosing The Number of Clusters 

We compare the k-modes cost and the cluster centroids obtained with 5, 10, 25, and 50 clusters to decide which number of clusters is most appropriate for extracting the important features of crimes.

In [1]:
import pandas as pd 
import numpy as np 
import math
from kmodes.kmodes import KModes

In [2]:
%config IPCompleter.greedy=True


# Importing Data 
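The dataset is expected in the file `dataset.csv` inside the `crimesInChicagoData` folder, located three directory levels above this notebook (`../../../crimesInChicagoData/dataset.csv`).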

In [3]:
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` is its replacement: malformed rows are dropped and reported.
data = pd.read_csv("../../../crimesInChicagoData/dataset.csv", on_bad_lines = "warn")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour,Weekday
0,0,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,4,2004.0,0.0,0.0,1,1,0,3
1,1,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,9,2003.0,41.8172,-87.637328,3,1,0,5
2,2,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,False,14,2004.0,0.0,0.0,6,20,11,6
3,3,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,False,25,2004.0,0.0,0.0,12,30,20,3
4,4,841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,False,False,22,2003.0,41.6918,-87.635116,5,1,1,3


In [5]:
# Remove the 'Unnamed: 0' column (it looks like a row index saved into the CSV).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
def filterYear(year, df=None):
    """Return the rows whose 'Year' column equals ``year``.

    Parameters
    ----------
    year : value compared against the 'Year' column.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``data`` frame is
        used. (The previous version read a global named ``finalDf`` that is
        never defined in this notebook, so every call raised NameError.)

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    if df is None:
        df = data
    return df[df['Year'] == year]

In [7]:
def makeClusters(data, year, numClusters):
    """Run k-modes on the records of a single year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year' plus the 'IUCR', 'Primary Type', 'Latitude' and
        'Longitude' columns, which are excluded from the clustering features.
    year : value compared against the 'Year' column to select rows.
    numClusters : int
        Number of clusters passed to KModes.

    Returns
    -------
    tuple
        ``(labels, centroids)``: the output of ``fit_predict`` for the selected
        rows, and a DataFrame of ``cluster_centroids_`` whose columns carry the
        feature names.
    """
    model = KModes(n_clusters=numClusters, init = "Cao", n_init = 1, verbose=1)

    # Keep only the requested year, then remove the columns not used as features.
    excludedColumns = ['IUCR', 'Primary Type', 'Year', 'Latitude', 'Longitude']
    yearRows = data.loc[data['Year'] == year]
    features = yearRows.drop(excludedColumns, axis = 1)

    labels = model.fit_predict(features.values)

    # Label the centroid table with the feature names so it is readable.
    centroids = pd.DataFrame(model.cluster_centroids_, columns=features.columns)
    return labels, centroids

In [8]:
def labelsDf(year, num, originalData, clusterInfo):
    """Attach cluster labels to the rows of one year.

    Parameters
    ----------
    year : value compared against the 'Year' column to select rows.
    num : number of clusters; only used to name the new column
        ``'clusters_<num>'``.
    originalData : pandas.DataFrame
        Frame with a 'Year' column.
    clusterInfo : one-dimensional sequence or array of labels, one per
        selected row, in the same order as those rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows renumbered from 0, with the label column appended.

    Labels are matched to rows by position; if the lengths differ, pandas
    pads the shorter side with NaN rather than raising.
    """
    # drop=True discards the old index directly. The previous version turned it
    # into an 'index' / 'level_0' column and dropped those by name afterwards,
    # which failed when the index was named or a real 'index' column existed.
    yearRows = originalData.loc[originalData['Year'] == year].reset_index(drop=True)

    labels = pd.DataFrame(clusterInfo)
    labels.columns = ['clusters_' + str(num)]

    return pd.concat([yearRows, labels], axis=1).reset_index(drop=True)

In [9]:
# Work on a copy: a plain assignment only aliases `data`, so later edits to
# clusterData (such as the District cast) would silently change `data` too.
clusterData = data.copy()

In [10]:
# Store District as strings. Series.map(str) yields the same values as the
# former DataFrame.applymap(str), which is deprecated since pandas 2.1.
# NOTE(review): centroid tables show values such as "2.0", which suggests the
# column is float before this cast - confirm whether integer codes are wanted.
clusterData['District'] = clusterData['District'].map(str)

In [12]:
# k = 5 on the 2001 records; the result is the (labels, centroids) tuple.
clusterData2001_5clusters = makeClusters(data=clusterData, year=2001, numClusters=5)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 3247429.0


In [18]:
# Element 1 of the makeClusters result is the centroid table.
clusterData2001_5clusters[1]

Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5


In [25]:
# k = 10 on the 2001 records; the result is the (labels, centroids) tuple.
clusterData2001_10clusters = makeClusters(data=clusterData, year=2001, numClusters=10)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 3072983.0


In [26]:
# Element 1 of the makeClusters result is the centroid table.
clusterData2001_10clusters[1]

Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5
5,TO PROPERTY,OTHER,False,False,12.0,6,19,22,1
6,SIMPLE,RESIDENCE,False,False,6.0,3,26,22,0
7,OVER $500,STREET,False,False,11.0,2,19,15,0
8,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),False,False,7.0,5,6,14,6
9,TO VEHICLE,STREET,False,False,8.0,8,7,17,1


In [19]:
# k = 25 on the 2001 records; the result is the (labels, centroids) tuple.
clusterData2001_25clusters = makeClusters(data=clusterData, year=2001, numClusters=25)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 7051, cost: 2839372.0
Run 1, iteration: 2/100, moves: 0, cost: 2839372.0


In [20]:
# Element 1 of the makeClusters result is the centroid table.
clusterData2001_25clusters[1]

Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5
5,TO PROPERTY,OTHER,False,False,12.0,6,19,22,1
6,SIMPLE,RESIDENCE,False,False,6.0,3,26,22,0
7,OVER $500,STREET,False,False,11.0,2,19,15,0
8,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),False,False,7.0,5,6,14,6
9,TO VEHICLE,STREET,False,False,8.0,8,7,17,1


In [22]:
# k = 50 on the 2001 records; the result is the (labels, centroids) tuple.
clusterData2001_50clusters = makeClusters(data=clusterData, year=2001, numClusters=50)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 15776, cost: 2646485.0
Run 1, iteration: 2/100, moves: 0, cost: 2646485.0


In [23]:
# Element 1 of the makeClusters result is the centroid table.
clusterData2001_50clusters[1]

Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5
5,TO PROPERTY,OTHER,False,False,12.0,6,19,22,1
6,SIMPLE,RESIDENCE,False,False,6.0,3,26,22,0
7,OVER $500,STREET,False,False,11.0,2,19,15,0
8,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),False,False,7.0,5,6,14,6
9,TO VEHICLE,STREET,False,False,8.0,8,7,17,1


0