# K-Modes Clustering 

This notebook runs k-modes clustering separately for each year from 2001 to 2016, using the `kmodes` package: 
* https://github.com/nicodv/kmodes 

In [1]:
import pandas as pd 
import numpy as np 
import math
from kmodes.kmodes import KModes

In [2]:
%config IPCompleter.greedy=True 

# Importing Data 

The data is read from the 'dataset.csv' file in the crimesInChicagoData folder.

In [16]:
# Malformed rows are skipped with a warning instead of aborting the load.
# on_bad_lines="warn" is the replacement for error_bad_lines=False, which was
# deprecated in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
data = pd.read_csv("../../../crimesInChicagoData/dataset.csv", on_bad_lines="warn")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the raw frame; the 'Unnamed: 0' column visible here is dropped in a later cell.
data.head()

Unnamed: 0.1,Unnamed: 0,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour,Weekday
0,0,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,4,2004.0,0.0,0.0,1,1,0,3
1,1,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,9,2003.0,41.8172,-87.637328,3,1,0,5
2,2,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,False,14,2004.0,0.0,0.0,6,20,11,6
3,3,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,False,25,2004.0,0.0,0.0,12,30,20,3
4,4,841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,False,False,22,2003.0,41.6918,-87.635116,5,1,1,3


In [17]:
# In the preview, 'Unnamed: 0' simply mirrors the row index, so it is removed.
# errors='ignore' keeps this cell safe to re-run after the column is already gone
# (without it a second run raises KeyError).
data = data.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [18]:
# Confirm that 'Unnamed: 0' is no longer among the columns.
data.head()

Unnamed: 0,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour,Weekday
0,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,4,2004.0,0.0,0.0,1,1,0,3
1,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,9,2003.0,41.8172,-87.637328,3,1,0,5
2,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,False,14,2004.0,0.0,0.0,6,20,11,6
3,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,False,25,2004.0,0.0,0.0,12,30,20,3
4,841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,False,False,22,2003.0,41.6918,-87.635116,5,1,1,3


In [7]:
# Row count per Year value. The saved output also exposes one malformed Year
# (41.789832, a single row) and a much smaller count for 2017 (11357 rows).
data["Year"].value_counts()

2008.000000    852053
2006.000000    794684
2009.000000    783900
2010.000000    700691
2007.000000    621848
2001.000000    568517
2002.000000    490879
2003.000000    475913
2005.000000    455811
2004.000000    388205
2011.000000    352066
2012.000000    335670
2013.000000    306703
2014.000000    274527
2016.000000    265462
2015.000000    262995
2017.000000     11357
41.789832           1
Name: Year, dtype: int64

In [13]:
def filterYear(year, df=None):
    """Return the rows of a frame whose ``Year`` column equals ``year``.

    Parameters
    ----------
    year : int or float
        Value to match against the ``Year`` column.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``finalDf`` is used,
        which is the original behaviour; ``finalDf`` must then already exist
        in the kernel or a NameError is raised.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels.
    """
    if df is None:
        # Backward-compatible fallback to the global this function used to require.
        df = finalDf
    return df[df["Year"] == year]

In [114]:
def makeClusters(data, year, numClusters):
    """Cluster one year's records with k-modes.

    Rows of ``data`` whose 'Year' equals ``year`` are selected, the 'IUCR',
    'Primary Type', 'Year', 'Latitude' and 'Longitude' columns are removed,
    and the remaining columns are fitted with a ``KModes`` model
    (``init="Cao"``, ``n_init=1``, ``verbose=1``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Year' column plus the columns listed above.
    year : int or float
        Year to cluster.
    numClusters : int
        Passed to ``KModes`` as ``n_clusters``.

    Returns
    -------
    tuple
        ``(labels, centroids)``: the result of ``fit_predict`` on the selected
        rows, and a DataFrame built from ``cluster_centroids_`` whose columns
        carry the names of the clustered features.
    """
    model = KModes(n_clusters=numClusters, init="Cao", n_init=1, verbose=1)

    excludedColumns = ['IUCR', 'Primary Type', 'Year', 'Latitude', 'Longitude']
    yearRows = data.loc[data['Year'] == year]
    features = yearRows.drop(excludedColumns, axis=1)

    labels = model.fit_predict(features.values)

    centroids = pd.DataFrame(model.cluster_centroids_)
    # Label the centroid columns with the feature names they were fitted on.
    centroids.columns = features.columns
    return labels, centroids

In [153]:
def labelsDf(year, num, originalData, clusterInfo):
    """Attach cluster labels to the rows of one year.

    Parameters
    ----------
    year : int or float
        Value of the 'Year' column selecting the rows to label.
    num : int
        Number of clusters; only used to name the new column
        ``'clusters_<num>'``.
    originalData : pandas.DataFrame
        Frame with a 'Year' column.
    clusterInfo : array-like
        One label per selected row, in the same row order.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index and the label column
        appended as the last column.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of selected rows.
        Without this check the concat would silently pad with NaN.
    """
    # drop=True discards the old index directly, so no helper 'index' /
    # 'level_0' columns are created that would have to be dropped afterwards.
    df = originalData.loc[originalData['Year'] == year].reset_index(drop=True)

    clustersDf = pd.DataFrame(clusterInfo)
    clustersDf.columns = ['clusters_' + str(num)]

    if len(clustersDf) != len(df):
        raise ValueError(
            "got %d cluster labels for %d rows of year %s"
            % (len(clustersDf), len(df), year)
        )

    # Both frames carry a 0..n-1 index, so the column-wise concat lines up by position.
    return pd.concat([df, clustersDf], axis=1)

In [131]:
# Work on an independent copy so the District conversion in the next cell does
# not also rewrite `data` (plain assignment would only create a second name for
# the same frame). Note: this holds a second copy of the frame in memory.
clusterData = data.copy()

In [132]:
# Store District as text, presumably so it is handled as a categorical label
# rather than a number. Series.astype(str) applies str() element-wise, giving the
# same strings as the former DataFrame(...).applymap(str) without the deprecated
# applymap call or the temporary one-column DataFrame.
clusterData['District'] = clusterData['District'].astype(str)

In [133]:
# In the saved output District shows a trailing '.0' (e.g. 4.0): the text form of a float value.
clusterData.head()

Unnamed: 0,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour,Weekday
0,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,4.0,2004.0,0.0,0.0,1,1,0,3
1,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,9.0,2003.0,41.8172,-87.637328,3,1,0,5
2,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,False,14.0,2004.0,0.0,0.0,6,20,11,6
3,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,False,25.0,2004.0,0.0,0.0,12,30,20,3
4,841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,False,False,22.0,2003.0,41.6918,-87.635116,5,1,1,3


In [141]:
# makeClusters returns a (labels, centroids) tuple, so clusterData2001[0] holds
# the per-row cluster labels and clusterData2001[1] the centroid table.
clusterData2001 = makeClusters(clusterData, 2001, 10)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 3072983.0


In [142]:
# Cluster labels for the 2001 rows (first element of the returned tuple).
clusterData2001[0]

array([1, 1, 0, ..., 1, 1, 5], dtype=uint8)

In [143]:
# Centroid table for 2001 (second element of the tuple): one row per cluster,
# one column per feature that was kept for clustering.
clusterData2001[1]

Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5
5,TO PROPERTY,OTHER,False,False,12.0,6,19,22,1
6,SIMPLE,RESIDENCE,False,False,6.0,3,26,22,0
7,OVER $500,STREET,False,False,11.0,2,19,15,0
8,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),False,False,7.0,5,6,14,6
9,TO VEHICLE,STREET,False,False,8.0,8,7,17,1


In [154]:
# Append the 2001 labels to the 2001 rows as the column 'clusters_10'.
combined = labelsDf(2001, 10, clusterData, clusterData2001[0])

In [155]:
# Spot-check the labelled 2001 rows.
combined.head()

Unnamed: 0,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour,Weekday,clusters_10
0,266,CRIM SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,True,5.0,2001.0,41.687,-87.608445,1,1,11,0,1
1,1753,OFFENSE INVOLVING CHILDREN,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,True,False,8.0,2001.0,41.7915,-87.729099,1,1,0,0,1
2,840,THEFT,FINANCIAL ID THEFT: OVER $300,OTHER,False,False,9.0,2001.0,41.8345,-87.682642,1,1,0,0,0
3,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,True,False,7.0,2001.0,41.7785,-87.674036,6,16,8,5,1
4,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,15.0,2001.0,41.8968,-87.762472,6,1,8,4,1


In [156]:
# Sanity check: in the saved outputs this equals the 2001 count reported by
# value_counts (568517), i.e. no rows were added or lost while attaching labels.
print(len(combined))

568517


# Creating Clusters and Exporting Data
The clusters and centroids are exported as CSV files to the 'Clusterings' folder, which is also where the ClusteringOrganized.ipynb notebook is located. 

In [157]:
# Relative path: the file is written to the notebook's working directory.
# to_csv's default index=True also writes the row index as an unnamed first column.
combined.to_csv("clusters2001_10.csv")

In [158]:
# Companion file with the 2001 centroids; the written index column is the
# centroid row number (0-9 in the saved output above).
clusterData2001[1].to_csv("centroids2001_10.csv")

In [159]:
# Repeat the 2001 workflow for 2002 through 2016 (the end of range is exclusive),
# writing one clusters file and one centroids file per year with 10 clusters each.
for year in range(2002, 2017):
    # (labels, centroids) tuple for this year.
    clusterDataYear = makeClusters(clusterData, year, 10)
    combined = labelsDf(year, 10, clusterData, clusterDataYear[0])

    # Both output files share the "<year>_10.csv" ending.
    fileSuffix = str(year) + "_10.csv"
    combined.to_csv("clusters" + fileSuffix)
    clusterDataYear[1].to_csv("centroids" + fileSuffix)

    print("finished year " + str(year))

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 19095, cost: 2697926.0
Run 1, iteration: 2/100, moves: 0, cost: 2697926.0
finished year 2002
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 2611221.0
finished year 2003
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 9609, cost: 2114365.0
Run 1, iteration: 2/100, moves: 0, cost: 2114365.0
finished year 2004
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 16579, cost: 2517475.0
Run 1, iteration: 2/100, moves: 0, cost: 2517475.0
finished year 2005
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 53536, cost: 4359650.0
Run 1, iteration: 2/100, moves: 0, cost: 4359650.0
finished year 2006
Init: initializing centroids
In