# K-Modes Clustering
* https://github.com/nicodv/kmodes
* paper
* Used for categorical variables
* Looks at data points and checks for matching categories

## Importing Basic Packages

In [1]:
import pandas as pd 
import numpy as np 
import math

In [2]:
# Enables nicer (HTML) display of dataframes in the notebook
from IPython.display import display, HTML

# Tells matplotlib to display images inline instead of a new window
%matplotlib inline
import matplotlib.pyplot as plt

## Set Paths for Reading in Centroids

In [4]:
# Each centroid CSV path is built as localPath + <year> + fileType,
# e.g. "./centroids2001_10.csv".
# NOTE(review): no path separator is inserted before the year; if the files
# live inside a "centroids/" directory a trailing "/" is needed - confirm.
fileType = "_10.csv"       # suffix appended directly after the year
localPath = "./centroids"  # prefix placed directly before the year

## Read Centroids
Read the centroid dataframe for each year from 2001 to 2016 and combine them into a single dataframe

In [5]:
# Load the centroid CSV of every year (2001-2016) and stack them into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied the accumulated frame on every iteration; collecting the frames
# and concatenating once gives the same result.
centroidFrames = []
for year in range(2001, 2017):
    datasetFilePath = localPath + str(year) + fileType
    centroidDf = pd.read_csv(datasetFilePath)
    centroidFrames.append(centroidDf)

# ignore_index is deliberately not set: each file keeps its own row index,
# exactly as the chained append() calls did.
allCentroids = pd.concat(centroidFrames)

In [6]:
# Renumber the rows 0..n-1 after combining the yearly files. Because drop=True
# is not passed, the previous (per-file) index is kept as a new 'index' column.
allCentroids = allCentroids.reset_index()

In [8]:
allCentroids

Unnamed: 0.1,index,Unnamed: 0,Description,Location Description,Arrest,Domestic,District,Month,Day,Hour,Weekday
0,0,0,SIMPLE,STREET,False,False,2.0,1,12,18,4
1,1,1,$500 AND UNDER,RESIDENCE,False,False,8.0,2,1,20,3
2,2,2,OVER $500,APARTMENT,False,False,9.0,3,7,21,2
3,3,3,TO VEHICLE,SIDEWALK,False,False,25.0,7,2,12,0
4,4,4,$500 AND UNDER,STREET,False,False,25.0,3,3,19,5
5,5,5,TO PROPERTY,OTHER,False,False,12.0,6,19,22,1
6,6,6,SIMPLE,RESIDENCE,False,False,6.0,3,26,22,0
7,7,7,OVER $500,STREET,False,False,11.0,2,19,15,0
8,8,8,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),False,False,7.0,5,6,14,6
9,9,9,TO VEHICLE,STREET,False,False,8.0,8,7,17,1


## Unique Values of Columns from Combined Centroids
* Location description, months, days, districts, description, weekday, and hour
* Use these unique values of each column to create heatmaps

In [None]:
# Distinct location descriptions across the combined centroids
# (unsorted: unique() returns values in order of first appearance).
uniqueLocations = allCentroids['Location Description'].unique()
uniqueLocations

In [None]:
# Distinct Month values across the combined centroids, as an ascending list.
uniqueMonths = sorted(allCentroids['Month'].unique())
uniqueMonths

In [None]:
# Distinct Day values across the combined centroids, as an ascending list.
uniqueDays = sorted(allCentroids['Day'].unique())
uniqueDays

In [None]:
# Distinct District values across the combined centroids, as an ascending list.
# District is stored as a float in the centroid table (e.g. 2.0).
# NOTE(review): sorted() ordering is unreliable if the column contains NaN -
# confirm there are no missing districts.
uniqueDistricts = sorted(allCentroids['District'].unique())
uniqueDistricts

In [None]:
# Distinct crime descriptions across the combined centroids
# (unsorted: unique() returns values in order of first appearance).
uniqueCrimes = allCentroids['Description'].unique()
uniqueCrimes

In [None]:
# Distinct Weekday values across the combined centroids, as an ascending list.
uniqueWeekdays = sorted(allCentroids['Weekday'].unique())
uniqueWeekdays

In [None]:
# Distinct Hour values across the combined centroids, as an ascending list.
uniqueHours = sorted(allCentroids['Hour'].unique())
uniqueHours

## Function for Creating Data to Use for Building Heatmaps
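Given a column name and a row name (plus the list of values to consider for each), it returns, for every year and every row/column value pair, the number of crimes (`Frequency`) and that number divided by the total number of crimes in that year (`Year Frequency`)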

In [None]:
def bokehScaledDataMaker(df, columnList, rowList, columnName, rowName, years=None):
    """Build one frequency record per (year, row value, column value) cell.

    Args:
        df: DataFrame with a 'Year' column plus the ``columnName`` and
            ``rowName`` columns.
        columnList: Values of ``columnName`` to count.
        rowList: Values of ``rowName`` to count.
        columnName: Name of the DataFrame column used for the heatmap columns.
        rowName: Name of the DataFrame column used for the heatmap rows.
        years: Iterable of years to process. Defaults to 2001-2016 inclusive.

    Returns:
        A list of dicts, ordered by year, then row value, then column value.
        Each dict has the keys 'Year', ``columnName``, ``rowName``,
        'Frequency' (number of rows of that year matching both values) and
        'Year Frequency' ('Frequency' divided by the number of rows of that
        year; 0.0 when the year has no rows).
    """
    if years is None:
        years = range(2001, 2017)

    bokehData = []
    for year in years:
        yearDf = df.loc[df['Year'] == year]
        totalNumCrimesYear = len(yearDf)
        for row in rowList:
            # Narrow to this row value once; the inner loop then only scans
            # the matching rows instead of re-filtering the whole year.
            matchingColumnValues = yearDf.loc[yearDf[rowName] == row, columnName]
            for col in columnList:
                freq = int((matchingColumnValues == col).sum())
                bokehData.append({
                    'Year': year,
                    columnName: col,
                    rowName: row,
                    'Frequency': freq,
                    # A year without any rows would otherwise divide by zero.
                    'Year Frequency': (
                        freq / totalNumCrimesYear if totalNumCrimesYear else 0.0
                    ),
                })
        print("Done with year..." + str(year))

    return bokehData

## Import Package for Creating Heatmaps

In [None]:
import holoviews as hv

In [None]:
hv.extension('bokeh')

## Set Paths for Reading in Clusters

In [None]:
# Each cluster CSV path is built as localClustersPath + <year> + fileTypeClusters,
# e.g. "./clusters2001_10.csv".
# NOTE(review): no path separator is inserted before the year; if the files
# live inside a "clusters/" directory a trailing "/" is needed - confirm.
localClustersPath = "./clusters"  # prefix placed directly before the year
fileTypeClusters = "_10.csv"      # suffix appended directly after the year

## Read clusters
Read in the cluster datasets for the years 2001 - 2016 and combine them into one dataframe

In [None]:
# Load the cluster CSV of every year (2001-2016), printing each path as it is
# processed, and stack the frames into one.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied the accumulated frame on every iteration; collecting the frames
# and concatenating once gives the same result.
clusterFrames = []
for year in range(2001, 2017):
    datasetFilePath = localClustersPath + str(year) + fileTypeClusters
    print(datasetFilePath)
    df = pd.read_csv(datasetFilePath)
    clusterFrames.append(df)

# ignore_index is deliberately not set: each file keeps its own row index,
# exactly as the chained append() calls did.
dfTotal = pd.concat(clusterFrames)

## Create Function for Building Heatmaps

In [None]:
def autoMateHeatMap(df, xList, yList, xName, yName):
    """Compute the frequency records and wrap them in two HoloViews tables.

    Args:
        df: DataFrame passed through to ``bokehScaledDataMaker``.
        xList: Values of the ``xName`` column to count.
        yList: Values of the ``yName`` column to count.
        xName: Column name used for the heatmap x axis.
        yName: Column name used for the heatmap y axis.

    Returns:
        A 3-tuple ``(frame, tableScaled, table)``: the frequency records as a
        DataFrame, and two ``hv.Table`` objects built from all of its columns.
        The two tables hold the same data and differ only in their label.
    """
    frequencyDf = pd.DataFrame(bokehScaledDataMaker(df, xList, yList, xName, yName))
    allColumns = list(frequencyDf.columns)

    def _asTable(label):
        # Every table is built from a full-column selection of the frame.
        return hv.Table(frequencyDf[allColumns], label=label)

    return (
        frequencyDf,
        _asTable('Crime Frequency For Year'),
        _asTable('Crime Frequency Over All Years'),
    )

## Build Heatmaps by Crime Description
* Which districts have the highest number of specific types of crimes by year
* Identify any trends by year

In [None]:
# Frequencies for every (District, Description) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtDescriptionData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueCrimes,
    xName='District',
    yName='Description',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtDescription' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtDescriptionData[1].to.heatmap(
        kdims=['District', 'Description'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtDescriptionData[2].to.heatmap(
        kdims=['District', 'Description'],
        vdims=['Frequency', 'Year Frequency'],
    )
)

## Build Heatmaps by Location Description
* Which districts have the most crimes for each location description
* Identify any trends by year

In [None]:
# Frequencies for every (District, Location Description) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtLocationData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueLocations,
    xName='District',
    yName='Location Description',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtLocation' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtLocationData[1].to.heatmap(
        kdims=['District', 'Location Description'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtLocationData[2].to.heatmap(
        kdims=['District', 'Location Description'],
        vdims=['Frequency', 'Year Frequency'],
    )
)

## Build Heatmaps by Month
* Which month has the highest number of crimes by district 
* Identify any trends by year

In [None]:
# Frequencies for every (District, Month) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtMonthData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueMonths,
    xName='District',
    yName='Month',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtMonth' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtMonthData[1].to.heatmap(
        kdims=['District', 'Month'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtMonthData[2].to.heatmap(
        kdims=['District', 'Month'],
        vdims=['Frequency', 'Year Frequency'],
    )
)

## Build Heatmaps by Day
* Which days of the month have the highest number of crimes by district
* Identify any trends by year

In [None]:
# Frequencies for every (District, Day) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtDayData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueDays,
    xName='District',
    yName='Day',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtDay' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtDayData[1].to.heatmap(
        kdims=['District', 'Day'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtDayData[2].to.heatmap(
        kdims=['District', 'Day'],
        vdims=['Frequency', 'Year Frequency'],
    )
)

## Build Heatmaps by Day of the Week
* Which day of the week has the highest number of crimes by district
* Identify any trends by year

In [None]:
# Frequencies for every (District, Weekday) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtWeekdayData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueWeekdays,
    xName='District',
    yName='Weekday',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtWeekday' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtWeekdayData[1].to.heatmap(
        kdims=['District', 'Weekday'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtWeekdayData[2].to.heatmap(
        kdims=['District', 'Weekday'],
        vdims=['Frequency', 'Year Frequency'],
    )
)

## Build Heatmaps by Hour 
* Which times of the day have the highest number of crimes by district
* Identify any trends by year

In [None]:
# Frequencies for every (District, Hour) pair, per year.
# Result is the tuple (DataFrame, table, table) returned by autoMateHeatMap.
districtHourData = autoMateHeatMap(
    df=dfTotal,
    xList=uniqueDistricts,
    yList=uniqueHours,
    xName='District',
    yName='Hour',
)

In [None]:
%%opts HeatMap (cmap='YlOrRd')  [width=550 height=300 tools=['hover'] colorbar=True toolbar='below' logx=True show_title=True]
%%output size= 98
%%output filename='districtHour' fig="html"
# Two heatmaps combined with `+`: both use the same kdims and differ only in
# the order of vdims ('Year Frequency' first, then 'Frequency' first).
(
    districtHourData[1].to.heatmap(
        kdims=['District', 'Hour'],
        vdims=['Year Frequency', 'Frequency'],
    )
    + districtHourData[2].to.heatmap(
        kdims=['District', 'Hour'],
        vdims=['Frequency', 'Year Frequency'],
    )
)