## imports

In [1]:
import csv
from collections import Counter
from csv import reader
from itertools import izip

import numpy as np
from sklearn.cluster import KMeans

## Read from CSV

In [2]:
# Module-level accumulator: readInValuesFromCSV() appends one dict per CSV data row.
# Re-run this cell before re-running the loader, otherwise rows are appended twice.
fileValues = list()

In [3]:
def readInValuesFromCSV(path="NYPD_7_Major_Felony_Incidents.csv", target=None):
    """Load the felony CSV into a list of per-row dictionaries.

    The first line of the file is treated as a header and skipped. Every
    following row must contain exactly 20 columns; each row is stored as a
    dict that maps the labels in ``columnLabels`` to the raw string values.

    Args:
        path: CSV file to read. Defaults to the file this notebook analyses.
        target: list that receives the row dicts. When omitted, rows are
            appended to the module-level ``fileValues`` list.

    Returns:
        None. Rows are appended to ``target`` in file order; nothing is
        returned so that calling this as the last line of a cell does not
        echo the whole data set.

    Raises:
        ValueError: if a data row does not have exactly 20 columns.
    """
    columnLabels = (
        'OBJECTID', 'Identifier', 'Occurrence Date', 'Day of Week',
        'Occurrence Month', 'Occurrence Day', 'Occurrence Year',
        'Occurrence Hour', 'CompStat Month', 'CompStat Day', 'CompStat Year',
        'Offense', 'Offense Classification', 'Sector', 'Precinct', 'Borough',
        'Jurisdiction', 'XCoordinate', 'YCoordinate', 'Location 1',
    )

    if target is None:
        target = fileValues

    # The context manager closes the file even if a malformed row raises.
    with open(path, 'r') as infile:
        rows = reader(infile)
        next(rows, None)  # skip the header line (no-op on an empty file)
        for line in rows:
            # Keep the strictness of the original tuple-unpacking: a short or
            # long row is an error, not something to pad or truncate silently.
            if len(line) != len(columnLabels):
                raise ValueError(
                    "line %d of %s has %d columns, expected %d"
                    % (rows.line_num, path, len(line), len(columnLabels))
                )
            target.append(dict(zip(columnLabels, line)))

In [4]:
# Populate the module-level fileValues list from the felony CSV.
readInValuesFromCSV()

## Clean the data, cast string coord values to floats

In [6]:
def coordToFloats(coord):
    """Parse a coordinate string such as "(40.7, -73.9)" into [40.7, -73.9].

    Parentheses are stripped and the comma is treated as a separator, so both
    "(a, b)" and "(a,b)" are accepted.

    Args:
        coord: string holding two numbers, optionally wrapped in parentheses
            and separated by a comma and/or whitespace.

    Returns:
        A list whose first two entries are floats. Any further
        whitespace-separated tokens are left in the list as strings.

    Raises:
        IndexError: if fewer than two tokens are present (e.g. empty string).
        ValueError: if either of the first two tokens is not a valid float.
    """
    # Replace the comma with a space instead of deleting it: deleting fused
    # "(40.7,-73.9)" into the single unparsable token "40.7-73.9".
    tokens = coord.replace("(", "").replace(")", "").replace(",", " ").split()
    tokens[0] = float(tokens[0])
    tokens[1] = float(tokens[1])
    return tokens
    

## Create CoordinateLists for every felony

In [7]:
# Every incident whose 'Occurrence Year' is '2015', kept in file order.
dataFrom2015 = []

# One list of parsed 'Location 1' coordinates per felony type.
burglaryCoord = []
felonyAssaultCoord = []
grandLarcenyCoord = []
grandLarcenyVehicleCoord = []
murderCoord = []
rapeCoord = []
robberyCoord = []

# Offense label exactly as it is compared against the CSV value -> the list
# that collects that offense's coordinates.
coordListsByOffense = {
    "BURGLARY": burglaryCoord,
    "FELONY ASSAULT": felonyAssaultCoord,
    "GRAND LARCENY": grandLarcenyCoord,
    "GRAND LARCENY OF MOTOR VEHICLE": grandLarcenyVehicleCoord,
    "MURDER & NON-NEGL. MANSLAUGHTE": murderCoord,
    "RAPE": rapeCoord,
    "ROBBERY": robberyCoord,
}

for crime in fileValues:
    if crime['Occurrence Year'] != '2015':
        continue
    dataFrom2015.append(crime)
    # Offenses outside the seven labels above are kept in dataFrom2015 but
    # their location is not parsed.
    bucket = coordListsByOffense.get(crime['Offense'])
    if bucket is not None:
        bucket.append(coordToFloats(crime['Location 1']))

## Create a list of the offenses and count the occurrences

In [82]:
# Offense label of every 2015 incident, in the order the incidents were read.
offenseTypes = [crime['Offense'] for crime in dataFrom2015]

# Number of 2015 incidents per offense label (Counter is imported in the
# imports cell at the top of the notebook).
counterList = Counter(offenseTypes)

In [84]:
# Display the number of 2015 incidents per offense label.
counterList

Counter({'BURGLARY': 14967,
         'FELONY ASSAULT': 20189,
         'GRAND LARCENY': 41873,
         'GRAND LARCENY OF MOTOR VEHICLE': 7250,
         'MURDER & NON-NEGL. MANSLAUGHTE': 336,
         'RAPE': 1156,
         'ROBBERY': 16886})

## Function to export the cluster centroids to a CSV file for d3.js

In [79]:
def centroid(filename, coord):
    """Write the cluster centres of a fitted model to a CSV file.

    Args:
        filename: path of the CSV file to create (overwritten if it exists).
        coord: object exposing ``cluster_centers_``; in this notebook it is a
            fitted KMeans estimator. Each centre must be indexable with at
            least two entries.

    The file gets the header ``Cluster,lat,lon`` followed by one row per
    centre: its index, then the centre's first and second component.
    NOTE(review): the 'lat'/'lon' labels assume the input coordinates were
    ordered (latitude, longitude) -- confirm against the source data.
    """
    header = ['Cluster', 'lat', 'lon']
    with open(filename, 'w') as out:
        table = csv.DictWriter(out, fieldnames=header)
        table.writeheader()
        for index, centre in enumerate(coord.cluster_centers_):
            table.writerow(dict(zip(header, (index, centre[0], centre[1]))))

## Function to calculate K-means for various cluster sizes

In [85]:
# Fit K-means once per requested cluster count and export the results.
def kMeansCalculation(offenseCoord, offenseName, clusterSizes=(2, 3, 4, 5, 6)):
    """Cluster one offense's coordinates with K-means for several values of k.

    For every ``k`` in ``clusterSizes`` (in order) this function

    * fits ``KMeans(n_clusters=k, init='random', n_init=10, max_iter=10,
      tol=1e-04, random_state=0)`` on the coordinates,
    * prints ``<offenseName> km<k>_Score : <score>``, where the score is
      ``KMeans.score`` evaluated on the same points used for fitting,
    * writes ``<offenseName>_Kmeans_<k>.csv`` with the header
      ``Cluster,lat,lon`` and one row per input point (assigned cluster
      label, then the point's first and second component),
    * writes ``<offenseName>_CentroidKMeans<k>.csv`` through ``centroid``.

    Args:
        offenseCoord: sequence of two-element coordinates, e.g. the lists
            produced by ``coordToFloats``.
        offenseName: prefix used in the printed lines and output file names.
        clusterSizes: cluster counts to evaluate. The default reproduces the
            five fixed runs (k = 2..6) this notebook was written with.

    Returns:
        None. Results are reported via stdout and the CSV files.
    """
    X = np.asarray(offenseCoord)

    for k in clusterSizes:
        # random_state is fixed so repeated runs give the same clustering.
        km = KMeans(n_clusters=k, init='random', n_init=10, max_iter=10,
                    tol=1e-04, random_state=0)
        labels = np.array(km.fit_predict(X)).tolist()
        score = km.score(X)
        # Single-argument call form prints identically under the Python 2
        # print statement and the Python 3 print function.
        print(offenseName + " km" + str(k) + "_Score : " + str(score))

        # 'wb' is the csv-module convention for Python 2, which this notebook
        # targets. Rows use the original Python floats from offenseCoord, not
        # the numpy array, so the number formatting in the file is unchanged.
        with open(offenseName + '_' + 'Kmeans_' + str(k) + '.csv', 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['Cluster', 'lat', 'lon'])
            writer.writerows(
                (label, point[0], point[1])
                for label, point in zip(labels, offenseCoord)
            )

        centroid(offenseName + "_" + "CentroidKMeans" + str(k) + ".csv", km)

## Run the K-means function above on each felony's coordinate list and print the K-means scores

In [86]:
# Cluster the 2015 murder coordinates; output files get the "murder_" prefix.
kMeansCalculation(murderCoord,"murder")

murder km2_Score : -2.1381153651
murder km3_Score : -1.170375265
murder km4_Score : -0.817616076436
murder km5_Score : -0.61639381889
murder km6_Score : -0.509958447918


In [63]:
# Cluster the 2015 burglary coordinates; output files get the "burglary_" prefix.
kMeansCalculation(burglaryCoord,"burglary")

burglary km2_Score : -110.245350472
burglary km3_Score : -69.3748448174
burglary km4_Score : -49.3965977285
burglary km5_Score : -37.0342490919
burglary km6_Score : -29.5177071585


In [64]:
# Cluster the 2015 rape coordinates; output files get the "rape_" prefix.
kMeansCalculation(rapeCoord,"rape")

rape km2_Score : -7.31571827893
rape km3_Score : -4.43090079423
rape km4_Score : -3.43850211165
rape km5_Score : -2.78440528421
rape km6_Score : -2.23403313459


In [65]:
# Cluster the 2015 robbery coordinates; output files get the "robbery_" prefix.
kMeansCalculation(robberyCoord,"robbery")

robbery km2_Score : -106.047074855
robbery km3_Score : -61.9933198455
robbery km4_Score : -47.2522935207
robbery km5_Score : -36.6795792843
robbery km6_Score : -30.1715074095


In [66]:
# Cluster the 2015 grand larceny coordinates; output files get the "grandLarceny_" prefix.
kMeansCalculation(grandLarcenyCoord,"grandLarceny")

grandLarceny km2_Score : -279.367623533
grandLarceny km3_Score : -188.312974717
grandLarceny km4_Score : -126.827273745
grandLarceny km5_Score : -90.7827417046
grandLarceny km6_Score : -75.9956931312


In [67]:
# Cluster the 2015 grand larceny of motor vehicle coordinates; output files get the "grandLarcenyVehicle_" prefix.
kMeansCalculation(grandLarcenyVehicleCoord,"grandLarcenyVehicle")

grandLarcenyVehicle km2_Score : -52.6655508686
grandLarcenyVehicle km3_Score : -30.1437282039
grandLarcenyVehicle km4_Score : -22.2210826223
grandLarcenyVehicle km5_Score : -17.4556373196
grandLarcenyVehicle km6_Score : -14.2989409671


In [68]:
# Cluster the 2015 felony assault coordinates; output files get the "felonyAssault_" prefix.
kMeansCalculation(felonyAssaultCoord,"felonyAssault")

felonyAssault km2_Score : -136.909306375
felonyAssault km3_Score : -80.1702665396
felonyAssault km4_Score : -58.9250215792
felonyAssault km5_Score : -47.1841105485
felonyAssault km6_Score : -39.6861137547
