In [1]:
import numpy as np
import pandas as pd
import csv
import json
import io
from sklearn.cluster import KMeans
from sklearn.cross_validation import train_test_split
from __future__ import division
import matplotlib.pyplot as plt
import geoplotlib as gpl
from geoplotlib.utils import BoundingBox
%matplotlib inline
# Cap rendered DataFrame output at 10 rows so large frames stay readable.
pd.options.display.max_rows = 10

# Text-type shim: use the `unicode` builtin where it exists (Python 2),
# otherwise fall back to `str` (Python 3, where the lookup raises NameError).
try:
    unicode
except NameError:
    to_unicode = str
else:
    to_unicode = unicode

In [2]:
# Load the SFPD incident export into a DataFrame.
filename = 'SFPD_Incidents_Jan_2003.csv'
data = pd.read_csv(filename)
# Keep only the rows whose Y (latitude) value is not 90.
# NOTE(review): 90 is presumably a placeholder for a missing location -- confirm.
data = data[data.Y != 90]
data

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023860,981021761,WARRANTS,WARRANT ARREST,Wednesday,09/30/2015,23:53,NORTHERN,"ARREST, BOOKED",OLIVE ST / VANNESS AV,-122.421097,37.784294,"(37.7842937802977, -122.421096796936)",98102176163010
2023861,81076498,MISSING PERSON,FOUND PERSON,Monday,03/28/2016,14:00,TARAVAL,NONE,2300 Block of 24TH AV,-122.481183,37.743727,"(37.7437268170337, -122.48118317163)",8107649875000
2023862,961392021,MISSING PERSON,FOUND PERSON,Wednesday,02/24/2016,00:01,SOUTHERN,EXCEPTIONAL CLEARANCE,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",96139202175000
2023863,961392021,NON-CRIMINAL,CASE CLOSURE,Wednesday,02/24/2016,00:01,SOUTHERN,EXCEPTIONAL CLEARANCE,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",96139202175030


In [62]:
# Cluster the locations of prostitution incidents with KMeans for several
# values of k, and collect the centroids plus each incident's cluster label
# into `model_data` (consumed by the JSON export cell).
#
# Reads:  `data` (needs the columns Category, Y and X).
# Writes: `f_crime`   - one row per incident: lat, lon and one 'k<i>' label column per k
#         `centroids` - {'k<i>': [{'lat': ..., 'lon': ...}, ...]}
#         `model_data`- {'centroids': centroids, 'datapoints': list of row dicts}

# Extracting prostitution crimes. `.rename` returns a new frame, so the column
# names are never assigned on a slice of `data`.
focus_crimes = ['PROSTITUTION']
f_crime = (
    data.loc[data.Category.isin(focus_crimes), ['Y', 'X']]
    .rename(columns={'Y': 'lat', 'X': 'lon'})
)

# Fixed (n_samples, 2) feature matrix of [lat, lon].
# The model must always be fitted on exactly these two columns: fitting on
# `f_crime` itself would, from the second iteration onward, feed the label
# columns added for smaller k ('k2', 'k3', ...) back in as features.
coords = f_crime[['lat', 'lon']].values

# Not populated in this cell; retained in case another cell references it.
cluster_data = {}
centroids = {}

cluster_range = range(2, 7)

for i in cluster_range:
    key = 'k' + str(i)

    # Train KMeans on the coordinates only
    kmeans = KMeans(n_clusters=i, random_state=0).fit(coords)

    # Save the centroids; column 0 is lat and column 1 is lon, matching `coords`.
    # float() turns the NumPy scalars into plain Python floats for JSON export.
    centroids[key] = [
        {'lat': float(lat), 'lon': float(lon)}
        for lat, lon in kmeans.cluster_centers_
    ]

    # Predict the cluster of every incident in a single vectorised call.
    # Labels are stored as floats, which is the dtype the previous row-by-row
    # `.loc` assignment produced, so the exported JSON keeps the same number format.
    f_crime[key] = kmeans.predict(coords).astype(float)


model_data = {}
model_data['centroids'] = centroids
model_data['datapoints'] = f_crime.to_dict('records')

In [63]:
# Dump `model_data` (centroids and datapoints) to model_data.json as UTF-8 text.
with io.open('model_data.json', 'w', encoding='utf8') as outfile:
    serialized = json.dumps(
        model_data,
        sort_keys=True,
        indent=4,
        ensure_ascii=False,
        separators=(',', ':'),
    )
    # io.open in text mode only accepts the unicode text type, so coerce the
    # serialized string through the to_unicode shim before writing.
    outfile.write(to_unicode(serialized))

