In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import itertools

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.19.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  23.58 MB/s
geopy-1.19.0-p 100% |################################| Time: 0:00:00  35.68 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  13.00 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  32.61 MB/s
vincent-0.4.4- 100% |###################

In [2]:
#Download and Explore Dataset

#the link to the dataset: https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-2018-to-Present/wg3w-h783
!wget -q -O 'sanfranpd_data.json' https://data.sfgov.org/resource/nwbb-fxkq.json

print('Data downloaded!')

Data downloaded!


In [3]:
with open('sanfranpd_data.json') as json_data:

    SF_data = json.load(json_data)

In [4]:
SF_data

[{'filed_online': True,
  'incident_category': 'Malicious Mischief',
  'incident_code': '28150',
  'incident_date': '2019-03-15T00:00:00.000',
  'incident_datetime': '2019-03-15T00:00:00.000',
  'incident_day_of_week': 'Friday',
  'incident_description': 'Malicious Mischief, Vandalism to Property',
  'incident_id': '781497',
  'incident_number': '196055090',
  'incident_subcategory': 'Vandalism',
  'incident_time': '00:00',
  'incident_year': '2019',
  'police_district': 'Central',
  'report_datetime': '2019-03-15T16:41:00.000',
  'report_type_code': 'II',
  'report_type_description': 'Coplogic Initial',
  'resolution': 'Open or Active',
  'row_id': '78149728150'},
 {'filed_online': True,
  'incident_category': 'Larceny Theft',
  'incident_code': '06372',
  'incident_date': '2019-03-12T00:00:00.000',
  'incident_datetime': '2019-03-12T13:30:00.000',
  'incident_day_of_week': 'Tuesday',
  'incident_description': 'Theft, Other Property, $50-$200',
  'incident_id': '781547',
  'incident_n

In [5]:
pd.read_json("sanfranpd_data.json").to_excel("output.xlsx")

sanfran_data = pd.read_excel("output.xlsx")
sanfran_data.shape

(1000, 33)

In [6]:
#Remove records for the year 2019
sanfran_data = sanfran_data[sanfran_data.incident_year != 2019]
sanfran_data.shape

(23, 33)

In [7]:
sanfran_data.columns

Index([':@computed_region_26cr_cadq', ':@computed_region_2dwj_jsy4',
       ':@computed_region_6qbp_sg9q', ':@computed_region_ajp5_b2md',
       ':@computed_region_h4ep_8xdi', ':@computed_region_nqbw_i6c3',
       ':@computed_region_qgnn_b9vv', 'analysis_neighborhood', 'cad_number',
       'cnn', 'filed_online', 'incident_category', 'incident_code',
       'incident_date', 'incident_datetime', 'incident_day_of_week',
       'incident_description', 'incident_id', 'incident_number',
       'incident_subcategory', 'incident_time', 'incident_year',
       'intersection', 'latitude', 'longitude', 'point', 'police_district',
       'report_datetime', 'report_type_code', 'report_type_description',
       'resolution', 'row_id', 'supervisor_district'],
      dtype='object')

In [8]:
#Only select useful columns into a new dataframe
newsfdata = sanfran_data[['analysis_neighborhood','incident_category', 'incident_code',
       'incident_date', 'incident_datetime', 'incident_day_of_week',
       'incident_description', 'incident_id', 'incident_number',
       'incident_subcategory', 'incident_time', 'incident_year',
       'intersection', 'latitude', 'longitude', 'point', 'police_district',
       'report_datetime', 'report_type_code', 'report_type_description',
       'resolution', 'row_id', 'supervisor_district']].copy()

In [9]:
#remove columns with NaN values in analysis_neighborhood, latitude and longitude columns
newsfdata=newsfdata.dropna(subset=['analysis_neighborhood', 'latitude', 'longitude'])
newsfdata.head()

Unnamed: 0,analysis_neighborhood,incident_category,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district
161,North Beach,Larceny Theft,6374,2018-10-08T00:00:00.000,2018-10-08T18:50:00.000,Monday,"Theft, Other Property, >$950",781592,180763574,Larceny Theft - Other,2019-04-05 18:50:00,2018,STOCKTON ST \ NORTH POINT ST,37.806963,-122.410498,"{'coordinates': [-122.410497554147, 37.8069629...",Central,2019-03-16T11:57:00.000,IS,Coplogic Supplement,Open or Active,78159206374,3.0
167,Mission,Assault,4134,2018-12-01T00:00:00.000,2018-12-01T00:00:00.000,Saturday,Battery,781664,190199298,Simple Assault,2019-04-05 00:00:00,2018,22ND ST \ POTRERO AVE,37.756834,-122.406699,"{'coordinates': [-122.406699002688, 37.7568337...",Mission,2019-03-20T12:10:00.000,II,Initial,Open or Active,78166404134,9.0
169,Mission,Motor Vehicle Theft,7026,2018-12-13T00:00:00.000,2018-12-13T13:45:00.000,Thursday,"Vehicle, Stolen, Other Vehicle",781824,186287043,Motor Vehicle Theft,2019-04-05 13:45:00,2018,16TH ST \ CAPP ST,37.76511,-122.418698,"{'coordinates': [-122.418698426997, 37.7651099...",Mission,2019-03-20T22:15:00.000,VI,Vehicle Initial,Open or Active,78182407026,9.0
202,Noe Valley,Lost Property,71000,2018-12-29T00:00:00.000,2018-12-29T20:00:00.000,Saturday,Lost Property,781513,196055426,Lost Property,2019-04-05 20:00:00,2018,HILL ST \ CHURCH ST,37.75569,-122.427813,"{'coordinates': [-122.427813059383, 37.7556902...",Mission,2019-03-19T13:41:00.000,II,Coplogic Initial,Open or Active,78151371000,8.0
227,Mission,Burglary,5013,2018-05-24T00:00:00.000,2018-05-24T15:00:00.000,Thursday,"Burglary, Apartment House, Unlawful Entry",781330,180399339,Burglary - Residential,2019-04-05 15:00:00,2018,23RD ST \ HARRISON ST,37.75423,-122.412047,"{'coordinates': [-122.412047267025, 37.7542303...",Mission,2019-03-19T15:55:00.000,IS,Initial Supplement,Open or Active,78133005013,9.0


In [17]:
#Lets check which police district has the most incidents.

newsfdata.police_district.value_counts()




Richmond      5
Mission       4
Central       2
Park          1
Southern      1
Tenderloin    1
Bayview       1
Northern      1
Name: police_district, dtype: int64

In [18]:
#Lets check which incident_category has the most occurences.

newsfdata.incident_category.value_counts()

Larceny Theft                               5
Assault                                     3
Fraud                                       2
Offences Against The Family And Children    1
Non-Criminal                                1
Lost Property                               1
Motor Vehicle Theft                         1
Sex Offense                                 1
Burglary                                    1
Name: incident_category, dtype: int64

In [25]:
#Create boolean to filter for the police district Richmond which has the most number of crime incidents
is_Richmond =  newsfdata['police_district']=='Richmond'
print(is_Richmond.head())

161    False
167    False
169    False
202    False
227    False
Name: police_district, dtype: bool


In [28]:
#get data for all neighborhoods in the police district of Richmond by creating a new dataframe for Richmond applying the boolean variable
Richmonddata = newsfdata[is_Richmond]
Richmonddata.head()

Unnamed: 0,analysis_neighborhood,incident_category,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district
373,Outer Richmond,Larceny Theft,6302,2018-11-23T00:00:00.000,2018-11-23T00:00:00.000,Friday,"Theft, From Building, $50-$200",781692,190199470,Larceny Theft - From Building,2019-04-05 00:00:00,2018,21ST AVE \ BALBOA ST,37.776498,-122.480175,"{'coordinates': [-122.480175348582, 37.7764976...",Richmond,2019-03-20T13:20:00.000,II,Initial,Open or Active,78169206302,1.0
530,Outer Richmond,Offences Against The Family And Children,15500,2018-10-01T00:00:00.000,2018-10-01T00:00:00.000,Monday,Juvenile Involved (secondary code),781258,190195779,Other,2019-04-05 00:00:00,2018,FULTON ST \ LA PLAYA,37.771396,-122.509895,"{'coordinates': [-122.509894751097, 37.7713960...",Richmond,2019-03-19T06:41:00.000,II,Initial,Unfounded,78125815500,1.0
650,Outer Richmond,Fraud,9320,2018-10-26T00:00:00.000,2018-10-26T00:00:00.000,Friday,"Access Card, incl. Credit, Phone, ATM, Fraudul...",782102,190202396,Fraud,2019-04-05 00:00:00,2018,ANZA ST \ 36TH AVE,37.777628,-122.496379,"{'coordinates': [-122.496379420441, 37.7776282...",Richmond,2019-03-21T15:45:00.000,II,Initial,Open or Active,78210209320,1.0
824,Presidio Heights,Larceny Theft,6304,2018-12-27T00:00:00.000,2018-12-27T21:30:00.000,Thursday,"Theft, From Building, >$950",782160,190203190,Larceny Theft - From Building,2019-04-05 21:30:00,2018,PACIFIC AVE \ PRESIDIO AVE \ PRESIDIO BLVD,37.791703,-122.447679,"{'coordinates': [-122.447679209625, 37.7917028...",Richmond,2019-03-21T19:39:00.000,II,Initial,Open or Active,78216006304,2.0
890,Outer Richmond,Assault,4144,2018-10-01T00:00:00.000,2018-10-01T00:00:00.000,Monday,"Battery, Sexual",781258,190195779,Simple Assault,2019-04-05 00:00:00,2018,FULTON ST \ LA PLAYA,37.771396,-122.509895,"{'coordinates': [-122.509894751097, 37.7713960...",Richmond,2019-03-19T06:41:00.000,II,Initial,Unfounded,78125804144,1.0


In [29]:
#Lets check which analysis_neighborhood in the Richmond police district has the most incidents.

Richmonddata.analysis_neighborhood.value_counts()

Outer Richmond      4
Presidio Heights    1
Name: analysis_neighborhood, dtype: int64

In [10]:
#Find the geographical co-ordinates for San Francisco
address = 'San Francisco, California'

geolocator = Nominatim(user_agent="sf_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of San Francisco are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of San Francisco are 37.7792808, -122.4192363.


In [34]:
#let's visualize San Francisco with all the analysis_neighborhoods in it.

map_SF = folium.Map(location=[latitude, longitude], zoom_start=13)

In [36]:
# add markers to map

for lat, lng, label in zip(newsfdata['latitude'], newsfdata['longitude'], newsfdata['analysis_neighborhood']):

    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(

        [lat, lng],

        radius=5,

        popup=label,

        color='blue',

        fill=True,

        fill_color='#3186cc',

        fill_opacity=0.7,

        parse_html=False).add_to(map_SF)  

    

map_SF

In [31]:
#Let's get the geographical coordinates of Outer Richmond in the police district of Richmond since it shows the most incidents.
address = 'Outer Richmond, SF'

geolocator = Nominatim(user_agent="sf_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Outer Richmond, SF are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Outer Richmond, SF are 37.7806432, -122.4725964.


In [33]:
#Create boolean to filter for the analysis_neighborhood Outer Richmond which has the most number of crime incidents
is_ORichmond =  Richmonddata['analysis_neighborhood']=="Outer Richmond"
#get data for all Outer Richmond by creating a new dataframe for Outer Richmond applying the boolean variable
ORichmonddata = Richmonddata[is_ORichmond]
ORichmonddata.head()

Unnamed: 0,analysis_neighborhood,incident_category,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district
373,Outer Richmond,Larceny Theft,6302,2018-11-23T00:00:00.000,2018-11-23T00:00:00.000,Friday,"Theft, From Building, $50-$200",781692,190199470,Larceny Theft - From Building,2019-04-05,2018,21ST AVE \ BALBOA ST,37.776498,-122.480175,"{'coordinates': [-122.480175348582, 37.7764976...",Richmond,2019-03-20T13:20:00.000,II,Initial,Open or Active,78169206302,1.0
530,Outer Richmond,Offences Against The Family And Children,15500,2018-10-01T00:00:00.000,2018-10-01T00:00:00.000,Monday,Juvenile Involved (secondary code),781258,190195779,Other,2019-04-05,2018,FULTON ST \ LA PLAYA,37.771396,-122.509895,"{'coordinates': [-122.509894751097, 37.7713960...",Richmond,2019-03-19T06:41:00.000,II,Initial,Unfounded,78125815500,1.0
650,Outer Richmond,Fraud,9320,2018-10-26T00:00:00.000,2018-10-26T00:00:00.000,Friday,"Access Card, incl. Credit, Phone, ATM, Fraudul...",782102,190202396,Fraud,2019-04-05,2018,ANZA ST \ 36TH AVE,37.777628,-122.496379,"{'coordinates': [-122.496379420441, 37.7776282...",Richmond,2019-03-21T15:45:00.000,II,Initial,Open or Active,78210209320,1.0
890,Outer Richmond,Assault,4144,2018-10-01T00:00:00.000,2018-10-01T00:00:00.000,Monday,"Battery, Sexual",781258,190195779,Simple Assault,2019-04-05,2018,FULTON ST \ LA PLAYA,37.771396,-122.509895,"{'coordinates': [-122.509894751097, 37.7713960...",Richmond,2019-03-19T06:41:00.000,II,Initial,Unfounded,78125804144,1.0


In [38]:
#let's visualize Outer Richmond with the incidents in it.

# create map of Outer Richmond using latitude and longitude values

map_OR = folium.Map(location=[latitude, longitude], zoom_start=11)



# add markers to map

for lat, lng, label in zip(ORichmonddata['latitude'], ORichmonddata['longitude'], ORichmonddata['incident_category']):

    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(

        [lat, lng],

        radius=5,

        popup=label,

        color='blue',

        fill=True,

        fill_color='#3186cc',

        fill_opacity=0.7,

        parse_html=False).add_to(map_OR)  

    

map_OR

In [41]:
#reindexed dataframe
R_newsfdata=newsfdata.reset_index(drop=True)

In [42]:
R_newsfdata

Unnamed: 0,analysis_neighborhood,incident_category,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district
0,North Beach,Larceny Theft,6374,2018-10-08T00:00:00.000,2018-10-08T18:50:00.000,Monday,"Theft, Other Property, >$950",781592,180763574,Larceny Theft - Other,2019-04-05 18:50:00,2018,STOCKTON ST \ NORTH POINT ST,37.806963,-122.410498,"{'coordinates': [-122.410497554147, 37.8069629...",Central,2019-03-16T11:57:00.000,IS,Coplogic Supplement,Open or Active,78159206374,3.0
1,Mission,Assault,4134,2018-12-01T00:00:00.000,2018-12-01T00:00:00.000,Saturday,Battery,781664,190199298,Simple Assault,2019-04-05 00:00:00,2018,22ND ST \ POTRERO AVE,37.756834,-122.406699,"{'coordinates': [-122.406699002688, 37.7568337...",Mission,2019-03-20T12:10:00.000,II,Initial,Open or Active,78166404134,9.0
2,Mission,Motor Vehicle Theft,7026,2018-12-13T00:00:00.000,2018-12-13T13:45:00.000,Thursday,"Vehicle, Stolen, Other Vehicle",781824,186287043,Motor Vehicle Theft,2019-04-05 13:45:00,2018,16TH ST \ CAPP ST,37.76511,-122.418698,"{'coordinates': [-122.418698426997, 37.7651099...",Mission,2019-03-20T22:15:00.000,VI,Vehicle Initial,Open or Active,78182407026,9.0
3,Noe Valley,Lost Property,71000,2018-12-29T00:00:00.000,2018-12-29T20:00:00.000,Saturday,Lost Property,781513,196055426,Lost Property,2019-04-05 20:00:00,2018,HILL ST \ CHURCH ST,37.75569,-122.427813,"{'coordinates': [-122.427813059383, 37.7556902...",Mission,2019-03-19T13:41:00.000,II,Coplogic Initial,Open or Active,78151371000,8.0
4,Mission,Burglary,5013,2018-05-24T00:00:00.000,2018-05-24T15:00:00.000,Thursday,"Burglary, Apartment House, Unlawful Entry",781330,180399339,Burglary - Residential,2019-04-05 15:00:00,2018,23RD ST \ HARRISON ST,37.75423,-122.412047,"{'coordinates': [-122.412047267025, 37.7542303...",Mission,2019-03-19T15:55:00.000,IS,Initial Supplement,Open or Active,78133005013,9.0
5,Potrero Hill,Fraud,9024,2018-06-19T00:00:00.000,2018-06-19T18:00:00.000,Tuesday,"Fraudulent Game or Trick, Obtaining Money or P...",781388,190197775,Fraud,2019-04-05 18:00:00,2018,ARKANSAS ST \ 22ND ST,37.757359,-122.39797,"{'coordinates': [-122.397970345008, 37.7573592...",Bayview,2019-03-19T19:49:00.000,II,Initial,Open or Active,78138809024,10.0
6,Castro/Upper Market,Larceny Theft,6301,2018-07-01T00:00:00.000,2018-07-01T12:00:00.000,Sunday,"Theft, From Building, <$50",781766,190197452,Larceny Theft - From Building,2019-04-05 12:00:00,2018,URANUS TER \ 17TH ST \ ROOSEVELT WAY,37.76188,-122.445309,"{'coordinates': [-122.445309436312, 37.7618804...",Park,2019-03-19T15:53:00.000,II,Initial,Open or Active,78176606301,8.0
7,Outer Richmond,Larceny Theft,6302,2018-11-23T00:00:00.000,2018-11-23T00:00:00.000,Friday,"Theft, From Building, $50-$200",781692,190199470,Larceny Theft - From Building,2019-04-05 00:00:00,2018,21ST AVE \ BALBOA ST,37.776498,-122.480175,"{'coordinates': [-122.480175348582, 37.7764976...",Richmond,2019-03-20T13:20:00.000,II,Initial,Open or Active,78169206302,1.0
8,Outer Richmond,Offences Against The Family And Children,15500,2018-10-01T00:00:00.000,2018-10-01T00:00:00.000,Monday,Juvenile Involved (secondary code),781258,190195779,Other,2019-04-05 00:00:00,2018,FULTON ST \ LA PLAYA,37.771396,-122.509895,"{'coordinates': [-122.509894751097, 37.7713960...",Richmond,2019-03-19T06:41:00.000,II,Initial,Unfounded,78125815500,1.0
9,Outer Richmond,Fraud,9320,2018-10-26T00:00:00.000,2018-10-26T00:00:00.000,Friday,"Access Card, incl. Credit, Phone, ATM, Fraudul...",782102,190202396,Fraud,2019-04-05 00:00:00,2018,ANZA ST \ 36TH AVE,37.777628,-122.496379,"{'coordinates': [-122.496379420441, 37.7776282...",Richmond,2019-03-21T15:45:00.000,II,Initial,Open or Active,78210209320,1.0


In [57]:
R_newsfdata.groupby('incident_category').count()

Unnamed: 0_level_0,analysis_neighborhood,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district,Clus_Db
incident_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Assault,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Burglary,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Fraud,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Larceny Theft,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Lost Property,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Motor Vehicle Theft,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Non-Criminal,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Offences Against The Family And Children,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Sex Offense,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [64]:
#Analyze Each Neighborhood

# one hot encoding

SF_onehot = pd.get_dummies(R_newsfdata[['incident_category']], prefix="", prefix_sep="")

# add analysis_neighborhood column back to dataframe

SF_onehot['analysis_neighborhood'] = R_newsfdata['analysis_neighborhood'] 

# move analysis_neighborhood column to the first column

fixed_columns = [SF_onehot.columns[-1]] + list(SF_onehot.columns[:-1])

SF_onehot = SF_onehot[fixed_columns]

SF_onehot.head()

Unnamed: 0,analysis_neighborhood,Assault,Burglary,Fraud,Larceny Theft,Lost Property,Motor Vehicle Theft,Non-Criminal,Offences Against The Family And Children,Sex Offense
0,North Beach,0,0,0,1,0,0,0,0,0
1,Mission,1,0,0,0,0,0,0,0,0
2,Mission,0,0,0,0,0,1,0,0,0
3,Noe Valley,0,0,0,0,1,0,0,0,0
4,Mission,0,1,0,0,0,0,0,0,0


In [65]:
SF_onehot.shape

(16, 10)

In [66]:
#Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

SF_grouped = SF_onehot.groupby('analysis_neighborhood').mean().reset_index()

SF_grouped

Unnamed: 0,analysis_neighborhood,Assault,Burglary,Fraud,Larceny Theft,Lost Property,Motor Vehicle Theft,Non-Criminal,Offences Against The Family And Children,Sex Offense
0,Castro/Upper Market,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Hayes Valley,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mission,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
3,Noe Valley,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,North Beach,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,Outer Richmond,0.25,0.0,0.25,0.25,0.0,0.0,0.0,0.25,0.0
6,Potrero Hill,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Presidio Heights,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,Russian Hill,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,South of Market,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5


In [94]:
#Let's print each neighborhood along with the top 5 most common incidents

num_top_incidents = 5

for hood in SF_grouped['analysis_neighborhood']:

    print("----"+hood+"----")

    temp = SF_grouped[SF_grouped['analysis_neighborhood'] == hood].T.reset_index()

    temp.columns = ['venue','freq']

    temp = temp.iloc[1:]

    temp['freq'] = temp['freq'].astype(float)

    temp = temp.round({'freq': 2})

    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_incidents))

    print('\n')
	

----Castro/Upper Market----
           venue  freq
0  Larceny Theft   1.0
1        Assault   0.0
2       Burglary   0.0
3          Fraud   0.0
4  Lost Property   0.0


----Hayes Valley----
           venue  freq
0        Assault   1.0
1       Burglary   0.0
2          Fraud   0.0
3  Larceny Theft   0.0
4  Lost Property   0.0


----Mission----
                 venue  freq
0              Assault  0.33
1             Burglary  0.33
2  Motor Vehicle Theft  0.33
3                Fraud  0.00
4        Larceny Theft  0.00


----Noe Valley----
           venue  freq
0  Lost Property   1.0
1        Assault   0.0
2       Burglary   0.0
3          Fraud   0.0
4  Larceny Theft   0.0


----North Beach----
           venue  freq
0  Larceny Theft   1.0
1        Assault   0.0
2       Burglary   0.0
3          Fraud   0.0
4  Lost Property   0.0


----Outer Richmond----
                                      venue  freq
0                                   Assault  0.25
1                                    

In [102]:
#Let's put that into a pandas dataframe

#First, let's write a function to sort the venues in descending order.

def return_most_common_incidents(row, num_top_inc):

    row_categories = row.iloc[1:]

    row_categories_sorted = row_categories.sort_values(ascending=False)

    

    return row_categories_sorted.index.values[0:num_top_inc]

In [104]:
#Now let's create the new dataframe and display the top 5 incidents for each neighborhood.

num_top_inc = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top incidents
columns = ['analysis_neighborhood']
for ind in np.arange(num_top_inc):
    try:
        columns.append('{}{} Most Common Incident'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Incident'.format(ind+1))

# create a new dataframe
neighborhoods_incidents_sorted = pd.DataFrame(columns=columns)
neighborhoods_incidents_sorted['analysis_neighborhood'] = SF_grouped['analysis_neighborhood']

for ind in np.arange(SF_grouped.shape[0]):
    neighborhoods_incidents_sorted.iloc[ind, 1:] = return_most_common_incidents(SF_grouped.iloc[ind, :], num_top_inc)

neighborhoods_incidents_sorted.head()

Unnamed: 0,analysis_neighborhood,1st Most Common Incident,2nd Most Common Incident,3rd Most Common Incident,4th Most Common Incident,5th Most Common Incident
0,Castro/Upper Market,Larceny Theft,Sex Offense,Offences Against The Family And Children,Non-Criminal,Motor Vehicle Theft
1,Hayes Valley,Assault,Sex Offense,Offences Against The Family And Children,Non-Criminal,Motor Vehicle Theft
2,Mission,Motor Vehicle Theft,Burglary,Assault,Sex Offense,Offences Against The Family And Children
3,Noe Valley,Lost Property,Sex Offense,Offences Against The Family And Children,Non-Criminal,Motor Vehicle Theft
4,North Beach,Larceny Theft,Sex Offense,Offences Against The Family And Children,Non-Criminal,Motor Vehicle Theft


In [115]:
#Cluster Neighborhoods

#Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters

kclusters = 5


In [116]:

SF_grouped_clustering = SF_grouped.drop('analysis_neighborhood', 1)


In [117]:
# run k-means clustering

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(SF_grouped_clustering)

In [118]:
# check cluster labels generated for each row in the dataframe

kmeans.labels_[0:10] 


array([1, 3, 3, 4, 1, 3, 2, 1, 1, 0], dtype=int32)

In [119]:
#Let's create a new dataframe that includes the cluster as well as the top 10 incidents for each neighborhood.

# add clustering labels
neighborhoods_incidents_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

SF_merged = R_newsfdata


ValueError: cannot insert Cluster Labels, already exists

In [113]:
SF_merged.head()

Unnamed: 0,analysis_neighborhood,incident_category,incident_code,incident_date,incident_datetime,incident_day_of_week,incident_description,incident_id,incident_number,incident_subcategory,incident_time,incident_year,intersection,latitude,longitude,point,police_district,report_datetime,report_type_code,report_type_description,resolution,row_id,supervisor_district,Clus_Db
0,North Beach,Larceny Theft,6374,2018-10-08T00:00:00.000,2018-10-08T18:50:00.000,Monday,"Theft, Other Property, >$950",781592,180763574,Larceny Theft - Other,2019-04-05 18:50:00,2018,STOCKTON ST \ NORTH POINT ST,37.806963,-122.410498,"{'coordinates': [-122.410497554147, 37.8069629...",Central,2019-03-16T11:57:00.000,IS,Coplogic Supplement,Open or Active,78159206374,3.0,-1
1,Mission,Assault,4134,2018-12-01T00:00:00.000,2018-12-01T00:00:00.000,Saturday,Battery,781664,190199298,Simple Assault,2019-04-05 00:00:00,2018,22ND ST \ POTRERO AVE,37.756834,-122.406699,"{'coordinates': [-122.406699002688, 37.7568337...",Mission,2019-03-20T12:10:00.000,II,Initial,Open or Active,78166404134,9.0,-1
2,Mission,Motor Vehicle Theft,7026,2018-12-13T00:00:00.000,2018-12-13T13:45:00.000,Thursday,"Vehicle, Stolen, Other Vehicle",781824,186287043,Motor Vehicle Theft,2019-04-05 13:45:00,2018,16TH ST \ CAPP ST,37.76511,-122.418698,"{'coordinates': [-122.418698426997, 37.7651099...",Mission,2019-03-20T22:15:00.000,VI,Vehicle Initial,Open or Active,78182407026,9.0,-1
3,Noe Valley,Lost Property,71000,2018-12-29T00:00:00.000,2018-12-29T20:00:00.000,Saturday,Lost Property,781513,196055426,Lost Property,2019-04-05 20:00:00,2018,HILL ST \ CHURCH ST,37.75569,-122.427813,"{'coordinates': [-122.427813059383, 37.7556902...",Mission,2019-03-19T13:41:00.000,II,Coplogic Initial,Open or Active,78151371000,8.0,-1
4,Mission,Burglary,5013,2018-05-24T00:00:00.000,2018-05-24T15:00:00.000,Thursday,"Burglary, Apartment House, Unlawful Entry",781330,180399339,Burglary - Residential,2019-04-05 15:00:00,2018,23RD ST \ HARRISON ST,37.75423,-122.412047,"{'coordinates': [-122.412047267025, 37.7542303...",Mission,2019-03-19T15:55:00.000,IS,Initial Supplement,Open or Active,78133005013,9.0,-1


In [114]:
SF_merged.columns

Index(['analysis_neighborhood', 'incident_category', 'incident_code',
       'incident_date', 'incident_datetime', 'incident_day_of_week',
       'incident_description', 'incident_id', 'incident_number',
       'incident_subcategory', 'incident_time', 'incident_year',
       'intersection', 'latitude', 'longitude', 'point', 'police_district',
       'report_datetime', 'report_type_code', 'report_type_description',
       'resolution', 'row_id', 'supervisor_district', 'Clus_Db'],
      dtype='object')

In [125]:
#Finally, let's visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(SF_merged['latitude'], SF_merged['longitude'], SF_merged['incident_category'], SF_merged['Clus_Db']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters