## Toronto Crime Analysis

### This notebook analyzes crime trends over time from 2014 to 2017

In [1]:
# Select matplotlib's "notebook" backend for every figure drawn below.
# NOTE(review): with this backend a figure presumably stays "current" across
# cells, so later pyplot calls may draw into an earlier figure — confirm.
%matplotlib notebook

In [2]:
import requests
import json
import pandas as pd
from matplotlib import pyplot as plt
from pprint import pprint
from pandas.io.json import json_normalize
from IPython.display import clear_output
from scipy import stats

In [3]:
# Load the crime records; the first CSV column is used as the row index.
# read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame(...) again.
crimes_df = pd.read_csv('../output/crimes_csv.csv', index_col=0)
crimes_df.head()

Unnamed: 0,Hood_ID,Index_,MCI,Neighbourhood,event_unique_id,occurrencedate,occurrenceday,occurrencedayofweek,occurrencedayofyear,occurrencehour,occurrencemonth,occurrenceyear,offence,premisetype
0,84,976,Assault,Little Portugal (84),GO-2015942420,1388552400000,1,Wednesday,1,0,January,2014,Assault,Apartment
1,25,977,Assault,Glenfield-Jane Heights (25),GO-20162295518,1388552400000,1,Wednesday,1,0,January,2014,Assault,Apartment
2,71,978,Assault,Cabbagetown-South St.James Town (71),GO-20142458487,1388552400000,1,Wednesday,1,0,January,2014,Assault,Apartment
3,71,979,Assault,Cabbagetown-South St.James Town (71),GO-20142458487,1388552400000,1,Wednesday,1,0,January,2014,Assault,Apartment
4,58,980,Theft Over,Old East York (58),GO-20141573896,1388552400000,1,Wednesday,1,0,January,2014,Theft Over,House


In [4]:
# Count events for every (year, month) combination.
# count() tallies the non-null event ids inside each group.
crimes_per_month = (
    crimes_df
    .groupby(by=['occurrenceyear', 'occurrencemonth'])['event_unique_id']
    .count()
)
# Flatten the grouped Series into a table whose group keys are ordinary columns.
crimes_per_month_df = crimes_per_month.to_frame().reset_index()
crimes_per_month_df.head()

Unnamed: 0,occurrenceyear,occurrencemonth,event_unique_id
0,2014,April,2507
1,2014,August,2760
2,2014,December,2472
3,2014,February,2192
4,2014,January,2430


In [5]:
# Build a "<month name> <year>" label (e.g. "April 2014") for each row.
# Both parts are cast to str so the cell still works when it is re-run after
# 'occurrencemonth' has been converted to a categorical column.
crimes_per_month_df['Month'] = (
    crimes_per_month_df['occurrencemonth'].astype(str)
    + " "
    + crimes_per_month_df['occurrenceyear'].astype(str)
)

In [6]:
crimes_per_month_df.head()

Unnamed: 0,occurrenceyear,occurrencemonth,event_unique_id,Month
0,2014,April,2507,April 2014
1,2014,August,2760,August 2014
2,2014,December,2472,December 2014
3,2014,February,2192,February 2014
4,2014,January,2430,January 2014


In [7]:
# Month names in calendar order; used as ordered categories so that sorting
# by month is chronological instead of alphabetical.
Months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

In [8]:
# Re-type the month column as an ordered categorical that follows `Months`,
# so sorting on it uses calendar order.
crimes_per_month_df['occurrencemonth'] = pd.Categorical(
    values=crimes_per_month_df['occurrencemonth'],
    categories=Months,
    ordered=True,
)

In [9]:
# Order rows by year, then by month within each year.
crimes_per_month_df_sorted = crimes_per_month_df.sort_values(
    by=['occurrenceyear', 'occurrencemonth'],
)
crimes_per_month_df_sorted.head()

Unnamed: 0,occurrenceyear,occurrencemonth,event_unique_id,Month
4,2014,January,2430,January 2014
3,2014,February,2192,February 2014
7,2014,March,2418,March 2014
0,2014,April,2507,April 2014
8,2014,May,2854,May 2014


In [10]:
# Pull the two plotting series out of the sorted frame so both share its row order.
month = crimes_per_month_df_sorted.loc[:, 'Month']
counts_per_month = crimes_per_month_df_sorted.loc[:, 'event_unique_id']

In [46]:
# Plot the monthly crime count as a line chart and save it to ../output.
# A fresh figure is created explicitly so that re-running the cell does not
# add another line to whatever figure happens to be current.
fig, ax = plt.subplots()
ax.plot(month, counts_per_month)
# Many month labels share the x-axis: rotate them and shrink the font.
ax.tick_params(axis='x', labelrotation=90, labelsize=6)
ax.set_title('Crimes Over Time')
ax.set_ylabel('Crime count')
ax.set_xlabel('Month')
fig.tight_layout()
fig.savefig('../output/total_crimes_over_time')
plt.show()

<IPython.core.display.Javascript object>

### Crime counts dip noticeably in January–February of each year
### The overall trend is upward :(


In [12]:
# let's look at the same trend per neighbourhood

In [13]:
# Count events per neighbourhood first, then per year and month within it.
# count() tallies the non-null event ids inside each group.
crimes_per_hood = (
    crimes_df
    .groupby(by=['Neighbourhood', 'occurrenceyear', 'occurrencemonth'])['event_unique_id']
    .count()
)
# Flatten the grouped Series into a table whose group keys are ordinary columns.
crimes_per_hood_df = crimes_per_hood.to_frame().reset_index()
crimes_per_hood_df.head()

Unnamed: 0,Neighbourhood,occurrenceyear,occurrencemonth,event_unique_id
0,Agincourt North (129),2014,April,19
1,Agincourt North (129),2014,August,15
2,Agincourt North (129),2014,December,14
3,Agincourt North (129),2014,February,17
4,Agincourt North (129),2014,January,20


In [14]:
# Build a "<month name> <year>" label (e.g. "April 2014") for each row.
# Both parts are cast to str so the cell still works when it is re-run after
# 'occurrencemonth' has been converted to a categorical column.
crimes_per_hood_df['Month'] = (
    crimes_per_hood_df['occurrencemonth'].astype(str)
    + " "
    + crimes_per_hood_df['occurrenceyear'].astype(str)
)

In [15]:
# Re-type the month column as an ordered categorical that follows `Months`,
# so sorting on it uses calendar order.
crimes_per_hood_df['occurrencemonth'] = pd.Categorical(
    values=crimes_per_hood_df['occurrencemonth'],
    categories=Months,
    ordered=True,
)

In [16]:
# Order rows by neighbourhood, then by year, then by month.
crimes_per_hood_df_sorted = crimes_per_hood_df.sort_values(
    by=['Neighbourhood', 'occurrenceyear', 'occurrencemonth'],
)
crimes_per_hood_df_sorted.head()

Unnamed: 0,Neighbourhood,occurrenceyear,occurrencemonth,event_unique_id,Month
4,Agincourt North (129),2014,January,20,January 2014
3,Agincourt North (129),2014,February,17,February 2014
7,Agincourt North (129),2014,March,12,March 2014
0,Agincourt North (129),2014,April,19,April 2014
8,Agincourt North (129),2014,May,6,May 2014


In [17]:
# Distinct neighbourhood names taken from the sorted frame, followed by how many there are.
hood_column = crimes_per_hood_df_sorted['Neighbourhood']
neighbourhoods = hood_column.unique()
len(neighbourhoods)

140

In [18]:
# Split the sorted frame into one count series and one month-label series per
# neighbourhood. A single groupby pass replaces building two boolean masks over
# the whole frame for every neighbourhood.
counts = {}
months = {}
# sort=False keeps groups in first-appearance order; rows inside a group keep
# the order they have in the sorted frame.
for neighbourhood, hood_rows in crimes_per_hood_df_sorted.groupby('Neighbourhood', sort=False):
    counts[neighbourhood] = hood_rows['event_unique_id']
    months[neighbourhood] = hood_rows['Month']

In [19]:
# Draw one line per neighbourhood on a single chart.
# A new figure is opened instead of calling plt.clf(), which would wipe
# whichever figure is currently active (possibly the chart from an earlier cell).
fig, ax = plt.subplots()
for neighbourhood in neighbourhoods:
    ax.plot(months[neighbourhood], counts[neighbourhood], label=neighbourhood)

# Many month labels share the x-axis: rotate them and shrink the font.
ax.tick_params(axis='x', labelrotation=90, labelsize=6)
ax.set_title('Crimes Over Time')
ax.set_ylabel('Crime count')
ax.set_xlabel('Month')
fig.tight_layout()
ax.legend()
plt.show()

In [28]:
# With every neighbourhood on one chart nothing is readable, so measure how
# much each neighbourhood's monthly count varies: the gap between its highest
# and its lowest monthly count.
changes = {
    neighbourhood: counts[neighbourhood].max() - counts[neighbourhood].min()
    for neighbourhood in neighbourhoods
}

In [31]:
# Rank the change values, largest first.
sorted_changes = sorted(changes.values(), reverse=True)

In [40]:
# Keep the largest change values. The slice takes five values (not ten), so
# the cutoff is named instead of being left as a bare number.
TOP_N_CHANGES = 5
high_changes = sorted_changes[:TOP_N_CHANGES]
high_changes

[110, 87, 57, 56, 53]

In [41]:
# Pick the neighbourhoods with the largest changes: exactly as many as there
# are values in high_changes. Matching on the change value alone would return
# extra neighbourhoods whenever several of them tie at the cutoff value.
# sorted() is stable, so ties are resolved in favour of the earlier neighbourhood.
ranked_hoods = sorted(neighbourhoods, key=lambda hood: changes[hood], reverse=True)
top_hoods = set(ranked_hoods[:len(high_changes)])
# Filter the original sequence so the result keeps the order of `neighbourhoods`.
neighbourhoods_of_interest = [hood for hood in neighbourhoods if hood in top_hoods]

In [45]:
# Plot only the selected neighbourhoods and save the chart to ../output.
# A new figure is opened instead of calling plt.clf(), which would wipe
# whichever figure is currently active (possibly the chart from an earlier cell).
fig, ax = plt.subplots()
for neighbourhood in neighbourhoods_of_interest:
    ax.plot(months[neighbourhood], counts[neighbourhood], label=neighbourhood)

# Many month labels share the x-axis: rotate them and shrink the font.
ax.tick_params(axis='x', labelrotation=90, labelsize=6)
ax.set_title('Crimes Over Time Per Neighborhoods')
ax.set_ylabel('Crime count')
ax.set_xlabel('Month')
ax.legend()
fig.tight_layout()
fig.savefig('../output/crimes_over_time_for_neighborhoods_with_largest_change')
plt.show()