# Getting started with the SFPD data

In [7]:
# Getting relevant packages
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import urllib2
from collections import Counter 
import numpy as np
from operator import itemgetter
from scipy import linalg
import geoplotlib as gpl

In [2]:
# Importing data using pandas.
# This load has to run: every later cell reads `crime_data`, and with the
# line commented out the notebook fails with a NameError on first use.
crime_data = pd.read_csv('SFPD-Incidents_Jan2003.csv')

### Initial work on data

In [1]:
# Show the resulting crime data
#crime_data

In [None]:
print "Crimes in all:", len(crime_data)

In [None]:
# Extract set of crime categories
crime_categories = set(crime_data.Category)
print "No. of crime categories:", len(crime_categories), "\n", crime_categories

In [None]:
# Tally the number of occurrences of each category.
# Counter consumes the column directly, adding one per row.
crime_count = Counter(crime_data.Category)
crime_count

In [None]:
# Find most common crime: most_common(1) yields a one-element list
# holding the (category, count) pair with the highest count
crime_count.most_common(1)

In [None]:
# Find least common crime
least_common = crime_count.most_common()[-1][0]
print least_common + ":", crime_count[crime_count.most_common()[-1][0]]  

### Plot of crimes

In [None]:
# Bar chart of incident counts per category, most frequent category first
ranked_categories = crime_count.most_common()
labels, values = zip(*ranked_categories)
indexes = np.arange(len(labels))

fig = plt.figure(figsize=(16,8))
plt.bar(indexes, values, align='center', alpha=0.5, color='grey')
plt.title('Bar chart of crimes committed since 2003')
plt.xlabel('Crime Category')
plt.ylabel('Number of incidents')
# Category names are long, so the tick labels are drawn vertically
plt.xticks(indexes, labels, rotation=90)
plt.show()


### Change over time - crimes committed per year 2003-2016

In [None]:
# Year of every incident: the last "/"-separated field of the Date string
crimePerYear = [int(date.split("/")[-1]) for date in crime_data.Date]

# Store the years as a new column, then order the rows by year
crime_data['Year'] = crimePerYear
crime_data = crime_data.sort_values(by='Year')
crime_data

In [None]:
# Count number of crimes per year
year_count = Counter()
for year in crimePerYear:
    year_count[year] += 1

# Find avg number of crimes per year
print "Average number of crimes per year:", len(crime_data)/(2017-2003)

In [None]:
# The 14 crime categories singled out for closer analysis
focuscrimes = {
    'WEAPON LAWS',
    'PROSTITUTION',
    'DRIVING UNDER THE INFLUENCE',
    'ROBBERY',
    'BURGLARY',
    'ASSAULT',
    'DRUNKENNESS',
    'DRUG/NARCOTIC',
    'TRESPASS',
    'LARCENY/THEFT',
    'VANDALISM',
    'VEHICLE THEFT',
    'STOLEN PROPERTY',
    'DISORDERLY CONDUCT',
}

### Plotting for the focus crimes on a yearly basis

In [None]:
# Create a grid of 7 rows x 2 columns for the 14 crime subplots
# Run through each focus crime, aggregate data and plot on-the-fly

# Grid shape follows the number of focus crimes (7 rows for 14 crimes)
n_cols = 2
n_rows = (len(focuscrimes) + n_cols - 1) // n_cols

# Run through each focus crime category; plotVar is the 1-based subplot slot
for plotVar, crime in enumerate(focuscrimes, 1):
    # Make data structure with one crime category at a time
    temp_values = crime_data.loc[crime_data['Category'].isin([crime])]
    # Extract year per crime
    temp_crime_y = temp_values.Year.tolist()
    # Count no of crimes for each year. A fresh Counter is built per category:
    # reusing one counter without resetting it made every subplot show the
    # running total of all categories plotted before it.
    y_cnt = Counter(temp_crime_y)

    # A category without any rows has nothing to draw (and zip(*[]) below
    # could not be unpacked), so its slot is left empty
    if not y_cnt:
        continue

    # Sort plot values by year
    labels, values = zip(*sorted(y_cnt.items()))
    indexes = np.arange(len(labels))

    # Add plot variables
    plt.subplot(n_rows, n_cols, plotVar)
    plt.annotate(crime, xy=(1, 0), xycoords='axes fraction',
                 xytext=(-200, 80), textcoords='offset pixels',
                 horizontalalignment='right',verticalalignment='bottom') # Labels on each plot
    plt.bar(indexes, values, align='center', alpha=0.5, color='grey')

    # Axis ticks and labels
    plt.xticks(indexes, labels)
    plt.ylabel('Crime count')
    plt.xlabel('Year')

# The layout only has to be set once, after all subplots exist
plt.subplots_adjust(bottom=.01, left=.01, right=2.5, top=4.0, hspace=.75)
plt.show()


#### Comment on the plots
>*Note!* 2017 has been included, but since we are only in February 2017 the data for that year is of course incomplete.

* For all categories within the focus crimes, the years 2003-2005 seem to have had increased numbers of crimes. Why this is the case is hard to tell from this data alone. Maybe driving under the influence and vandalism were positively affected by the 'engine immobilizer technology', but this is a long shot.
* 2010-2011 is definitely an all-time low for most crimes; why, I cannot say.
* Vehicle theft, disorderly conduct and vandalism seem to have peaked in 2015.

### Plotting the number of crimes per hour

In [None]:
# Hour of every incident: the field before the first ":" of the Time string
crimePerHour = [int(hour.split(":")[0]) for hour in crime_data.Time]

# Store the hours as a new column of the existing dataframe
crime_data['Hour'] = crimePerHour
crime_data

In [None]:
# Tally incidents per hour of day; Counter adds one per list element
hour_count = Counter(crimePerHour)

In [None]:
# Plot bar chart of crimes per hour of day.
# The items are sorted by hour first: a Counter makes no promise about its
# iteration order, so unsorted bars could appear out of chronological order.
labels, values = zip(*sorted(hour_count.items()))
indexes = np.arange(len(labels))

fig = plt.figure(figsize=(16,8))
plt.bar(indexes, values, align='center', alpha=0.5, color='grey')
plt.xticks(indexes, labels)
plt.ylabel('Number of incidents')
# The x-axis shows hours, not crime categories (the label was copied over
# from the category chart)
plt.xlabel('Hour of day')
plt.title('Bar chart of the hour of crimes committed in San Fran since 2003')
_, labels = plt.xticks()
plt.setp(labels, rotation=90)
plt.show()


#### Comment on the plot

* Considering the peak at 12, I first thought it was simply because this is part of the time when most people are away from home, maybe out for lunch. However, I don't feel that this is reason enough to conclude that this is why, especially since the number of crimes keeps rising all the way until 6PM! Are people eating out a lot? *Or maybe they first realize that they have been robbed when they get home (from work etc.)?*
* The quite low number of crimes from 2-7 makes sense since most people are at home, sleeping, and the "crime riders" might even sleep themselves. At least that's what my naïve brain thinks.
* I found [this article](https://www.fastcodesign.com/1664491/infographic-of-the-day-when-do-criminals-prowl-the-streets) describing different theories on when and why different crimes occur at the times they do for some states in the US; it is an easy read.

In [None]:
# Lay the 14 focus crimes out on a grid of 7 rows x 2 columns,
# aggregating and drawing one category at a time

# Working state reused across iterations
h_cnt = Counter()
temp_crime_h = []

# enumerate(..., 1) supplies the 1-based subplot position of each category
for plotVar, crime in enumerate(focuscrimes, 1):
    # Restrict the data to the current category
    temp_values = crime_data.loc[crime_data['Category'].isin([crime])]
    # Tally this category's incidents by hour of day
    temp_crime_h = temp_values.Hour.tolist()
    h_cnt.update(temp_crime_h)

    # Order the bars by hour
    labels, values = zip(*sorted(h_cnt.items()))
    indexes = np.arange(len(labels))

    # Draw the bars and tag the subplot with the category name
    plt.subplot(7, 2, plotVar)
    plt.bar(indexes, values, align='center', alpha=0.5, color='grey')
    plt.annotate(crime, xy=(1, 0), xycoords='axes fraction',
                 xytext=(-250, 80), textcoords='offset pixels',
                 horizontalalignment='right',verticalalignment='bottom')

    # Axis ticks, labels and overall layout
    plt.xticks(indexes, labels)
    plt.xlabel('Hour of day')
    plt.ylabel('Crime count')
    plt.subplots_adjust(bottom=.01, left=.01, right=2.5, top=4.0, hspace=.75)

    # Empty the counter so the next category starts from zero
    h_cnt.clear()

plt.show()


#### Comment on plot

**Quick 'facts'**
* Disorderly conduct is high in the morning
* Vehicle theft happens in the evening
* Narcotics peaks in the afternoon
* Driving under the influence is a night-time-thing
* Prostitution most commonly happens between 6PM and midnight
* People are most often disturbed by disorderly conduct in the morning hours 6AM-9AM.
* The number of robberies increases throughout the day.  

**Theories**
* Stealing cars happens at night when it is darker and you will be less easy to recognize.
* Prostitution is for when you get home from work.

The peak of burglary, larceny/theft and vehicle theft, prostitution etc. around noon, I cannot explain.

### Crime per district

In [None]:
# Collect the distinct police districts found in the PdDistrict column
crime_district = set(crime_data['PdDistrict'])
crime_district

In [None]:
# Initialize crime counters
dist_cnt_c = Counter()
dist_cnt_fc = Counter()
noOfCrimes = []
focuscrimes = ['WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS', 'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'DISORDERLY CONDUCT']

# Crimes per district
for dist in crime_district:
    # Extracting district crimes
    crimes  = crime_data.loc[crime_data['PdDistrict'].isin([dist])]
    
    # Using the district crimes to extract the focus crimes
    fcrimes = crimes.loc[crimes['Category'].isin(focuscrimes)]  

    # Count no of crimes for each year
    dist_cnt_c[dist]  = len(crimes.Category.tolist())
    dist_cnt_fc[dist] = len(fcrimes.Category.tolist())

# Printing relevant information about district crime
print "Number of crimes: \n",             dist_cnt_c.items()
print "\nNumber of focus crimes:\n",      dist_cnt_fc.items()
print "\nMost number of crimes:\n",       dist_cnt_c.most_common(1)
print "\nMost number of focus crimes:\n", dist_cnt_fc.most_common(1)

### Frequency of crimes in certain neighborhoods

Getting the normalized version of the first histogram

In [None]:
# Per-category crime counts as an array
p_crime = np.array(crime_count.values())

# Divide every count by the norm of the whole count vector; the norm is
# the same for all elements, so it is computed once up front.
# NOTE(review): linalg.norm is the Euclidean norm here -- if relative
# frequencies (summing to 1) were intended, divide by the sum instead.
p_crime_norm = linalg.norm(p_crime)
allCrimesNormalized = [number / p_crime_norm for number in p_crime]

In [None]:
# Keep only the normalized values that belong to focus crimes.
# The previous loop called pop() once per non-focus category, which removed
# values from the END of the list no matter which category they belonged to,
# so the surviving values were not those of the focus crimes.
# The values are rebuilt from crime_count so each one stays paired with its
# category, and so re-running the cell gives the same result.
all_crimes_norm = linalg.norm(np.array(crime_count.values()))
allCrimesNormalized = [count / all_crimes_norm
                       for category, count in crime_count.items()
                       if category in focuscrimes]
allCrimesNormalized

In [None]:
# Per-district crime counts as an array
p_crime_d = np.array(dist_cnt_c.values())
# Divide every count by the norm of the whole count vector (computed once)
p_crime_d_norm = linalg.norm(p_crime_d)
districtCrimesNormalized = [number / p_crime_d_norm for number in p_crime_d]
districtCrimesNormalized

## Geo-plotting the coordinate data!

I will be using the pandas dataframe as this is what I have been working with through all exercises.

In [None]:
# Pull the X/Y coordinate columns into their own dataframe and relabel
# them lon/lat in the same step
geo_locations = crime_data[['X','Y']].rename(columns={'X': 'lon', 'Y': 'lat'})
#geo_locations2 = geo_locations[:len(geo_locations)/2] # Working on a smaller dataset

In [None]:
# Saving to CSV to check if it works better to load.
# No `index=` argument is passed, so pandas' default applies and the row
# index is written as an extra leading column next to lon/lat.
geo_locations.to_csv('geo_data.csv', sep=',', encoding='utf-8')

In [None]:
# Get values in dict form.
# orient='list' yields {'lon': [...], 'lat': [...]}. The default orientation
# nests a {row index: value} dict under each column, which is not the
# array-like 'lon'/'lat' data that geoplotlib's dot layer works on.
geo_dict = geo_locations.to_dict(orient='list')
#geo_dict.keys()

In [None]:
# Plotting the data w. geoplotlib: one dot of size 1 per coordinate pair,
# drawn with colour 'b' (presumably blue -- confirm against geoplotlib)
gpl.dot(geo_dict,color='b',point_size=1)
#gpl.dot(geo_locations2,color='b',point_size=1)
# NOTE(review): the disabled bounding box below reads `location_dict`, which is
# not defined in this notebook, and uses max() for both north and south and
# min() for both west and east -- fix before re-enabling.
#bbox = BoundingBox(north=max(location_dict['lat']), west=min(location_dict['lon']), south=max(location_dict['lat']), east=min(location_dict['lon']))
#gpl.set_bbox(bbox)
gpl.show()

In [None]:
from geoplotlib.utils import read_csv

# Read the coordinate CSV back in through geoplotlib's own reader and plot
# it as dots with geoplotlib's default styling (no columns are renamed here)
data2 = read_csv('geo_data.csv')
#data = read_csv('bus.csv')
gpl.dot(data2)
gpl.show()