## Questions

1.	What factors play into police stops?
    a.	Given the data recorded about a particular stop, to what extent can we predict the race of the subject?
    i.	AM vs. PM
2.	Can we predict which precincts are more likely to pull over a particular gender/race?
    a.	AM vs. PM
3.	How well does the race assumed before the stop (`preRace`) agree with the race actually recorded (`race`)?
4.	Are there times of day with more citations (morning, afternoon, evening, night)?
5. Does the type of stop affect whether a person is searched? H1: If suspicious-vehicle stops are related to a person being searched, then the percentage of searches will be higher for suspicious-vehicle stops than for regular traffic stops over the year. H0: If suspicious-vehicle stops are not related to a person being searched, then the percentage of searches will be approximately equal for both stop types over the year.



In [49]:
# importing dependencies
import numpy as np 
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import *
import requests
import time
import json
import datetime
from pprint import pprint

# Google developer API key
#from config import google_key

# Census developer API key
#from config import census_key

## Data Load, Prep and Review

In [50]:
# reading csv file
StopOriginal = pd.read_csv('MplsStops_base.csv', parse_dates=['date'], infer_datetime_format=True)

StopOriginal['Time'] = StopOriginal.date.dt.time

# def get_sec(time_str):
#     h, m, s = time_str.split(':')
#     return int(h) * 3600 + int(m) * 60 + int(s)

# for time in StopOriginal:
#     convertedTime = get_sec(time['Time'])
#     StopOriginal.append({'convertedTime': convertedTime})

# displaying file header and five rows
StopOriginal.head()

Unnamed: 0.1,Unnamed: 0,idNum,date,problem,MDC,citationIssued,personSearch,vehicleSearch,preRace,race,gender,lat,long,policePrecinct,neighborhood,Time
0,6823,17-000003,2017-01-01 00:00:00,suspicious,MDC,,NO,NO,Unknown,Unknown,Unknown,44.966617,-93.246458,1,Cedar Riverside,00:00:00
1,6824,17-000007,2017-01-01 00:03:00,suspicious,MDC,,NO,NO,Unknown,Unknown,Male,44.98045,-93.27134,1,Downtown West,00:03:00
2,6825,17-000073,2017-01-01 00:23:00,traffic,MDC,,NO,NO,Unknown,White,Female,44.94835,-93.27538,5,Whittier,00:23:00
3,6826,17-000092,2017-01-01 00:33:00,suspicious,MDC,,NO,NO,Unknown,East African,Male,44.94836,-93.28135,5,Whittier,00:33:00
4,6827,17-000098,2017-01-01 00:37:00,traffic,MDC,,NO,NO,Unknown,White,Female,44.979078,-93.262076,1,Downtown West,00:37:00


In [51]:
# Abandoned draft: convert each stop's time to seconds since midnight.
# NOTE(review): as written this cannot work -- iterating a DataFrame directly
# yields column labels rather than rows, 'date' holds datetimes (no .split),
# and DataFrame.append does not add a column. Kept for reference only.
# for row in StopOriginal:
#     newTime = sum(int(x) * 60 ** i for i,x in enumerate(reversed(row['date'].split(":"))))
#     StopOriginal.append({'Time in Seconds': newTime})

In [52]:
# Inspect the dtype of every column (confirms 'date' was parsed as a datetime).
StopOriginal.dtypes

Unnamed: 0                 int64
idNum                     object
date              datetime64[ns]
problem                   object
MDC                       object
citationIssued            object
personSearch              object
vehicleSearch             object
preRace                   object
race                      object
gender                    object
lat                      float64
long                     float64
policePrecinct             int64
neighborhood              object
Time                      object
dtype: object

In [53]:
# Number of stops recorded in each police precinct, using the data exactly as
# loaded (no rows filtered out yet).
StopOriginal['policePrecinct'].value_counts()

5    12825
4    12273
3     9792
2     9478
1     7552
Name: policePrecinct, dtype: int64

In [54]:
# Work on an independent copy of the loaded data. Wrapping an existing frame in
# pd.DataFrame() does not copy it by default, so changes made through initDF
# could otherwise leak back into StopOriginal.
initDF = StopOriginal.copy()
initDF.head()

Unnamed: 0.1,Unnamed: 0,idNum,date,problem,MDC,citationIssued,personSearch,vehicleSearch,preRace,race,gender,lat,long,policePrecinct,neighborhood,Time
0,6823,17-000003,2017-01-01 00:00:00,suspicious,MDC,,NO,NO,Unknown,Unknown,Unknown,44.966617,-93.246458,1,Cedar Riverside,00:00:00
1,6824,17-000007,2017-01-01 00:03:00,suspicious,MDC,,NO,NO,Unknown,Unknown,Male,44.98045,-93.27134,1,Downtown West,00:03:00
2,6825,17-000073,2017-01-01 00:23:00,traffic,MDC,,NO,NO,Unknown,White,Female,44.94835,-93.27538,5,Whittier,00:23:00
3,6826,17-000092,2017-01-01 00:33:00,suspicious,MDC,,NO,NO,Unknown,East African,Male,44.94836,-93.28135,5,Whittier,00:33:00
4,6827,17-000098,2017-01-01 00:37:00,traffic,MDC,,NO,NO,Unknown,White,Female,44.979078,-93.262076,1,Downtown West,00:37:00


In [55]:
# Discard every row whose MDC value is 'other', then recount the stops per
# police precinct on what is left.
rows_to_remove = initDF.loc[initDF['MDC'].eq('other')].index
cleanDF = initDF.drop(index=rows_to_remove)
cleanDF['policePrecinct'].value_counts()

5    11435
4    10726
3     8345
2     7944
1     5249
Name: policePrecinct, dtype: int64

In [56]:
# One subset of the cleaned stops per police precinct (1 through 5).
precinct_one, precinct_two, precinct_three, precinct_four, precinct_five = (
    cleanDF.loc[cleanDF['policePrecinct'] == precinct_number]
    for precinct_number in range(1, 6)
)

In [57]:
# Breakdown of precinct 1 stops by the value of the 'problem' column.
one = precinct_one['problem'].value_counts()
one

suspicious    3326
traffic       1923
Name: problem, dtype: int64

In [58]:
# Total number of stops in the cleaned data, obtained by summing the
# per-precinct counts.
# The counts Series is renamed explicitly: pandas >= 2.0 names the result of
# value_counts() 'count' instead of the source column, which would otherwise
# make the 'policePrecinct' lookup below fail.
precinct_df = cleanDF['policePrecinct'].value_counts().rename('policePrecinct').to_frame()
stops = precinct_df['policePrecinct'].sum()
stops

43699

In [59]:
# Count of stops by race, with each race's share of all stops.
# The counts Series is renamed to 'race' explicitly so the column exists under
# that name on every pandas version (pandas >= 2.0 would call it 'count').
race_count = cleanDF['race'].value_counts().rename('race').to_frame()

# Share of the total number of stops, rendered as text with two decimals.
race_count['%'] = (race_count['race'] / stops * 100).map("% {:.2f}".format)
race_count
# TODO: visualise these shares (a pie chart was sketched here but never enabled).

Unnamed: 0,race,%
Black,15220,% 34.83
White,11703,% 26.78
Unknown,9219,% 21.10
East African,2188,% 5.01
Latino,1858,% 4.25
Native American,1516,% 3.47
Other,1348,% 3.08
Asian,647,% 1.48


## Q4: Are there instances of higher stops by morning, afternoon, evening, night?

In [60]:
# Group stops by time of day.
# Only the clock time matters here, so every timestamp is reduced to the time
# elapsed since its own midnight and that offset becomes the row index (named
# 'id_date'). The subtraction is vectorised instead of applying a Python lambda
# to each row, and no temporary column or inplace set_index is needed.
stop_moment = pd.to_datetime(cleanDF['date'], errors='coerce')
cleanDF.index = pd.Index(stop_moment - stop_moment.dt.normalize(), name='id_date')

# Four six-hour windows starting at 00:00, 06:00, 12:00 and 18:00; the same
# grouper is reused by the cells that follow.
grouper = pd.Grouper(freq='6h')
time_of_day = cleanDF.groupby(grouper).count()
time_of_day

Unnamed: 0_level_0,Unnamed: 0,idNum,date,problem,MDC,citationIssued,personSearch,vehicleSearch,preRace,race,gender,lat,long,policePrecinct,neighborhood,Time
id_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00:00:00,8813,8813,8813,8813,8813,3531,8813,8813,8813,8813,8799,8813,8813,8813,8813,8813
06:00:00,6681,6681,6681,6681,6681,3103,6681,6681,6681,6681,6673,6681,6681,6681,6681,6681
12:00:00,11809,11809,11809,11809,11809,5466,11809,11809,11809,11809,11793,11809,11809,11809,11809,11809
18:00:00,16396,16396,16396,16396,16396,7010,16396,16396,16396,16396,16373,16396,16396,16396,16396,16396


In [61]:
# Stops per police precinct within each six-hour window.
timeDF = (
    cleanDF
    .groupby(grouper)['policePrecinct']
    .value_counts()
    .to_frame()
)
timeDF

Unnamed: 0_level_0,Unnamed: 1_level_0,policePrecinct
id_date,policePrecinct,Unnamed: 2_level_1
00:00:00,2,2049
00:00:00,5,1951
00:00:00,3,1790
00:00:00,1,1512
00:00:00,4,1511
06:00:00,5,2362
06:00:00,2,1343
06:00:00,3,1249
06:00:00,4,1137
06:00:00,1,590


In [75]:
# Stops per race, split by six-hour window and by the 'problem' column.
timeDF4 = (
    cleanDF
    .groupby([grouper, 'problem'])['race']
    .value_counts()
    .to_frame()
)
timeDF4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,race
id_date,problem,race,Unnamed: 3_level_1
00:00:00,suspicious,Unknown,1623
00:00:00,suspicious,Black,1348
00:00:00,suspicious,White,851
00:00:00,suspicious,Native American,189
00:00:00,suspicious,Latino,139
00:00:00,suspicious,East African,111
00:00:00,suspicious,Other,100
00:00:00,suspicious,Asian,29
00:00:00,traffic,Black,2091
00:00:00,traffic,White,1236


In [62]:
# Stops per 'problem' value within each six-hour window.
timeDF2 = (
    cleanDF
    .groupby(grouper)['problem']
    .value_counts()
    .to_frame()
)
timeDF2

Unnamed: 0_level_0,Unnamed: 1_level_0,problem
id_date,problem,Unnamed: 2_level_1
00:00:00,traffic,4423
00:00:00,suspicious,4390
06:00:00,traffic,3456
06:00:00,suspicious,3225
12:00:00,traffic,6136
12:00:00,suspicious,5673
18:00:00,traffic,9199
18:00:00,suspicious,7197


In [70]:
# Stops per race within each six-hour window.
timeDF3 = (
    cleanDF
    .groupby(grouper)['race']
    .value_counts()
    .to_frame()
)
timeDF3

Unnamed: 0_level_0,Unnamed: 1_level_0,race
id_date,race,Unnamed: 2_level_1
00:00:00,Black,3439
00:00:00,White,2087
00:00:00,Unknown,1769
00:00:00,Latino,442
00:00:00,East African,438
00:00:00,Other,268
00:00:00,Native American,255
00:00:00,Asian,115
06:00:00,White,2439
06:00:00,Black,1646


In [71]:
# Stops per precinct by race: one row per race, one column per precinct, plus a
# row total.
precinct_frames = {
    'one': precinct_one,
    'two': precinct_two,
    'three': precinct_three,
    'four': precinct_four,
    'five': precinct_five,
}

# Count of stops per race inside each precinct. Each counts Series is renamed to
# its precinct label directly, so the column name does not depend on what the
# pandas version calls the result of value_counts().
race_counts_by_precinct = {
    label: frame['race'].value_counts().rename(label).to_frame()
    for label, frame in precinct_frames.items()
}

# Keep the individual per-precinct frames available under their original names.
one_race_count = race_counts_by_precinct['one']
two_race_count = race_counts_by_precinct['two']
three_race_count = race_counts_by_precinct['three']
four_race_count = race_counts_by_precinct['four']
five_race_count = race_counts_by_precinct['five']

# Align the five frames on race (sorted alphabetically) and expose the race
# labels as a regular column.
precinct_columns = list(precinct_frames)
stops_per_precinct_by_race = (
    pd.concat(list(race_counts_by_precinct.values()), axis=1, sort=True)
    .rename_axis('race')
    .reset_index()
)
stops_per_precinct_by_race = stops_per_precinct_by_race[['race'] + precinct_columns]

# Sum only the numeric precinct columns; summing the whole frame would also
# include the text 'race' column, which newer pandas refuses to add up.
stops_per_precinct_by_race['Total'] = stops_per_precinct_by_race[precinct_columns].sum(axis=1)

stops_per_precinct_by_race

Unnamed: 0,race,one,two,three,four,five,Total
0,Asian,65,165,53,177,187,647
1,Black,2176,2226,2059,6183,2576,15220
2,East African,286,364,463,127,948,2188
3,Latino,100,429,543,216,570,1858
4,Native American,192,119,750,108,347,1516
5,Other,165,304,273,228,378,1348
6,Unknown,897,1033,2893,2494,1902,9219
7,White,1368,3304,1311,1193,4527,11703


In [46]:
# Stops by gender for each precinct, side by side, with an overall total and
# each gender's share of that total.
gender_counts = [
    pd.DataFrame(frame['gender'].value_counts())
    for frame in (precinct_one, precinct_two, precinct_three, precinct_four, precinct_five)
]
one_sex, two_sex, three_sex, four_sex, five_sex = gender_counts

precinct_labels = ['One','Two','Three','Four','Five']
precincts_sex = pd.concat(gender_counts, axis=1, sort=False)
precincts_sex.columns = precinct_labels

# Row total across the five precinct columns, then the share of all counted stops.
precincts_sex['Total'] = precincts_sex[precinct_labels].sum(axis=1)
gender_share = precincts_sex['Total'] / precincts_sex['Total'].sum() * 100
precincts_sex['%'] = gender_share.map("% {:.2f}".format)
precincts_sex

Unnamed: 0,One,Two,Three,Four,Five,Total,%
Male,3664,5181,4816,6468,7002,27131,% 62.17
Female,985,2109,1598,2297,3026,10015,% 22.95
Unknown,593,645,1922,1943,1389,6492,% 14.88


In [47]:
# Overall stop totals and percentage share by gender, across all precincts.
stop_by_gender = precincts_sex.loc[:, ['Total', '%']]
stop_by_gender

Unnamed: 0,Total,%
Male,27131,% 62.17
Female,10015,% 22.95
Unknown,6492,% 14.88


In [60]:
# Group stops into six-hour windows by time of day, using a column as the key.
# pd.Grouper expects a single column label for `key` (passing a list as the
# first positional argument is what raised the TypeError), and `freq` only
# works on datetime-like or timedelta values -- not on the datetime.time
# objects stored in 'Time'. The stops are therefore grouped on the time elapsed
# since midnight, derived from the parsed 'date' column.
time_since_midnight = cleanDF['date'] - cleanDF['date'].dt.normalize()

# A new name is used on purpose so the earlier `timeDF` table is not overwritten.
stops_by_window = (
    cleanDF
    .assign(time_since_midnight=time_since_midnight)
    .groupby(pd.Grouper(key='time_since_midnight', freq='6h'))
)

# Number of stops falling in each six-hour window.
stops_by_window.size()

In [49]:
# Label every stop with the part of the day in which it happened.
# Clock times cannot be written as bare literals such as 5:59:59 (that was the
# SyntaxError), so the stops are binned on the hour of day instead. With
# right=False each bin is [start, end): hours 0-5 -> Night, 6-11 -> Morning,
# 12-17 -> AfterNoon, 18-23 -> Evening.
bins = [0, 6, 12, 18, 24]
groups = ["Night","Morning","AfterNoon","Evening"]

cleanDF["Time Segments"] = pd.cut(cleanDF["date"].dt.hour, bins=bins, labels=groups, right=False)
cleanDF.head()

## Initial Plots / Testing