# 2 - Plot Categorical

Import dependencies

In [1]:
import altair as alt
import numpy as np
import pandas as pd

#### Data Retrieval

In [2]:
%%time
url = 'https://www.phoenixopendata.com/dataset/cc08aace-9ca9-467f-b6c1-f0879ab1a358/resource/0ce3411a-2fc6-4302-a33f-167f68608a20/download/crime-data_crime-data_crimestat.csv'
file_path = '../data/crime-data_crime-data_crimestat.csv.gz'
dtypes = {"INC NUMBER": object, "UCR CRIME CATEGORY": object,
          "100 BLOCK ADDR": object, "ZIP": float, "PREMISE TYPE": object} 

phx_crimes = pd.read_csv(file_path, compression='gzip', parse_dates=['OCCURRED ON', 'OCCURRED TO'], dtype=dtypes)

CPU times: user 19 s, sys: 23.4 ms, total: 19 s
Wall time: 19 s


In [3]:
%%time
# Show the loaded frame as the cell's result. The timing reported by %%time
# covers only evaluating this name, not the CSV load in the previous cell.
phx_crimes

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Unnamed: 0,INC NUMBER,OCCURRED ON,OCCURRED TO,UCR CRIME CATEGORY,100 BLOCK ADDR,ZIP,PREMISE TYPE
0,201600000527709,2015-11-01 00:00:00,2016-03-22 00:36:00,LARCENY-THEFT,33XX W CAMELBACK RD,85017.0,PARKING LOT
1,201500002102327,2015-11-01 00:00:00,2015-11-01 09:00:00,LARCENY-THEFT,51XX N 15TH ST,85014.0,APARTMENT
2,201600000052855,2015-11-01 00:00:00,2016-01-09 00:00:00,MOTOR VEHICLE THEFT,N 43RD AVE & W CACTUS RD,85029.0,SINGLE FAMILY HOUSE
3,201500002102668,2015-11-01 00:00:00,2015-11-01 11:50:00,MOTOR VEHICLE THEFT,69XX W WOOD ST,85043.0,SINGLE FAMILY HOUSE
4,201700001722914,2015-11-01 00:00:00,NaT,LARCENY-THEFT,279XX N 23RD LN,85085.0,SINGLE FAMILY HOUSE
...,...,...,...,...,...,...,...
274294,202000000052683,2020-01-09 23:24:00,NaT,AGGRAVATED ASSAULT,43XX N 35TH AVE,85019.0,APARTMENT
274295,202000000053602,2020-01-09 23:30:00,2020-01-10 06:45:00,LARCENY-THEFT,60XX W MULBERRY DR,85033.0,DRIVEWAY
274296,202000000053907,2020-01-09 23:30:00,2020-01-10 08:30:00,LARCENY-THEFT,42XX N 3RD AVE,85013.0,STREET / ROADWAY / ALLEY / SIDEWALK
274297,202000000058980,2020-01-09 23:30:00,2020-01-10 14:30:00,BURGLARY,43XX N 6TH DR,85013.0,PARKING GARAGE


In [4]:
def _count_by_dow_hour(frame, crime):
    """Count the rows of ``frame`` whose ``crime_type`` equals ``crime``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``crime_type``, ``dow`` and ``hour`` columns.
    crime : str
        Value of ``crime_type`` to select.

    Returns
    -------
    pandas.Series
        Row counts indexed by ``(dow, hour)``. Only the pairs that occur for
        ``crime`` are present in the index.
    """
    return frame[frame.crime_type == crime].groupby(['dow', 'hour']).size()


# Replace the loaded column headers, positionally, with short snake_case names.
phx_crimes.columns = ['inc_no', 'dt_start', 'dt_end', 'crime_type', 'hundred_block', 'zip', 'premise']
# Rows without a start timestamp cannot be bucketed by weekday/hour; rebind
# instead of using inplace=True so the step is an ordinary assignment.
phx_crimes = phx_crimes.dropna(subset=['dt_start'])

crimes = ['ARSON', 'MOTOR VEHICLE THEFT', 'DRUG OFFENSE']
crimes_df = (
    phx_crimes[phx_crimes.crime_type.isin(crimes)]
    .reset_index(drop=True)
    # Vectorized datetime accessors instead of a per-row Python lambda.
    # `dow` follows the pandas weekday convention (Monday = 0 ... Sunday = 6).
    .assign(
        dow=lambda d: d['dt_start'].dt.weekday,
        hour=lambda d: d['dt_start'].dt.hour,
    )
)

# One (dow, hour) count series per offense, built by the same helper.
arson = _count_by_dow_hour(crimes_df, 'ARSON')
gta = _count_by_dow_hour(crimes_df, 'MOTOR VEHICLE THEFT')
drug = _count_by_dow_hour(crimes_df, 'DRUG OFFENSE')

# Align the three series side by side on (dow, hour); a pair missing from one
# offense shows up as NaN in that offense's column.
df2 = pd.concat((arson, gta, drug), axis=1, keys=['ARSON', 'MOTOR_VEHICLE_THEFT', 'DRUG_OFFENSE'])
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ARSON,MOTOR_VEHICLE_THEFT,DRUG_OFFENSE
dow,hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,43.0,393,334
0,1,10.0,118,74
0,2,8.0,114,65
0,3,9.0,80,46
0,4,10.0,93,27


# "Tidy" up the data

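Again, we have to "tidy" up the data to satisfy Altair's expectations: instead of one column per offense, each offense count becomes its own row with `dow`, `hour`, `offense`, and `count` columns.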

In [5]:
# Wide -> long: stack() moves the offense column labels into a third index
# level, and reset_index() turns all index levels back into ordinary columns.
tmpdata = (
    df2
    .stack()
    .reset_index()
)

# The stacked level has no name, so it comes out as 'level_2'; the values
# column comes out as 0. Give both descriptive names.
tidy_names = {'level_2': 'offense', 0: 'count'}
data = tmpdata.rename(columns=tidy_names)
data

Unnamed: 0,dow,hour,offense,count
0,0,0,ARSON,43.0
1,0,0,MOTOR_VEHICLE_THEFT,393.0
2,0,0,DRUG_OFFENSE,334.0
3,0,1,ARSON,10.0
4,0,1,MOTOR_VEHICLE_THEFT,118.0
...,...,...,...,...
498,6,22,MOTOR_VEHICLE_THEFT,343.0
499,6,22,DRUG_OFFENSE,125.0
500,6,23,ARSON,13.0
501,6,23,MOTOR_VEHICLE_THEFT,232.0


## Generate Plot

In [6]:
# Bar chart of `count` per `hour` (ordinal axis), coloured by `offense`, with
# one facet column per `dow` value. The legend is placed on the left.
offense_color = alt.Color('offense', legend=alt.Legend(orient="left"))

alt.Chart(data).mark_bar().encode(
    x=alt.X('hour:O'),
    y=alt.Y('count'),
    color=offense_color,
    column=alt.Column('dow'),
)

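Midnight (hour = 0) is clearly some sort of catch-all/default value: in the counts shown earlier, hour 0 is far higher than the hours that follow (e.g. 393 vs. 118 motor vehicle thefts for `dow` 0).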