# Coding Discussion 3: Chicago Summer 2018 Crime Dataset
by: Matt Ring

# Imports

## Packages

In [1]:
import pandas as pd

## Data

In [3]:
# Load the Chicago summer 2018 crime records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="chicago_summer_2018_crime_data.csv")

# Data Exploration

In [5]:
# Peek at five random rows; the fixed seed keeps the displayed sample
# identical every time the notebook is restarted and re-run
df.sample(n=5, random_state=42)

Unnamed: 0,month,day,year,day_of_week,description,location_description,block,primary_type,district,ward,arrest,domestic,latitude,longitude
45626,7,7,2018,Saturday,AUTOMOBILE,STREET,031XX W 63RD ST,MOTOR VEHICLE THEFT,8,15.0,False,False,41.778996,-87.701473
49436,7,2,2018,Monday,OVER $500,APARTMENT,005XX S CLINTON ST,THEFT,1,2.0,False,False,41.87507,-87.640975
3234,8,30,2018,Thursday,POSS: HEROIN(BRN/TAN),APARTMENT,001XX S PARKSIDE AVE,NARCOTICS,15,29.0,True,False,41.878777,-87.766433
8354,8,23,2018,Thursday,AUTOMOBILE,PARKING LOT/GARAGE(NON.RESID.),049XX W JACKSON BLVD,MOTOR VEHICLE THEFT,15,24.0,False,False,41.876747,-87.748136
60389,6,18,2018,Monday,TO PROPERTY,RESIDENCE,009XX W BARRY AVE,CRIMINAL DAMAGE,19,44.0,False,False,41.938104,-87.653157


In [6]:
# List the column labels available in the raw dataframe
df.columns

Index(['month', 'day', 'year', 'day_of_week', 'description',
       'location_description', 'block', 'primary_type', 'district', 'ward',
       'arrest', 'domestic', 'latitude', 'longitude'],
      dtype='object')

In [7]:
# Row count of the raw dataframe
df.shape[0]

73373

# Data Cleaning
Creates a new dataframe where:
1. The unit of observation is the crime type (i.e. primary_type),
2. The column variables correspond to the day of the month, and
3. Each cell is populated by the proportion of that crime type's total occurrences that were committed on that day of the month (so each row sums to roughly 1, up to rounding)

## Create the New Dataframe

This would likely be possible using piping in dfply, but I am far more comfortable with pandas. As such, this occurs over a few cells.

In [51]:
# Create a pivot table where rows are the crime and columns are the day of the month.
# Each cell counts the non-null `month` entries for that (crime, day) pair; the built-in
# 'count' aggregator does this directly instead of calling a Python lambda for every group.
# Crime/day pairs that never occur in the data are filled with 0.
dfNew = df.pivot_table(values='month', columns='day', index='primary_type',
                       fill_value=0, aggfunc='count')

The code above uses `aggfunc` to count the number of instances of each crime within the dataset. Because it only counts the non-missing entries of the chosen column, the choice of "values" doesn't matter much: as long as that column has no missing values, you can change it to any column and it will produce the same results. I attempted this with `month`, `year`, and `day_of_week` and received the same results each time. Crime/day combinations that never occur are filled in with 0, as requested.

In [52]:
# Create a new column which contains the sum of each row
# This value represents the total number of each crime in the dataset
# Any existing "sum" column is excluded first, so re-running this cell
# does not fold the previous total into the new one
sums = dfNew.drop(columns="sum", errors="ignore").sum(axis=1)
dfNew["sum"] = sums
sums

primary_type
ARSON                                  112
ASSAULT                               5635
BATTERY                              14111
BURGLARY                              3390
CONCEALED CARRY LICENSE VIOLATION       44
CRIM SEXUAL ASSAULT                    430
CRIMINAL DAMAGE                       7931
CRIMINAL TRESPASS                     1779
DECEPTIVE PRACTICE                    4684
GAMBLING                               115
HOMICIDE                               172
HUMAN TRAFFICKING                        2
INTERFERENCE WITH PUBLIC OFFICER       374
INTIMIDATION                            54
KIDNAPPING                              47
LIQUOR LAW VIOLATION                    83
MOTOR VEHICLE THEFT                   2608
NARCOTICS                             3047
NON-CRIMINAL                             8
NON-CRIMINAL (SUBJECT SPECIFIED)         2
OBSCENITY                               21
OFFENSE INVOLVING CHILDREN             532
OTHER OFFENSE                         447

The `sums` column is displayed above to show that our process is working as expected. If we are doing this correctly, the total of `sums` should equal the number of rows in the original dataframe.

In [59]:
# Compare the total number of counted crimes with the number of rows in the raw data
total_counted = sums.sum()
print(total_counted)
print(len(df))
# Stop the notebook here if the pivot lost or duplicated rows,
# rather than only displaying False and carrying on
assert total_counted == len(df), "pivot table counts do not add up to the number of rows in df"
total_counted == len(df)

73373
73373


True

And it does! This means it's safe to continue with our analysis.

In [53]:
# Turn the counts into row-wise proportions: divide every row by its "sum" entry,
# discard the helper "sum" column, and round the result to two decimal places
dfNew = (
    dfNew
    .div(dfNew["sum"], axis="index")
    .drop(columns="sum")
    .round(decimals=2)
)

In [54]:
# Display the finalized dataframe (rows: crime type, columns: day of the month)
dfNew

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,0.04,0.03,0.03,0.02,0.04,0.05,0.04,0.04,0.02,0.02,...,0.04,0.01,0.05,0.01,0.02,0.01,0.03,0.05,0.03,0.03
ASSAULT,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BATTERY,0.04,0.04,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BURGLARY,0.04,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.04,0.03,0.04,0.03,0.03,0.04,0.03,0.03,0.03,0.02
CONCEALED CARRY LICENSE VIOLATION,0.05,0.02,0.05,0.05,0.02,0.05,0.05,0.0,0.02,0.05,...,0.02,0.0,0.05,0.07,0.07,0.02,0.02,0.0,0.02,0.05
CRIM SEXUAL ASSAULT,0.06,0.02,0.04,0.05,0.04,0.04,0.03,0.04,0.03,0.03,...,0.03,0.03,0.02,0.03,0.05,0.03,0.03,0.03,0.03,0.01
CRIMINAL DAMAGE,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,...,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.02
CRIMINAL TRESPASS,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,...,0.03,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.02
DECEPTIVE PRACTICE,0.04,0.04,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03
GAMBLING,0.07,0.03,0.02,0.01,0.03,0.02,0.03,0.03,0.05,0.04,...,0.02,0.02,0.01,0.04,0.03,0.01,0.02,0.03,0.03,0.03
