## Coding Discussion 3
## Chau Nguyen

In [1]:
# Set up
import pandas as pd
from dfply import *

# Load the Chicago summer-2018 crime records from the CSV in the working directory
dat = pd.read_csv(filepath_or_buffer="chicago_summer_2018_crime_data.csv")

In [2]:
# Inspect the variable (column) names available in the dataframe
dat.columns.tolist()

['month',
 'day',
 'year',
 'day_of_week',
 'description',
 'location_description',
 'block',
 'primary_type',
 'district',
 'ward',
 'arrest',
 'domestic',
 'latitude',
 'longitude']

In [3]:
# Number of distinct crime types
# (dropna=False counts a missing value as its own category, like drop_duplicates does)
dat["primary_type"].nunique(dropna=False)

31

In [4]:
# Number of distinct day-of-month values
# (dropna=False counts a missing value as its own category, like drop_duplicates does)
dat["day"].nunique(dropna=False)

31

In [5]:
# Move the variables of interest to the front of the dataframe,
# followed by every remaining column in its existing order
lead_cols = ["primary_type", "day", "month", "year"]
dat = dat[lead_cols + [col for col in dat.columns if col not in lead_cols]]
# Print sample
dat.head()

Unnamed: 0,primary_type,day,month,year,day_of_week,description,location_description,block,district,ward,arrest,domestic,latitude,longitude
0,THEFT,4,8,2018,Saturday,FROM BUILDING,APARTMENT,039XX W WASHINGTON BLVD,11,28.0,False,False,,
1,THEFT,26,7,2018,Thursday,POCKET-PICKING,RESTAURANT,005XX W MADISON ST,1,42.0,False,False,,
2,DECEPTIVE PRACTICE,24,6,2018,Sunday,BOGUS CHECK,GROCERY FOOD STORE,004XX E 34TH ST,2,4.0,False,False,,
3,ASSAULT,13,6,2018,Wednesday,SIMPLE,RESIDENCE,098XX S EXCHANGE AVE,4,10.0,False,True,,
4,CRIMINAL DAMAGE,14,6,2018,Thursday,TO VEHICLE,STREET,001XX S WALLER AVE,15,29.0,False,False,,


In [6]:
# Count how many times each type of crime takes place on each day of the month:
#   - groupby(...) forms one group per (primary_type, day) pair
#   - .size() counts the rows that fall in each group
#   - .unstack(...) moves the "day" index level into the columns, so every
#     day value gets its own column; (type, day) pairs with no rows become NaN
dat_sum = (
    dat
    .groupby(by=["primary_type", "day"])
    .size()
    .unstack(level="day")
)
# Print sample
dat_sum.head()

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,4.0,3.0,3.0,2.0,4.0,6.0,5.0,5.0,2.0,2.0,...,5.0,1.0,6.0,1.0,2.0,1.0,3.0,6.0,3.0,3.0
ASSAULT,207.0,188.0,172.0,202.0,209.0,197.0,172.0,195.0,161.0,154.0,...,168.0,182.0,200.0,167.0,174.0,187.0,177.0,194.0,161.0,133.0
BATTERY,511.0,495.0,489.0,576.0,488.0,400.0,455.0,474.0,432.0,438.0,...,423.0,439.0,476.0,485.0,460.0,393.0,450.0,432.0,442.0,274.0
BURGLARY,126.0,109.0,118.0,117.0,101.0,126.0,99.0,96.0,107.0,110.0,...,135.0,102.0,119.0,104.0,104.0,140.0,113.0,107.0,108.0,79.0
CONCEALED CARRY LICENSE VIOLATION,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,1.0,2.0,...,1.0,,2.0,3.0,3.0,1.0,1.0,,1.0,2.0


In [7]:
# Clean up dat_sum:
#   - fillna(0) replaces NaN with zero, meaning no crimes of that type
#     were recorded on that day
#   - rename_axis(None, axis="index") clears the index name ("primary_type"),
#     which removes the otherwise-empty "primary_type" header row from the display
dat_sum = (
    dat_sum
    .fillna(0)
    .rename_axis(None, axis="index")
)

# Print sample
dat_sum.head()

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
ARSON,4.0,3.0,3.0,2.0,4.0,6.0,5.0,5.0,2.0,2.0,...,5.0,1.0,6.0,1.0,2.0,1.0,3.0,6.0,3.0,3.0
ASSAULT,207.0,188.0,172.0,202.0,209.0,197.0,172.0,195.0,161.0,154.0,...,168.0,182.0,200.0,167.0,174.0,187.0,177.0,194.0,161.0,133.0
BATTERY,511.0,495.0,489.0,576.0,488.0,400.0,455.0,474.0,432.0,438.0,...,423.0,439.0,476.0,485.0,460.0,393.0,450.0,432.0,442.0,274.0
BURGLARY,126.0,109.0,118.0,117.0,101.0,126.0,99.0,96.0,107.0,110.0,...,135.0,102.0,119.0,104.0,104.0,140.0,113.0,107.0,108.0,79.0
CONCEALED CARRY LICENSE VIOLATION,2.0,1.0,2.0,2.0,1.0,2.0,2.0,0.0,1.0,2.0,...,1.0,0.0,2.0,3.0,3.0,1.0,1.0,0.0,1.0,2.0


In [8]:
# From the cleaned dat_sum dataframe, create the final dataframe where:
#   - the unit of observation (each row) is the crime type (i.e. `primary_type`)
#   - the column variables correspond to the day of the month, and
#   - each cell is the proportion of that crime type's total count
#     (summed over all days of the month) that was committed on that day

# dat_sum.sum(axis=1) adds up the day columns within each row, giving one total
# per crime type. .div(..., axis=0) aligns those totals with the row index, so
# every row is divided by its own total. This is the vectorized equivalent of
# applying `lambda x: x / x.sum()` to each row (which is what axis=1 means in .apply).
# .round(2) keeps two decimals; .fillna(0) covers a 0/0 division should any
# crime type have a zero total.
dat_proportion = (
    dat_sum
    .div(dat_sum.sum(axis=1), axis=0)
    .round(2)
    .fillna(0)
)

In [9]:
# Lift pandas' row and column display limits so the entire dataframe
# is printed without truncation
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Print out the clean dataframe
dat_proportion

day,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
ARSON,0.04,0.03,0.03,0.02,0.04,0.05,0.04,0.04,0.02,0.02,0.07,0.07,0.02,0.03,0.03,0.02,0.04,0.01,0.04,0.04,0.03,0.04,0.01,0.05,0.01,0.02,0.01,0.03,0.05,0.03,0.03
ASSAULT,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BATTERY,0.04,0.04,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BURGLARY,0.04,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.04,0.03,0.04,0.03,0.03,0.04,0.03,0.03,0.03,0.02
CONCEALED CARRY LICENSE VIOLATION,0.05,0.02,0.05,0.05,0.02,0.05,0.05,0.0,0.02,0.05,0.07,0.05,0.02,0.02,0.02,0.05,0.02,0.02,0.05,0.02,0.0,0.02,0.0,0.05,0.07,0.07,0.02,0.02,0.0,0.02,0.05
CRIM SEXUAL ASSAULT,0.06,0.02,0.04,0.05,0.04,0.04,0.03,0.04,0.03,0.03,0.03,0.02,0.02,0.03,0.03,0.05,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.02,0.03,0.05,0.03,0.03,0.03,0.03,0.01
CRIMINAL DAMAGE,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.02
CRIMINAL TRESPASS,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.02
DECEPTIVE PRACTICE,0.04,0.04,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.04,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03
GAMBLING,0.07,0.03,0.02,0.01,0.03,0.02,0.03,0.03,0.05,0.04,0.03,0.06,0.08,0.05,0.02,0.02,0.03,0.04,0.05,0.04,0.03,0.02,0.02,0.01,0.04,0.03,0.01,0.02,0.03,0.03,0.03
