In [1]:
import numpy as np
import pandas as pd

### Please read in the Chicago Summer 2018 Crimes Dataset located in the repository folder.

### Using the data wrangling methods covered in class this week, create a new data frame where:

    1. the unit of observation is the crime type (i.e. primary_type),
    2. the columns correspond to the days of the month, and
    3. each cell holds the proportion of that crime type's incidents that were committed on that day of the month, relative to the total for that crime type over all days of the month
        
For example, assume there were just two days in a month, and 2 thefts were committed on the first day and 1 on the second day. Then the proportion of thefts committed on the first day would be 0.67, and 0.33 on the second day.

### Make sure that:

    1. all missing values are filled with zeros. A zero in this case means no crimes of that type were committed that day;
    2. the data is rounded to the second decimal place; and
    3. the data frame is printed at the end of the notebook.


In [2]:
# Read the Chicago summer-2018 crime records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='chicago_summer_2018_crime_data.csv')
# Preview the first five rows as this cell's displayed output.
df.head(n=5)

Unnamed: 0,month,day,year,day_of_week,description,location_description,block,primary_type,district,ward,arrest,domestic,latitude,longitude
0,8,4,2018,Saturday,FROM BUILDING,APARTMENT,039XX W WASHINGTON BLVD,THEFT,11,28.0,False,False,,
1,7,26,2018,Thursday,POCKET-PICKING,RESTAURANT,005XX W MADISON ST,THEFT,1,42.0,False,False,,
2,6,24,2018,Sunday,BOGUS CHECK,GROCERY FOOD STORE,004XX E 34TH ST,DECEPTIVE PRACTICE,2,4.0,False,False,,
3,6,13,2018,Wednesday,SIMPLE,RESIDENCE,098XX S EXCHANGE AVE,ASSAULT,4,10.0,False,True,,
4,6,14,2018,Thursday,TO VEHICLE,STREET,001XX S WALLER AVE,CRIMINAL DAMAGE,15,29.0,False,False,,


In [3]:
# Keep only the two columns this analysis needs: day of month and crime type.
df1 = df[["day", "primary_type"]]
# One indicator column per distinct day-of-month value present in the data.
dummies = pd.get_dummies(df1.day)

# Sanity check: a full calendar month should yield 31 indicator columns.
# The comparison is tested explicitly; a bare comparison inside try/except
# never raises, so it would report success regardless of the column count.
expected_day_columns = 31
if len(dummies.columns) == expected_day_columns:
    print("check passed: all dummy columns present")
else:
    print("check failed: missing at least one dummy column")

check passed: all dummy columns present


In [4]:
# Build the crime-type + day-indicator frame directly from `df` and `dummies`
# rather than from `df1` itself, so re-running this cell gives the same result.
# (Concatenating onto `df1` and then dropping 'day' duplicates the indicator
# columns and raises a KeyError on 'day' when the cell is executed twice.)
df1 = pd.concat([df[["primary_type"]], dummies], sort=False, axis=1)

In [5]:
# Count how many times each crime type occurred on each day of the month.
day_counts = df1.groupby(by=["primary_type"]).sum()
# Total occurrences per crime type across all day columns.
row_totals = day_counts.sum(axis=1, numeric_only=True)
# Convert counts to row-wise proportions (each row divided by its own total).
# A crime type whose total is 0 would give 0/0 = NaN; fill those with 0, as the
# assignment requires missing values to be reported as zeros, then round to
# two decimal places.
df1 = day_counts.div(row_totals, axis=0).fillna(0).round(2)
print(df1)

                                     1     2     3     4     5     6     7   \
primary_type                                                                  
ARSON                              0.04  0.03  0.03  0.02  0.04  0.05  0.04   
ASSAULT                            0.04  0.03  0.03  0.04  0.04  0.03  0.03   
BATTERY                            0.04  0.04  0.03  0.04  0.03  0.03  0.03   
BURGLARY                           0.04  0.03  0.03  0.03  0.03  0.04  0.03   
CONCEALED CARRY LICENSE VIOLATION  0.05  0.02  0.05  0.05  0.02  0.05  0.05   
CRIM SEXUAL ASSAULT                0.06  0.02  0.04  0.05  0.04  0.04  0.03   
CRIMINAL DAMAGE                    0.03  0.03  0.03  0.03  0.03  0.03  0.03   
CRIMINAL TRESPASS                  0.04  0.03  0.03  0.03  0.03  0.03  0.03   
DECEPTIVE PRACTICE                 0.04  0.04  0.03  0.03  0.03  0.04  0.03   
GAMBLING                           0.07  0.03  0.02  0.01  0.03  0.02  0.03   
HOMICIDE                           0.02  0.00  0.03 