#Sources
##How to create a list of a single number https://stackoverflow.com/questions/3459098/create-list-of-single-item-repeated-n-times
##Pivot table documentation https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html
##Sum a row https://www.kite.com/python/answers/how-to-sum-rows-of-a-pandas-dataframe-in-python
##How to proportion data within a row https://stackoverflow.com/questions/50820659/compute-row-percentages-in-pandas-dataframe/50820765

#Notes
##It would have been super helpful for an end sample to be provided, as it was in the problem set; it was hard to interpret from the verbal description alone what the output should look like without checking against other submissions

In [1]:
#do the basics: import pandas, read in the data, display the first rows
import pandas as pd
import numpy as np
# Load the Chicago summer-2018 crime CSV into a dataframe and preview the
# first three rows.
data = pd.read_csv("chicago_summer_2018_crime_data.csv")
data.head(3)

Unnamed: 0,month,day,year,day_of_week,description,location_description,block,primary_type,district,ward,arrest,domestic,latitude,longitude
0,8,4,2018,Saturday,FROM BUILDING,APARTMENT,039XX W WASHINGTON BLVD,THEFT,11,28.0,False,False,,
1,7,26,2018,Thursday,POCKET-PICKING,RESTAURANT,005XX W MADISON ST,THEFT,1,42.0,False,False,,
2,6,24,2018,Sunday,BOGUS CHECK,GROCERY FOOD STORE,004XX E 34TH ST,DECEPTIVE PRACTICE,2,4.0,False,False,,


In [2]:
# Create a smaller dataframe holding only the two columns the analysis needs.
# It is called dat because it has fewer columns than data. Selecting a list of
# columns already yields a DataFrame, so no pd.DataFrame(...) re-wrap is
# needed; .copy() makes it an explicitly independent frame so a column can be
# added to it later without involving data.
dat = data[["primary_type", "day"]].copy()
dat.head(n=3)

Unnamed: 0,primary_type,day
0,THEFT,4
1,THEFT,26
2,DECEPTIVE PRACTICE,24


In [3]:
# Build a list containing one 1 per dataframe row.
# NOTE: the row count is hard-coded; it has to equal the number of rows in the
# dataframe for the list to be assignable as a column.
N_ROWS = 73373
occur = [1] * N_ROWS

In [4]:
# Add a constant column of ones to the df, because pivot_table needs a values
# column to aggregate (summing the ones counts the occurrences).
# Assigning the scalar broadcasts it to every row, so the column always matches
# the frame's length instead of depending on a pre-built list of a fixed size
# (a list of the wrong length would make the assignment raise).
dat["occur"] = 1
dat.head(n=3)

Unnamed: 0,primary_type,day,occur
0,THEFT,4,1
1,THEFT,26,1
2,DECEPTIVE PRACTICE,24,1


In [5]:
# Pivot to a wide table: one row per primary_type, one column per day of the
# month, each cell holding the number of recorded crimes (the sum of the ones
# in "occur"). The counts are converted to proportions in a later step.
# aggfunc is given as the string "sum": passing np.sum is deprecated in recent
# pandas versions and emits a FutureWarning.
# fill_value=0 puts zeros where a crime type never occurred on a given day.
dat_wide = pd.pivot_table(
    dat,
    values="occur",
    index=["primary_type"],
    columns=["day"],
    aggfunc="sum",
    fill_value=0,
)
dat_wide.head(n=1)

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,4,3,3,2,4,6,5,5,2,2,...,5,1,6,1,2,1,3,6,3,3


In [6]:
# Turn the counts into proportions of each row's total: every cell is divided
# by the sum of its own row, so each crime type's values add up to one.
# (The name dat_porp is the original spelling and is kept because later cells
# refer to it.)
dat_porp = dat_wide.div(dat_wide.sum(axis="columns"), axis="index")
dat_porp.head(1)

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,0.035714,0.026786,0.026786,0.017857,0.035714,0.053571,0.044643,0.044643,0.017857,0.017857,...,0.044643,0.008929,0.053571,0.008929,0.017857,0.008929,0.026786,0.053571,0.026786,0.026786


In [7]:
# Sanity check: store each row's total in a "sum" column; it should come out
# as one for every crime type.
dat_porp["sum"] = dat_porp.sum(axis="columns")
dat_porp.head(1)

day,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,sum
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,0.035714,0.026786,0.026786,0.017857,0.035714,0.053571,0.044643,0.044643,0.017857,0.017857,...,0.008929,0.053571,0.008929,0.017857,0.008929,0.026786,0.053571,0.026786,0.026786,1.0


In [8]:
# Round every value (including the "sum" check column) to two decimal places.
dat_porp = dat_porp.round(2)
dat_porp.head(1)

day,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,sum
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,0.04,0.03,0.03,0.02,0.04,0.05,0.04,0.04,0.02,0.02,...,0.01,0.05,0.01,0.02,0.01,0.03,0.05,0.03,0.03,1.0


In [10]:
# The "sum" column was only there as a check, so remove it to get the final
# table, then show up to the first 31 rows.
dat_final = dat_porp.drop(columns="sum")
dat_final.head(31)

day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARSON,0.04,0.03,0.03,0.02,0.04,0.05,0.04,0.04,0.02,0.02,...,0.04,0.01,0.05,0.01,0.02,0.01,0.03,0.05,0.03,0.03
ASSAULT,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BATTERY,0.04,0.04,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BURGLARY,0.04,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.04,0.03,0.04,0.03,0.03,0.04,0.03,0.03,0.03,0.02
CONCEALED CARRY LICENSE VIOLATION,0.05,0.02,0.05,0.05,0.02,0.05,0.05,0.0,0.02,0.05,...,0.02,0.0,0.05,0.07,0.07,0.02,0.02,0.0,0.02,0.05
CRIM SEXUAL ASSAULT,0.06,0.02,0.04,0.05,0.04,0.04,0.03,0.04,0.03,0.03,...,0.03,0.03,0.02,0.03,0.05,0.03,0.03,0.03,0.03,0.01
CRIMINAL DAMAGE,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,...,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.02
CRIMINAL TRESPASS,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,...,0.03,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.02
DECEPTIVE PRACTICE,0.04,0.04,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03
GAMBLING,0.07,0.03,0.02,0.01,0.03,0.02,0.03,0.03,0.05,0.04,...,0.02,0.02,0.01,0.04,0.03,0.01,0.02,0.03,0.03,0.03


In [None]:
# Print the final table of rounded per-day proportions for each crime type.
print(dat_final)