# Coding Discussion No. 3
## Name: Sahithi Adari
### Date: 10/04/20

### Import Packages

In [1]:
import pandas as pd

### Loading & Customizing The Data

In [3]:
#Load the complete crime CSV into a pandas dataframe
chitown = pd.read_csv(filepath_or_buffer="chicago_summer_2018_crime_data.csv")

#Preview the first 10 rows to decide which columns are of interest
chitown.iloc[:10]

Unnamed: 0,month,day,year,day_of_week,description,location_description,block,primary_type,district,ward,arrest,domestic,latitude,longitude
0,8,4,2018,Saturday,FROM BUILDING,APARTMENT,039XX W WASHINGTON BLVD,THEFT,11,28.0,False,False,,
1,7,26,2018,Thursday,POCKET-PICKING,RESTAURANT,005XX W MADISON ST,THEFT,1,42.0,False,False,,
2,6,24,2018,Sunday,BOGUS CHECK,GROCERY FOOD STORE,004XX E 34TH ST,DECEPTIVE PRACTICE,2,4.0,False,False,,
3,6,13,2018,Wednesday,SIMPLE,RESIDENCE,098XX S EXCHANGE AVE,ASSAULT,4,10.0,False,True,,
4,6,14,2018,Thursday,TO VEHICLE,STREET,001XX S WALLER AVE,CRIMINAL DAMAGE,15,29.0,False,False,,
5,7,2,2018,Monday,CREDIT CARD FRAUD,RESIDENCE,083XX S JUSTINE ST,DECEPTIVE PRACTICE,6,21.0,False,False,,
6,6,1,2018,Friday,PREDATORY,RESIDENCE,087XX S COLFAX AVE,CRIM SEXUAL ASSAULT,4,7.0,False,False,,
7,7,25,2018,Wednesday,OVER $500,RESIDENCE,046XX S LAKE PARK AVE,THEFT,2,4.0,False,False,,
8,7,27,2018,Friday,CRIM SEX ABUSE BY FAM MEMBER,RESIDENCE,004XX E 40TH ST,OFFENSE INVOLVING CHILDREN,2,3.0,False,False,,
9,7,24,2018,Tuesday,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,053XX S CORNELL AVE,DECEPTIVE PRACTICE,2,5.0,False,False,,


Given that we only care about the "primary_type" and "day" columns, we're going to reload the data so that it only includes those two columns.

In [4]:
#Customized the data to only include the "primary_type" and "day" columns
#Note: no date parsing is requested because "day" holds the day of the month as a plain number,
#and the separator, NA markers and memory settings are all left at the read_csv defaults
chitown = pd.read_csv(
    "chicago_summer_2018_crime_data.csv",
    usecols=["primary_type", "day"],  #Requested only the "primary_type" and "day" columns
)

#Viewed the first 10 rows to confirm our customization happened
chitown.head(10)

Unnamed: 0,day,primary_type
0,4,THEFT
1,26,THEFT
2,24,DECEPTIVE PRACTICE
3,13,ASSAULT
4,14,CRIMINAL DAMAGE
5,2,DECEPTIVE PRACTICE
6,1,CRIM SEXUAL ASSAULT
7,25,THEFT
8,27,OFFENSE INVOLVING CHILDREN
9,24,DECEPTIVE PRACTICE


### Manipulation of The Data

The first thing we want to do is to create a new dataframe called "chicrime" that simply has the total of all crimes (from the chitown dataframe) by crime type. We do this by grouping by the "primary_type", using the size() function, and creating a new column, via a reset index, to house this data.

In [16]:
#Count the rows for every crime type, name that count "per_crime",
#and move "primary_type" out of the index into a regular column
chicrime = (
    chitown
    .groupby("primary_type")
    .size()
    .rename("per_crime")
    .reset_index()
)

In [17]:
#Display chicrime: one row per "primary_type" with its total count in "per_crime"
chicrime

Unnamed: 0,primary_type,per_crime
0,ARSON,112
1,ASSAULT,5635
2,BATTERY,14111
3,BURGLARY,3390
4,CONCEALED CARRY LICENSE VIOLATION,44
5,CRIM SEXUAL ASSAULT,430
6,CRIMINAL DAMAGE,7931
7,CRIMINAL TRESPASS,1779
8,DECEPTIVE PRACTICE,4684
9,GAMBLING,115


Next we will do the same thing as before, but this time creating a dataframe called "crimeday" in order to track how many specific crimes happened on a specific day. We do this by grouping by the "primary_type" and "day", using the size() function, and creating a new column, via a reset index, to house this data.

In [30]:
#Count the rows for every (crime type, day) pair, name that count "day_count",
#and move both grouping keys out of the index into regular columns
crimeday = (
    chitown
    .groupby(["primary_type", "day"])
    .size()
    .rename("day_count")
    .reset_index()
)

In [20]:
#Display crimeday: one row per ("primary_type", "day") pair with its count in "day_count"
crimeday

Unnamed: 0,primary_type,day,day_count
0,ARSON,1,4
1,ARSON,2,3
2,ARSON,3,3
3,ARSON,4,2
4,ARSON,5,4
...,...,...,...
797,WEAPONS VIOLATION,27,46
798,WEAPONS VIOLATION,28,51
799,WEAPONS VIOLATION,29,66
800,WEAPONS VIOLATION,30,56


Afterwards, we'll want to merge the 2 datasets above into one final dataframe called "chicrime_day". We will merge the 2 datasets via an inner join, using "primary_type" as our join key, so that every row carries both the "per_crime" and "day_count" columns.

In [22]:
#Merging the 2 datasets on "primary_type" so each per-day row ("day_count") also carries its crime type's total ("per_crime")
chicrime_day = chicrime.merge(
    crimeday,
    on="primary_type",
    how="inner",  #Inner join is merge's default; stated explicitly so the join type is visible
    validate="one_to_many",  #Fail loudly if a crime type appears more than once in chicrime, which would duplicate rows
)

In [24]:
#Display chicrime_day: the merged table with "primary_type", "per_crime", "day" and "day_count"
chicrime_day

Unnamed: 0,primary_type,per_crime,day,day_count
0,ARSON,112,1,4
1,ARSON,112,2,3
2,ARSON,112,3,3
3,ARSON,112,4,2
4,ARSON,112,5,4
...,...,...,...,...
797,WEAPONS VIOLATION,1601,27,46
798,WEAPONS VIOLATION,1601,28,51
799,WEAPONS VIOLATION,1601,29,66
800,WEAPONS VIOLATION,1601,30,56


For this step we'll need to create one last column within the "chicrime_day" dataframe: the percent column. The percent column will simply be the value of day_count/per_crime for each crime by day. We will do this by utilizing the apply function and a lambda function; the lambda function will also round the value to 2 decimal places. We're using a lambda function here, instead of a traditional for loop, to keep the code short; note that a row-wise apply still runs once per row, so dividing the two columns directly would be the faster alternative.

In [26]:
#Created a percent column within the "chicrime_day" dataframe
#Dividing the two columns directly computes every row at once, so no row-by-row apply/lambda is needed
chicrime_day["percent"] = (
    chicrime_day["day_count"] / chicrime_day["per_crime"] #Share of each crime type's total that fell on this day
).round(2) #Rounded each value to 2 decimal places

In [27]:
#Output of chicrime_day, now including the "percent" column
chicrime_day

Unnamed: 0,primary_type,per_crime,day,day_count,percent
0,ARSON,112,1,4,0.04
1,ARSON,112,2,3,0.03
2,ARSON,112,3,3,0.03
3,ARSON,112,4,2,0.02
4,ARSON,112,5,4,0.04
...,...,...,...,...,...
797,WEAPONS VIOLATION,1601,27,46,0.03
798,WEAPONS VIOLATION,1601,28,51,0.03
799,WEAPONS VIOLATION,1601,29,66,0.04
800,WEAPONS VIOLATION,1601,30,56,0.03


Lastly, we'll take the updated "chicrime_day" dataframe and create a pivot table from it, such that we're indexing by "primary_type", the columns are the "day" column, and our values will be populated by the "percent" column.

In [29]:
#Reshape chicrime_day into a crime-type-by-day table of "percent" values
#(aggfunc is left at pivot_table's default)
chicrime_day.pivot_table(
    values=["percent"],       #Each cell is populated from the percent column
    index=["primary_type"],   #One row per primary_type
    columns=["day"],          #One column per day
    fill_value=0,             #Crime/day combinations with no data show 0 instead of NULL
)

Unnamed: 0_level_0,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent,percent
day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
primary_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ARSON,0.04,0.03,0.03,0.02,0.04,0.05,0.04,0.04,0.02,0.02,...,0.04,0.01,0.05,0.01,0.02,0.01,0.03,0.05,0.03,0.03
ASSAULT,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BATTERY,0.04,0.04,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.02
BURGLARY,0.04,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.04,0.03,0.04,0.03,0.03,0.04,0.03,0.03,0.03,0.02
CONCEALED CARRY LICENSE VIOLATION,0.05,0.02,0.05,0.05,0.02,0.05,0.05,0.0,0.02,0.05,...,0.02,0.0,0.05,0.07,0.07,0.02,0.02,0.0,0.02,0.05
CRIM SEXUAL ASSAULT,0.06,0.02,0.04,0.05,0.04,0.04,0.03,0.04,0.03,0.03,...,0.03,0.03,0.02,0.03,0.05,0.03,0.03,0.03,0.03,0.01
CRIMINAL DAMAGE,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,...,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.02
CRIMINAL TRESPASS,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.03,...,0.03,0.04,0.04,0.03,0.03,0.04,0.04,0.03,0.03,0.02
DECEPTIVE PRACTICE,0.04,0.04,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.04,0.03,0.03,0.03,0.03
GAMBLING,0.07,0.03,0.02,0.01,0.03,0.02,0.03,0.03,0.05,0.04,...,0.02,0.02,0.01,0.04,0.03,0.01,0.02,0.03,0.03,0.03
