# Crime Data

In [2]:
import _pickle as pickle  # using cPickle

import numpy as np
import pandas as pd

from IPython.display import clear_output

Download the data archive from here: https://www.openicpsr.org/openicpsr/project/100707/version/V7/view;jsessionid=32DB80E49822A7D63A01ECD6C39324C7?path=/openicpsr/100707/fcr:versions/V7/ucr_offenses_known_monthly_1960_2016_dta.zip&type=file and then extract it into a folder called `crime_data`. Only the files for the years 1980-2009 are needed, so you may delete the files for the other years if you wish.

In [2]:
# Columns kept from each yearly file; every other column is discarded.
required_columns = [
    # time period and reporting status
    'year', 'month', 'last_month_reported',
    # location key and population
    'fips_state_county_code', 'total_population',
    # offense counts (the `act_` columns)
    'act_murder', 'act_manslaughter', 'act_rape_total',
    'act_aggravated_assault', 'act_simple_assault',
    'act_robbery_total', 'act_burglary_total',
    'act_theft_total', 'act_mtr_vhc_theft_total',
]

In [3]:
# Maps the raw column names to the shorter names used in the cleaned frames.
# Columns not listed here (year, month, ...) keep their original names.
# NOTE(review): 'manslaugther' looks like a misspelling of 'manslaughter'; it is
# kept unchanged because it is the emitted column name — confirm no consumer
# relies on it before renaming.
column_rename_dict = dict(
    fips_state_county_code='fips',
    total_population='population',
    act_murder='murder',
    act_manslaughter='manslaugther',
    act_rape_total='rape',
    act_aggravated_assault='aggravated assault',
    act_simple_assault='simple assault',
    act_robbery_total='robbery',
    act_burglary_total='burglary',
    act_theft_total='larceny',
    act_mtr_vhc_theft_total='vehicle theft',
)

In [3]:
# Load a single year (2000) as a sample to inspect the raw file layout.
df = pd.read_stata(
    'crime_data/ucr_offenses_known_monthly_2000.dta'
)

In [5]:
# Preview the first five rows of the sample year.
df.head(n=5)

Unnamed: 0,ori,ori9,year,month,date,state,state_abb,last_month_reported,fips_state_code,fips_county_code,...,unfound_murder,unfound_other_vhc_theft,unfound_other_weapon_assault,unfound_other_weapon_robbery,unfound_rape_total,unfound_robbery_total,unfound_simple_assault,unfound_strong_arm_robbery,unfound_theft_total,unfound_truck_bus_theft
0,AK00101,AK0010100,2000,january,2000-01-01,alaska,AK,december is the last month reported,2,20,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,9.0,6.0
1,AK00101,AK0010100,2000,february,2000-02-01,alaska,AK,december is the last month reported,2,20,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0
2,AK00101,AK0010100,2000,march,2000-03-01,alaska,AK,december is the last month reported,2,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0
3,AK00101,AK0010100,2000,april,2000-04-01,alaska,AK,december is the last month reported,2,20,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,13.0,5.0
4,AK00101,AK0010100,2000,may,2000-05-01,alaska,AK,december is the last month reported,2,20,...,0.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,7.0,4.0


In [4]:
# Maps each year to its cleaned crime dataframe; starts out empty.
crime_df_dict = {}

In [5]:
%%time
# no need to repeat after pickling (only ever do once)
# time required estimate: 2-3+ minutes
years = range(1980, 2010)
for year in years:
    print(f'cleaning dataframe for: {year} ({year-1980+1}/{len(years)})')
    clear_output(wait=True)
    df = pd.read_stata(f'crime_data/ucr_offenses_known_monthly_{year}.dta')
    df = df[required_columns]
    df = df[df['last_month_reported'] == 'december is the last month reported']
    df.drop(columns='last_month_reported', inplace=True)
    df.rename(columns=column_rename_dict, inplace=True)
    crime_df_dict[year] = df

Wall time: 7min 56s


In [6]:
# Save the per-year dataframes to disk (the `pickles/` folder must already exist).
with open('pickles/crime_df_dict.pkl', mode='wb') as pickle_out:
    pickle.dump(crime_df_dict, pickle_out)

In [7]:
# Restore the per-year dataframes from disk.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open('pickles/crime_df_dict.pkl', mode='rb') as pickle_in:
    crime_df_dict = pickle.load(pickle_in)

In [34]:
# Stack every yearly dataframe into one long dataframe, in dict order.
all_crime_df = pd.concat(
    [yearly_df for yearly_df in crime_df_dict.values()]
)

In [35]:
# Add a `state` column holding the first two characters of `fips`
# (presumably the state part of the county FIPS code — confirm).
all_crime_df = all_crime_df.assign(
    state=lambda frame: frame['fips'].str.slice(stop=2)
)

In [36]:
# Preview the combined dataframe with the new `state` column.
all_crime_df.head(n=5)

Unnamed: 0,year,month,fips,population,murder,manslaugther,rape,aggravated assault,simple assault,robbery,burglary,larceny,vehicle theft,state
0,1980,january,2020,173992.0,1.0,0.0,11.0,18.0,72.0,24.0,213.0,451.0,94.0,2
1,1980,february,2020,173992.0,0.0,0.0,12.0,25.0,101.0,31.0,259.0,473.0,60.0,2
2,1980,march,2020,173992.0,2.0,0.0,8.0,15.0,120.0,28.0,262.0,453.0,85.0,2
3,1980,april,2020,173992.0,0.0,0.0,9.0,28.0,87.0,19.0,184.0,654.0,75.0,2
4,1980,may,2020,173992.0,2.0,0.0,17.0,22.0,116.0,25.0,203.0,645.0,69.0,2


In [37]:
# Lower-case month name -> calendar month number (january=1 ... december=12).
month_name_to_number_dict = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'january', 'february', 'march', 'april',
            'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}

In [38]:
# Replace month names with month numbers. The lookup raises KeyError for any
# value missing from the mapping, so this cell cannot be run twice in a row.
all_crime_df = all_crime_df.assign(
    month=lambda frame: frame['month'].apply(
        lambda month_name: month_name_to_number_dict[month_name]
    )
)

In [39]:
# Keep only the rows whose `fips` value is not the empty string.
all_crime_df = all_crime_df.loc[lambda frame: frame['fips'] != '']

In [40]:
# Sum all remaining columns over rows sharing the same year, month, state and
# fips; those four keys become the index of the result.
all_crime_df = (
    all_crime_df
    .groupby(by=['year', 'month', 'state', 'fips'])
    .sum()
)

In [41]:
# Preview the aggregated dataframe.
all_crime_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,population,murder,manslaugther,rape,aggravated assault,simple assault,robbery,burglary,larceny,vehicle theft
year,month,state,fips,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,1,1,1001,31972.0,0.0,0.0,0.0,7.0,3.0,1.0,29.0,76.0,0.0
1980,1,1,1003,78135.0,0.0,0.0,1.0,17.0,3.0,5.0,73.0,86.0,10.0
1980,1,1,1005,24132.0,0.0,0.0,1.0,5.0,12.0,1.0,14.0,44.0,6.0
1980,1,1,1007,6431.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980,1,1,1009,36175.0,0.0,0.0,1.0,0.0,0.0,1.0,18.0,9.0,6.0


In [42]:
# Save the aggregated dataframe to disk (the `pickles/` folder must already exist).
with open('pickles/all_crime_df.pkl', mode='wb') as pickle_out:
    pickle.dump(all_crime_df, pickle_out)

In [4]:
# Restore the aggregated dataframe from disk.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open('pickles/all_crime_df.pkl', mode='rb') as pickle_in:
    all_crime_df = pickle.load(pickle_in)