# Crime Data

In [1]:
import _pickle as pickle  # using cPickle
import os

import numpy as np
import pandas as pd

from IPython.display import clear_output

Download the file from here: https://www.openicpsr.org/openicpsr/project/100707/version/V7/view;jsessionid=32DB80E49822A7D63A01ECD6C39324C7?path=/openicpsr/100707/fcr:versions/V7/ucr_offenses_known_monthly_1960_2016_dta.zip&type=file and then extract it into a directory called `crime_data`, so that the yearly files sit at `crime_data/ucr_offenses_known_monthly_<year>.dta`. Only the years 1980-2009 are needed, so you may delete the rest if you wish.

In [2]:
# Identification and bookkeeping fields; 'last_month_reported' is only kept
# long enough to filter rows on it and is dropped afterwards.
_record_columns = [
    'year',
    'month',
    'last_month_reported',
    'fips_state_county_code',
    'total_population',
]

# Offense-count fields, one per crime category of interest
# (presumably 'act_' marks actual offenses in the UCR files - TODO confirm).
_offense_columns = [
    'act_murder',
    'act_manslaughter',
    'act_rape_total',
    'act_aggravated_assault',
    'act_simple_assault',
    'act_robbery_total',
    'act_burglary_total',
    'act_theft_total',
    'act_mtr_vhc_theft_total',
]

# Columns retained from every yearly file, in this order.
required_columns = _record_columns + _offense_columns

In [3]:
# Maps the raw column names to the shorter names used in the cleaned frames.
# Columns not listed here ('year', 'month') keep their original names.
#
# NOTE(review): 'manslaugther' is a misspelling of 'manslaughter'. It is left
# untouched because it becomes a column name in the cleaned/pickled frames;
# correct it only together with every piece of code that reads that column.
column_rename_dict = dict(
    fips_state_county_code='fips',
    total_population='population',
    act_murder='murder',
    act_manslaughter='manslaugther',
    act_rape_total='rape',
    act_aggravated_assault='aggravated assault',
    act_simple_assault='simple assault',
    act_robbery_total='robbery',
    act_burglary_total='burglary',
    act_theft_total='larceny',
    act_mtr_vhc_theft_total='vehicle theft',
)

In [4]:
# year -> cleaned crime dataframe for that year (filled in by the cleaning loop)
crime_df_dict = {}

In [5]:
%%time
# no need to repeat after pickling (only ever do once)
# time required estimate: 2-3+ minutes
years = range(1980, 2010)
for year in years:
    print(f'cleaning dataframe for: {year} ({year-1980+1}/{len(years)})')
    clear_output(wait=True)
    df = pd.read_stata(f'crime_data/ucr_offenses_known_monthly_{year}.dta')
    df = df[required_columns]
    df = df[df['last_month_reported'] == 'december is the last month reported']
    df.drop(columns='last_month_reported', inplace=True)
    df.rename(columns=column_rename_dict, inplace=True)
    crime_df_dict[year] = df

CPU times: user 2min 43s, sys: 14.7 s, total: 2min 58s
Wall time: 2min 13s


In [6]:
# Cache the per-year frames on disk so the slow cleaning loop need not be rerun.
# open(..., 'wb') does not create missing parent directories, so make sure
# 'pickles/' exists first (a no-op when it is already there).
os.makedirs('pickles', exist_ok=True)
with open('pickles/crime_df_dict.pkl', 'wb') as file:
    pickle.dump(crime_df_dict, file)

In [7]:
# Restore the year -> dataframe mapping from its on-disk cache.
# Unpickling can execute arbitrary code, so only load a file you created yourself.
with open('pickles/crime_df_dict.pkl', 'rb') as pickled_dict:
    crime_df_dict = pickle.load(pickled_dict)

In [8]:
# Stack every yearly frame into a single table, in the dict's iteration order.
# Row labels are carried over from the yearly frames (no ignore_index).
all_crime_df = pd.concat([crime_df_dict[year] for year in crime_df_dict])

In [9]:
# Preview the first rows of the combined table as a quick sanity check.
all_crime_df.head()

Unnamed: 0,year,month,fips,population,murder,manslaugther,rape,aggravated assault,simple assault,robbery,burglary,larceny,vehicle theft
0,1980,january,2020,173992.0,1.0,0.0,11.0,18.0,72.0,24.0,213.0,451.0,94.0
1,1980,february,2020,173992.0,0.0,0.0,12.0,25.0,101.0,31.0,259.0,473.0,60.0
2,1980,march,2020,173992.0,2.0,0.0,8.0,15.0,120.0,28.0,262.0,453.0,85.0
3,1980,april,2020,173992.0,0.0,0.0,9.0,28.0,87.0,19.0,184.0,654.0,75.0
4,1980,may,2020,173992.0,2.0,0.0,17.0,22.0,116.0,25.0,203.0,645.0,69.0


In [10]:
# Cache the combined all-years frame on disk.
# open(..., 'wb') does not create missing parent directories, so make sure
# 'pickles/' exists first (a no-op when it is already there).
os.makedirs('pickles', exist_ok=True)
with open('pickles/all_crime_df.pkl', 'wb') as file:
    pickle.dump(all_crime_df, file)

In [11]:
# Restore the combined all-years frame from its on-disk cache.
# Unpickling can execute arbitrary code, so only load a file you created yourself.
with open('pickles/all_crime_df.pkl', 'rb') as pickled_frame:
    all_crime_df = pickle.load(pickled_frame)