## Background

The FBI publishes incident-based data by state, summary data with estimates, and data on specific topics such as assaults on law enforcement officers, hate crime, and human trafficking, all of which are available for download as CSV files. The same UCR data is also available through the Crime Data API. The Uniform Crime Reporting (UCR) Program provided updated data for 2017 on September 24, 2018.

## Data Source

The Massachusetts (MA) zip files for 1994 through 2017 were sourced from https://crime-data-explorer.fr.cloud.gov/downloads-and-docs and placed into the data/NIBRS folder.  


In [13]:
# Create the folder for the downloaded archives (no error if it already exists).
# NOTE(review): this path is ../data/NIBRS, but the download and processing
# cells use data/NIBRS — confirm which location is intended.
!mkdir -p ../data/NIBRS

In [15]:
import wget
from tqdm import tqdm, tnrange, tqdm_notebook

import os

# Folder the yearly archives are written to; process_year reads from the same path.
output_directory = "data/NIBRS"
# NOTE(review): plain-HTTP endpoint — consider HTTPS if the host supports it.
host = "http://s3-us-gov-west-1.amazonaws.com"
# One object key per yearly Massachusetts archive, newest first.
# 2016-2017 live under a different bucket prefix than 1994-2015.
paths = [
    "cg-d4b776d0-d898-4153-90c8-8336f86bdfec/2017/MA-2017.zip",
    "cg-d4b776d0-d898-4153-90c8-8336f86bdfec/2016/MA-2016.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2015/MA-2015.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2014/MA-2014.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2013/MA-2013.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2012/MA-2012.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2011/MA-2011.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2010/MA-2010.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2009/MA-2009.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2008/MA-2008.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2007/MA-2007.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2006/MA-2006.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2005/MA-2005.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2004/MA-2004.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2003/MA-2003.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2002/MA-2002.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2001/MA-2001.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/2000/MA-2000.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1999/MA-1999.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1998/MA-1998.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1997/MA-1997.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1996/MA-1996.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1995/MA-1995.zip",
    "cg-d3f0433b-a53e-4934-8b94-c678aa2cbaf3/1994/MA-1994.zip"]
urls = [f"{host}/{p}" for p in paths]

# Make sure the target folder exists before the first download: `out` is meant
# to be a folder, and the earlier mkdir cell creates ../data/NIBRS, which is a
# different path from output_directory.
os.makedirs(output_directory, exist_ok=True)

# Kept as the cell's final expression so the notebook displays the saved paths.
[wget.download(url, out=output_directory) for url in tqdm_notebook(urls)]

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

['../data/NIBRS/MA-2017.zip',
 '../data/NIBRS/MA-2016.zip',
 '../data/NIBRS/MA-2015.zip',
 '../data/NIBRS/MA-2014.zip',
 '../data/NIBRS/MA-2013.zip',
 '../data/NIBRS/MA-2012.zip',
 '../data/NIBRS/MA-2011.zip',
 '../data/NIBRS/MA-2010.zip',
 '../data/NIBRS/MA-2009.zip',
 '../data/NIBRS/MA-2008.zip',
 '../data/NIBRS/MA-2007.zip',
 '../data/NIBRS/MA-2006.zip',
 '../data/NIBRS/MA-2005.zip',
 '../data/NIBRS/MA-2004.zip',
 '../data/NIBRS/MA-2003.zip',
 '../data/NIBRS/MA-2002.zip',
 '../data/NIBRS/MA-2001.zip',
 '../data/NIBRS/MA-2000.zip',
 '../data/NIBRS/MA-1999.zip',
 '../data/NIBRS/MA-1998.zip',
 '../data/NIBRS/MA-1997.zip',
 '../data/NIBRS/MA-1996.zip',
 '../data/NIBRS/MA-1995.zip',
 '../data/NIBRS/MA-1994.zip']

The following code cleans up the data and attempts to join the tables relevant for answering our questions on expungement.

In [16]:
import os
import pandas as pd
import zipfile

def normalize_columns(df):
    """Clean up columns to allow for joining across data frames more easily.

    Column names are lower-cased, and the bookkeeping columns
    ``ff_line_number``, ``hc_flag`` and ``data_year`` are removed when present.

    Args:
        df: DataFrame to normalize. It is left untouched; the result is a new
            frame, so callers must use the return value.

    Returns:
        A new DataFrame with lower-cased column names and without the
        bookkeeping columns listed above.
    """
    # rename() returns a new frame, so the caller's column labels are not
    # rewritten in place as a side effect.
    lowered = df.rename(columns=str.lower)
    # errors="ignore" skips any of these columns that the table does not have.
    return lowered.drop(
        columns=["ff_line_number", "hc_flag", "data_year"],
        errors="ignore",
    )

def process_year(year):
    """Processes a single year's worth of CSVs into a single dataframe.

    Opens ``data/NIBRS/MA-{year}.zip``, loads every ``.csv`` member through
    ``normalize_columns``, and merges the incident table with the offense,
    arrestee, weapon, age, arrest type, ethnicity, victim, injury and agency
    tables.

    Args:
        year: Year used to build the archive file name.

    Returns:
        The merged DataFrame for that year.

    Raises:
        KeyError: If a table named in the join plan is not in the archive.
    """
    tables = {}
    # Context managers close the archive and every member handle once the
    # CSVs have been read, instead of leaving them open.
    with zipfile.ZipFile(f'data/NIBRS/MA-{year}.zip') as archive:
        for member in archive.namelist():
            if not member.endswith(".csv"):
                continue
            # Table key: lower-cased member name without ".csv" and "ma/".
            table_name = member.lower().replace(".csv", "").replace("ma/", "")
            with archive.open(member) as handle:
                tables[table_name] = normalize_columns(
                    pd.read_csv(handle, low_memory=False, encoding='latin')
                )

    # (table, join keys) applied in order, starting from the incident table.
    join_plan = [
        ("nibrs_offense", ["incident_id"]),
        ("nibrs_offense_type", ["offense_type_id"]),
        ("nibrs_arrestee", ["incident_id", "offense_type_id"]),
        ("nibrs_arrestee_weapon", ["arrestee_id"]),
        ("nibrs_weapon_type", ["weapon_id"]),
        ("nibrs_age", ["age_id"]),
        ("nibrs_arrest_type", ["arrest_type_id"]),
        ("nibrs_ethnicity", ["ethnicity_id"]),
        ("nibrs_victim_offense", ["offense_id"]),
        ("nibrs_victim_injury", ["victim_id"]),
        ("nibrs_injury", ["injury_id"]),
        ("agency_participation", ["agency_id"]),
    ]

    df = tables["nibrs_incident"]
    for table_name, keys in join_plan:
        # NOTE(review): suffixes=(False, False) presumably makes pandas raise
        # on overlapping non-key columns instead of renaming them — confirm.
        df = df.merge(tables[table_name], on=keys, suffixes=(False, False))
    return df

In [17]:
# Build one large dataframe from the yearly frames for 1995 through 2015
# (the range end, 2016, is exclusive).
# 2016 and 2017 are left out while their formatting changes are worked through.
yearly_frames = (process_year(year) for year in tnrange(1995, 2016))
df = pd.concat(yearly_frames, sort=True)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

In [18]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,age_code,age_id,age_name,age_num,age_range_high_num,age_range_low_num,agency_id,agency_name,agency_ori,agency_population,...,shr_flag,state_abbr,state_name,submission_date,under_18_disposition_code,victim_id,weapon_code,weapon_id,weapon_name,year
0,AG,5,Age in Years,35.0,,,7859,Worcester,MA0146000,166290,...,N,MA,Massachusetts,,,3572601,1,1,Unarmed,1995
1,AG,5,Age in Years,41.0,,,7859,Worcester,MA0146000,166290,...,N,MA,Massachusetts,,,3582954,1,1,Unarmed,1995
2,AG,5,Age in Years,37.0,,,7859,Worcester,MA0146000,166290,...,N,MA,Massachusetts,,,3584222,1,1,Unarmed,1995
3,AG,5,Age in Years,48.0,,,7859,Worcester,MA0146000,166290,...,N,MA,Massachusetts,,,3584124,1,1,Unarmed,1995
4,AG,5,Age in Years,32.0,,,7859,Worcester,MA0146000,166290,...,N,MA,Massachusetts,,,3585238,1,1,Unarmed,1995


In [19]:
# List the column labels of the combined frame.
df.columns

Index(['age_code', 'age_id', 'age_name', 'age_num', 'age_range_high_num',
       'age_range_low_num', 'agency_id', 'agency_name', 'agency_ori',
       'agency_population', 'arrest_date', 'arrest_num', 'arrest_type_code',
       'arrest_type_id', 'arrest_type_name', 'arrestee_id', 'arrestee_seq_num',
       'attempt_complete_flag', 'cargo_theft_flag', 'clearance_ind',
       'cleared_except_date', 'cleared_except_id', 'covered', 'crime_against',
       'ct_flag', 'data_home', 'ddocname', 'did', 'ethnicity_code',
       'ethnicity_id', 'ethnicity_name', 'hc_code', 'incident_date',
       'incident_hour', 'incident_id', 'incident_number', 'incident_status',
       'injury_code', 'injury_id', 'injury_name', 'location_id',
       'method_entry_code', 'months_reported', 'multiple_indicator',
       'nibrs_arrestee_weapon_id', 'nibrs_month_id', 'nibrs_months_reported',
       'nibrs_participated', 'nibrs_reported', 'num_premises_entered',
       'offense_category_name', 'offense_code', 'offen

In [20]:
# Count the non-null incident_date values recorded for each arrestee_id and
# show the result ordered by that count, smallest first.
arrest_dates = df[["arrestee_id", "incident_date"]]
per_arrestee = arrest_dates.groupby(by=['arrestee_id']).agg(["count"])
per_arrestee.sort_values(by=("incident_date", "count"))

Unnamed: 0_level_0,incident_date
Unnamed: 0_level_1,count
arrestee_id,Unnamed: 1_level_2
690533,1
15603271,1
15603270,1
15603269,1
15603267,1
15603266,1
15603263,1
15603262,1
15603261,1
15603256,1
