## Uniform Crime Reporting Program - FBI

Scrape data from https://ucr.fbi.gov/ucr-publications corresponding to the number of murders in certain metropolitan regions from 2006 to 2016. For each year, you will programmatically navigate to violent crimes, and then to murders, and scrape the table corresponding to Metropolitan Statistical Areas (MSAs). Along with the numerical data, you will also want to scrape and save the text descriptions on violent crimes and murders.

In [1]:
import re
import urllib
import urllib.request
import warnings

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
def fbi_url_generator(year):
    """
    URL Generator builds the url of the xls file holding the crime table by
    Metropolitan Statistical Area for a given year.

    The location and file name of the table changed several times between
    2006 and 2016, so each range of years has its own url pattern.

    INPUT:
    year - publication year as an integer between 2006 and 2016 (inclusive)
    OUTPUT:
    url - url of the xls file for that year
    RAISES:
    ValueError - if year is outside the supported 2006-2016 range
    """
    year_str = str(year)

    # 2006-2009 are served from a different host, with a two-digit year
    # embedded in the file name.
    if 2006 <= year <= 2009:
        return 'https://www2.fbi.gov/ucr/cius' + year_str + '/data/documents/' + year_str[2:] + 'tbl06.xls'

    hostname = 'https://ucr.fbi.gov/crime-in-the-u.s/' + year_str + '/crime-in-the-u.s.-' + year_str
    if 2010 <= year <= 2011:
        return hostname + '/tables/table-6/output.xls'
    if 2012 <= year <= 2013:
        return hostname + '/tables/6tabledatadecpdf/table-6/output.xls'
    if year == 2014:
        return hostname + '/tables/table-6/Table_6_Crime_in_the_United_States_by_Metropolitan_Statistical_Area_' + year_str + '/output.xls'
    if year == 2015:
        return hostname + '/tables/table-6/table_6_crime_in_the_united_states_by_metropolitan_statistical_area_' + year_str + '.xls/output.xls'
    if year == 2016:
        return hostname + '/tables/table-4/table-4/output.xls'

    # Previously an unsupported year fell through every branch and failed
    # with an UnboundLocalError on the return statement.
    raise ValueError('No url pattern is known for year ' + year_str + '; supported years are 2006-2016')

In [3]:
def df_merger(input,output):
    """
    DataFrame Merger appends a new batch of rows to an accumulated dataframe.

    The returned dataframe always has a fresh 0..n-1 index, including on the
    very first merge (previously the first merge kept the index of the input,
    and only later merges reset it). Neither argument is modified.

    INPUT:
    input - DataFrame holding the rows to append
    output - DataFrame accumulated so far; when it is empty its contents are
             ignored and the result is built from input alone
    OUTPUT:
    output - new DataFrame with the rows of output followed by the rows of input
    """
    if output.empty:
        # Nothing accumulated yet: start from the input. reset_index returns
        # a new object, so the caller's dataframe is left untouched.
        return input.reset_index(drop=True)
    # ignore_index renumbers the combined rows in one step.
    return pd.concat([output, input], ignore_index=True)

In [4]:
#Define user agent sent with every download request
version='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36'

#Names given to the first 12 columns of every yearly table
COLUMN_NAMES=['Metropolitan Statistical Area', 'Counties/principal cities','Population', 'Violent crime','Murder and nonnegligent manslaughter', 'Rape1', 'Robbery','Aggravated assault', 'Property crime', 'Burglary', 'Larceny-theft','Motor vehicle theft']
MSA_COL='Metropolitan Statistical Area'
LABEL_COL='Counties/principal cities'

#Define empty dataframes; each one accumulates one kind of row across all years
population=pd.DataFrame()
msa_description=pd.DataFrame()
total_crime_by_MSA=pd.DataFrame()
crime_rate_by_MSA=pd.DataFrame()
crime_by_city_in_MSA=pd.DataFrame()

for year in range(2006,2017):
    #Fetch the xls file from the web server and save it as output.xls
    #(urllib.request.URLopener is deprecated, so a Request with an explicit header is used instead)
    request=urllib.request.Request(fbi_url_generator(year),headers={'User-Agent':version})
    with urllib.request.urlopen(request) as response, open('output.xls','wb') as xls_file:
        xls_file.write(response.read())

    #Read xls file into a dataframe. Exclude the first two rows since they are part of the title
    input_df=pd.read_excel("output.xls",skiprows=[0,1],header=1)
    #Drop unwanted columns and rename columns. The copy makes the later in-place edits act on
    #an independent dataframe instead of a slice of the freshly read one
    input_df=input_df.iloc[:,0:12].copy()
    input_df.columns=COLUMN_NAMES
    #Add year
    input_df['year']=year

    #POPULATION EXTRACTION
    #Rows that name an MSA; the ones that also carry a population are kept
    population_stg=input_df.loc[input_df[MSA_COL].notnull(),[MSA_COL,'Population','year']].copy()
    population_stg.dropna(how='any',inplace=True)      #Remove any rows without a population
    population_stg.reset_index(drop=True,inplace=True) #Reset index
    #Merge Results
    population=df_merger(population_stg,population)
    #Fill all cells of MSA: the name is only present on the first row of each MSA,
    #so it is carried forward to the rows that follow
    input_df[MSA_COL]=input_df[MSA_COL].ffill()
    #Drop population lines (no label in the second column, but a population value)
    population_rows=input_df[LABEL_COL].isnull() & input_df['Population'].notnull()
    input_df.drop(input_df[population_rows].index,inplace=True)

    #MSA DESCRIPTION EXTRACTION
    description_rows=input_df[LABEL_COL].str.startswith('Includes', na=False)
    msa_description_stg=input_df.loc[description_rows,[MSA_COL,LABEL_COL]].copy()
    msa_description_stg.reset_index(drop=True,inplace=True)
    #Merge Results
    msa_description=df_merger(msa_description_stg,msa_description)
    #Drop Description rows
    input_df.drop(input_df[description_rows].index,inplace=True)

    #REMOVE FOOTNOTES (rows whose first column starts with a digit)
    input_df.drop(input_df[input_df[MSA_COL].str[0].str.isdigit()].index,inplace=True)

    #TOTAL CRIME BY MSA EXTRACTION
    #NOTE(review): presumably the Population cell of the Total/Estimated rows holds the
    #reporting coverage as a fraction, so ==1 keeps the fully covered row of each MSA - confirm
    total_crime_by_MSA_stg=input_df[input_df['Population']==1].copy()
    #First letter of the row label: 'T' for Total..., 'E' for Estimated...
    total_crime_by_MSA_stg['Total vs Estimated']=total_crime_by_MSA_stg[LABEL_COL].str[0]
    total_crime_by_MSA_stg.drop(['Population',LABEL_COL], axis=1,inplace=True)
    total_crime_by_MSA_stg.reset_index(drop=True,inplace=True)
    total_crime_by_MSA=df_merger(total_crime_by_MSA_stg,total_crime_by_MSA)
    #Drop every Total/Estimated row, including the ones not selected above
    total_rows=input_df[LABEL_COL].str.startswith(('Total','Estimated'), na=False)
    input_df.drop(input_df[total_rows].index,inplace=True)

    #CRIME RATE BY MSA EXTRACTION
    rate_rows=input_df[LABEL_COL].str.startswith('Rate', na=False)
    crime_rate_by_MSA_stg=input_df[rate_rows].copy()
    crime_rate_by_MSA_stg.drop(['Population',LABEL_COL], axis=1,inplace=True)
    crime_rate_by_MSA_stg.reset_index(drop=True,inplace=True)
    crime_rate_by_MSA=df_merger(crime_rate_by_MSA_stg,crime_rate_by_MSA)
    input_df.drop(input_df[rate_rows].index,inplace=True)

    #CRIME BY CITIES WITHIN MSA EXTRACTION
    #Whatever is left after the removals above is treated as the per-city rows
    crime_by_city_in_MSA=df_merger(input_df,crime_by_city_in_MSA)

In [5]:
# Preview the first four rows of the population table (MSA name, population, year).
population.head(4)

Unnamed: 0,Metropolitan Statistical Area,Population,year
0,"Abilene, TX M.S.A.1",162776,2006
1,"Albany, GA M.S.A.",168071,2006
2,"Albany-Schenectady-Troy, NY M.S.A.",851151,2006
3,"Albuquerque, NM M.S.A.",808790,2006


In [6]:
# TODO: remove duplicates - the description rows of every year are appended
# without a year column or any de-duplication, so an MSA can appear repeatedly.
msa_description.head(4)

Unnamed: 0,Metropolitan Statistical Area,Counties/principal cities
0,"Abilene, TX M.S.A.1","Includes Callahan,1 Jones, and Taylor Counties"
1,"Albany, GA M.S.A.","Includes Baker, Dougherty, Lee, Terrell, and W..."
2,"Albany-Schenectady-Troy, NY M.S.A.","Includes Albany, Rensselaer, Saratoga, Schenec..."
3,"Albuquerque, NM M.S.A.","Includes Bernalillo, Sandoval, Torrance, and V..."


In [7]:
# Preview the MSA-level crime counts; 'Total vs Estimated' holds the first
# letter of the source row label.
total_crime_by_MSA.head(4)

Unnamed: 0,Metropolitan Statistical Area,Violent crime,Murder and nonnegligent manslaughter,Rape1,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,year,Total vs Estimated
0,"Abilene, TX M.S.A.1",638,6,75,109,448,5741,1531,3852,358,2006,T
1,"Albany, GA M.S.A.",676,11,37,265,363,7171,2162,4494,515,2006,E
2,"Albany-Schenectady-Troy, NY M.S.A.",3165,18,207,964,1976,23349,4849,17314,1186,2006,T
3,"Albuquerque, NM M.S.A.",6291,72,420,1410,4389,39023,8972,23336,6715,2006,E


In [8]:
# Preview the rows whose label starts with 'Rate', one per MSA and year.
crime_rate_by_MSA.head(4)

Unnamed: 0,Metropolitan Statistical Area,Violent crime,Murder and nonnegligent manslaughter,Rape1,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,year
0,"Abilene, TX M.S.A.1",391.9,3.7,46.1,67.0,275.2,3526.9,940.6,2366.4,219.9,2006
1,"Albany, GA M.S.A.",402.2,6.5,22.0,157.7,216.0,4266.6,1286.4,2673.9,306.4,2006
2,"Albany-Schenectady-Troy, NY M.S.A.",371.8,2.1,24.3,113.3,232.2,2743.2,569.7,2034.2,139.3,2006
3,"Albuquerque, NM M.S.A.",777.8,8.9,51.9,174.3,542.7,4824.9,1109.3,2885.3,830.3,2006


In [9]:
# Preview the rows left once the population, description, footnote,
# total/estimated and rate rows have been removed.
crime_by_city_in_MSA.head(4)

Unnamed: 0,Metropolitan Statistical Area,Counties/principal cities,Population,Violent crime,Murder and nonnegligent manslaughter,Rape1,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,year
0,"Abilene, TX M.S.A.1",City of Abilene,118009,554,5,67,107,375,5045,1282,3460,303,2006
1,"Albany, GA M.S.A.",City of Albany,77815,553,8,31,243,271,5279,1645,3235,399,2006
2,"Albany-Schenectady-Troy, NY M.S.A.",City of Albany,93773,1217,5,50,388,774,4820,1058,3521,241,2006
3,"Albany-Schenectady-Troy, NY M.S.A.",City of Schenectady,61444,712,6,52,309,345,3449,1119,1994,336,2006
