# Hate Crime Data Extraction and Transformation

1. Create a dataframe from the FBI hate crimes spreadsheet and explore it
2. Create a dataframe for each required database table and export it to a csv file

In [46]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import requests
from pathlib import Path

## Create/Explore FBI Hate Crime Dataframe

In [9]:
# Location of the FBI hate crime extract, relative to this notebook
path = Path('../resources/fbi_hate_crime_data.csv')

# Read the extract; the first CSV column becomes the dataframe index
crime_data_df = pd.read_csv(filepath_or_buffer=path, index_col=0)

# Preview the first few rows
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,8D,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S


In [10]:
# Keep only rows whose data_year is strictly greater than 2008
crime_data_df = crime_data_df.loc[crime_data_df['data_year'].gt(2008)]

# Preview the first few remaining rows
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141003,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141004,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Unknown,Not Specified,1,Intimidation,1.0,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,S,S
141005,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141006,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Multiple,Not Specified,4,Aggravated Assault,4.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141007,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Asian,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S


In [11]:
# Show the dtype pandas assigned to each column of the filtered dataframe
crime_data_df.dtypes

data_year                         int64
ori                              object
pug_agency_name                  object
pub_agency_unit                  object
agency_type_name                 object
state_abbr                       object
state_name                       object
division_name                    object
region_name                      object
population_group_code            object
population_group_description     object
incident_date                    object
adult_victim_count              float64
juvenile_victim_count           float64
total_offender_count              int64
adult_offender_count            float64
juvenile_offender_count         float64
offender_race                    object
offender_ethnicity               object
victim_count                      int64
offense_name                     object
total_individual_victims        float64
location_name                    object
bias_desc                        object
victim_types                     object


## Create CSV Files for Database Tables

### Table: agency_types

In [19]:
# Build the agency_types lookup table: one row per distinct agency type,
# with the positional index named agency_type_id
agency_types_df = pd.DataFrame(
    {'agency_type': crime_data_df['agency_type_name'].unique()}
).rename_axis('agency_type_id')

# Relabel selected ids and sort so the rows appear in the intended order.
# NOTE(review): the mapping is tied to the order unique() returned the values
# in; it must be revisited if the source data changes.
agency_type_id_remap = {5: 2, 4: 3, 7: 4, 2: 5, 3: 7}
agency_types_df = agency_types_df.rename(index=agency_type_id_remap).sort_index()

# Report the longest agency_type string
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
agency_type_max_len = agency_types_df['agency_type'].map(len).max()
print(f"Max column size - agency_type: {agency_type_max_len}")

# Display dataframe
agency_types_df

Max column size - agency_type: 21


Unnamed: 0_level_0,agency_type
agency_type_id,Unnamed: 1_level_1
0,City
1,County
2,State Police
3,Other State Agency
4,Federal
5,University or College
6,Tribal
7,Other


In [21]:
# Write the agency_types lookup table (index included) to its csv file
agency_types_csv = 'data/agency_types.csv'
agency_types_df.to_csv(agency_types_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: agencies

In [36]:
# Build the agencies lookup table: one row per distinct agency name,
# with the positional index named agency_id
agencies_df = pd.DataFrame(
    {'agency': crime_data_df['pug_agency_name'].unique()}
).rename_axis('agency_id')

# Report the longest agency string
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
agency_max_len = agencies_df['agency'].map(len).max()
print(f"Max column size - agency: {agency_max_len}")

# Display dataframe
agencies_df

Max column size - agency: 67


Unnamed: 0_level_0,agency
agency_id,Unnamed: 1_level_1
0,Anchorage
1,Bethel
2,Hoover
3,Mobile
4,Leesburg
...,...
5308,Fall Creek
5309,Monona
5310,Two Rivers
5311,Ronceverte


In [None]:
# Write the agencies lookup table (index included) to its csv file
agencies_csv = 'data/agencies.csv'
agencies_df.to_csv(agencies_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

### Table: agency_oris

In [54]:
# Build the agency_oris table: one row per ORI with its agency name, unit and type.
# as_index=False keeps 'ori' as a regular column instead of moving it into the index.
# Code Reference for as_index:
# https://stackoverflow.com/questions/21767900/how-to-move-pandas-data-from-index-to-column-after-multiple-groupby
ori_source_columns = ['ori', 'pug_agency_name', 'pub_agency_unit', 'agency_type_name']
ori_column_names = {
    'pug_agency_name': 'agency',
    'pub_agency_unit': 'agency_unit',
    'agency_type_name': 'agency_type',
}

# Note: GroupBy.first() takes the first non-null value of each column within a
# group, so values in one output row may come from different source rows.
agency_oris_df = (
    crime_data_df[ori_source_columns]
    .groupby('ori', as_index=False, dropna=False)
    .first()
    .rename(columns=ori_column_names)
)

# Display dataframe
agency_oris_df

Unnamed: 0,ori,agency,agency_unit,agency_type
0,AK0010100,Anchorage,,City
1,AK0010200,Fairbanks,,City
2,AK0010300,Juneau,,City
3,AK0010600,Nome,,City
4,AK0010700,Petersburg,,City
...,...,...,...,...
7664,WY0190100,Green River,,City
7665,WY0190200,Rock Springs,,City
7666,WY0200100,Jackson,,City
7667,WY0210100,Evanston,,City


In [55]:
# Read the exported lookup tables back from disk so their id columns
# (written as the csv index) are available as regular columns for joining
agencies = pd.read_csv('data/agencies.csv')
agency_types = pd.read_csv('data/agency_types.csv')

# Attach agency_id and agency_type_id by joining on the text values, then
# drop the text columns now that the ids are present.
# NOTE(review): how='right' keeps every lookup row, so a lookup value with no
# matching ORI would produce a row with a missing 'ori'; confirm that a
# left/inner join is not what is wanted here.
agency_oris_df = (
    agency_oris_df
    .merge(agencies, how='right', on='agency')
    .merge(agency_types, how='right', on='agency_type')
    .drop(columns=['agency', 'agency_type'])
)

# Display dataframe
agency_oris_df

Unnamed: 0,ori,agency_unit,agency_id,agency_type_id
0,AK0010100,,0,0
1,KY0560500,,0,0
2,AK0011300,,1,0
3,CT0000900,,1,0
4,AL0011200,,2,0
...,...,...,...,...
7664,OR0260300,,5225,7
7665,TX1700400,Precinct 1,5262,7
7666,TX1700500,Precinct 3,5262,7
7667,TX1015700,,5274,7


In [61]:
# Put the columns in their final order and promote 'ori' to the index
agency_oris_columns = ['ori', 'agency_id', 'agency_unit', 'agency_type_id']
agency_oris_df = agency_oris_df.loc[:, agency_oris_columns].set_index('ori')

# Display dataframe
agency_oris_df

Unnamed: 0_level_0,agency_id,agency_unit,agency_type_id
ori,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK0010100,0,,0
KY0560500,0,,0
AK0011300,1,,0
CT0000900,1,,0
AL0011200,2,,0
...,...,...,...
OR0260300,5225,,7
TX1700400,5262,Precinct 1,7
TX1700500,5262,Precinct 3,7
TX1015700,5274,,7


In [62]:
# Write the agency_oris table (indexed by ori) to its csv file
agency_oris_csv = 'data/agency_oris.csv'
agency_oris_df.to_csv(agency_oris_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: states

In [63]:
# Build the states table: one row per state abbreviation, taking the first
# value of each descriptive column within that state
state_source_columns = ['state_abbr', 'state_name', 'division_name', 'region_name']
states_df = crime_data_df[state_source_columns].groupby('state_abbr').first()

# Shorter column names for the exported table
states_renamed_df = states_df.rename(
    columns={'state_name': 'state', 'division_name': 'division', 'region_name': 'region'}
)

# Print maximum size of each column for database schema
state_name_max_len = states_df['state_name'].map(len).max()
division_name_max_len = states_df['division_name'].map(len).max()
region_name_max_len = states_df['region_name'].map(len).max()
print(f"Max column size - state_name: {state_name_max_len}")
print(f"Max column size - division_name: {division_name_max_len}")
print(f"Max column size - region_name: {region_name_max_len}")

# Display sample data
states_renamed_df.head()

Max column size - state_name: 20
Max column size - division_name: 18
Max column size - region_name: 16


Unnamed: 0_level_0,state,division,region
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,Pacific,West
AL,Alabama,East South Central,South
AR,Arkansas,West South Central,South
AZ,Arizona,Mountain,West
CA,California,Pacific,West


In [64]:
# Write the renamed states table (indexed by state_abbr) to its csv file
states_csv = 'data/states.csv'
states_renamed_df.to_csv(states_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: population_groups

In [65]:
# Build the population_groups table: one row per population group code,
# taking the first description found for each code
pop_group_source_columns = ['population_group_code', 'population_group_description']
pop_groups_df = crime_data_df[pop_group_source_columns].groupby('population_group_code').first()

# Shorter column name for the final table
pop_groups_final_df = pop_groups_df.rename(
    columns={'population_group_description': 'population_group'}
)

# Report the longest population_group string
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
population_group_max_len = pop_groups_final_df['population_group'].map(len).max()
print(f"Max column size - population_group: {population_group_max_len}")

# Display dataframe
pop_groups_final_df

Max column size - population_group: 67


Unnamed: 0_level_0,population_group
population_group_code,Unnamed: 1_level_1
0,"Possessions (Puerto Rico, Guam, Virgin Islands..."
1A,"Cities 1,000,000 or over"
1B,"Cities from 500,000 thru 999,999"
1C,"Cities from 250,000 thru 499,999"
2,"Cities from 100,000 thru 249,999"
3,"Cities from 50,000 thru 99,999"
4,"Cities from 25,000 thru 49,999"
5,"Cities from 10,000 thru 24,999"
6,"Cities from 2,500 thru 9,999"
7,"Cities under 2,500"


In [66]:
# Export the population_groups table to csv.
# Write pop_groups_final_df (the frame whose column was renamed to
# 'population_group'), not the intermediate pop_groups_df, so the csv header
# matches the column name that was displayed and measured for this table.
pop_groups_final_df.to_csv('data/population_groups.csv')

# Confirm that export completed (reached only if to_csv did not raise)
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: race

In [None]:
# Build the race lookup table: one row per distinct offender_race value,
# with the positional index named race_id
race_df = pd.DataFrame(crime_data_df['offender_race'].unique(), columns=['race'])
race_df.index.name = 'race_id'

# Report the longest string in the 'race' column. The label names the column
# actually being measured ('race'), not the index ('race_id').
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - race: {race_df['race'].map(len).max()}")

# Display dataframe
race_df

In [None]:
# Reorder and tidy the race lookup table.
# Relabel selected race_ids and sort so the rows follow US census ordering.
# NOTE(review): the mapping depends on the order unique() produced in the
# previous cell, and this cell is not safe to run twice (row 7 is dropped).
census_id_remap = {5: 1, 4: 2, 6: 4, 2: 5, 1: 6}
race_df = race_df.rename(index=census_id_remap).sort_index()

# Overwrite row 6 with a combined label, then remove row 7
race_df.loc[6] = 'Unknown or Not Specified'
race_df = race_df.drop(index=[7])

# Display dataframe
race_df

In [None]:
# Write the race lookup table (index included) to its csv file
race_csv = 'data/race.csv'
race_df.to_csv(race_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')