# Hate Crime Data Extraction and Transformation

1. Create a dataframe from the FBI hate crimes spreadsheet and explore it
2. Create dataframes for the needed database tables and export them to csv files

In [73]:
# Dependencies and Setup
import pandas as pd
import requests
from pathlib import Path

## Create/Explore FBI Hate Crime Dataframe

In [74]:
# Location of the FBI hate crime spreadsheet, relative to this notebook
path = Path('..') / 'resources' / 'hate_crimes_from_FBI.xlsx'

# Read the spreadsheet with openpyxl, using its first column as the row index
crime_data_df = pd.read_excel(
    path,
    index_col=0,
    engine='openpyxl',
)

# Preview the first rows
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,8D,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S


In [66]:
# Keep only incidents whose data_year is after 2008
is_after_2008 = crime_data_df['data_year'] > 2008
crime_data_df = crime_data_df[is_after_2008]

# Preview the filtered rows
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141003,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141004,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Unknown,Not Specified,1,Intimidation,1.0,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,S,S
141005,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141006,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Multiple,Not Specified,4,Aggravated Assault,4.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141007,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Asian,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S


In [None]:
# Show the dtype of every column in the hate crime dataframe
crime_data_df.dtypes

## Create CSV Files for Database Tables

### Table: agency_types

In [None]:
# Build the agency_types lookup: one row per distinct agency_type_name value,
# with the positional index named agency_type_id
distinct_agency_types = crime_data_df['agency_type_name'].unique()
agency_types_df = pd.DataFrame({'agency_type': distinct_agency_types}).rename_axis('agency_type_id')

# Longest agency_type string, printed to help size the database column
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
longest_agency_type = agency_types_df['agency_type'].map(len).max()
print(f"Max column size - agency_type: {longest_agency_type}")

agency_types_df

In [None]:
# Write the agency_types lookup table to a csv file
agency_types_csv = 'data/agency_types.csv'
agency_types_df.to_csv(agency_types_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

### Table: states

In [70]:
# Build the states lookup: one row per state_abbr, taking the first value
# seen for each of the remaining columns
state_columns = ['state_abbr', 'state_name', 'division_name', 'region_name']
states_df = crime_data_df[state_columns].groupby('state_abbr').first()

# Shorter column names for the database table
states_renamed_df = states_df.rename(
    columns={
        'state_name': 'state',
        'division_name': 'division',
        'region_name': 'region',
    }
)

# Longest string in each text column, printed to help size the database schema
for column in ('state_name', 'division_name', 'region_name'):
    print(f"Max column size - {column}: {states_df[column].map(len).max()}")

# Preview the renamed table
states_renamed_df.head()

Max column size - state_name: 20
Max column size - division_name: 18
Max column size - region_name: 16


Unnamed: 0_level_0,state,division,region
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,Pacific,West
AL,Alabama,East South Central,South
AR,Arkansas,West South Central,South
AZ,Arizona,Mountain,West
CA,California,Pacific,West


In [71]:
# Write the renamed states lookup table to a csv file
states_csv = 'data/states.csv'
states_renamed_df.to_csv(states_csv)

# Reached only if the write above did not raise
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: agencies

In [55]:
# Explore candidate groupings for the agencies table.
# Three groupings are built so their row counts can be compared:
#   agencies_df  - one row per agency name
#   agency_oris  - one row per ori
#   agency_units - one row per (agency name, agency unit) pair
# dropna=False keeps rows whose grouping key is missing as their own group, and
# as_index=False leaves the grouping keys as ordinary columns.
# Code Reference for as_index:
# https://stackoverflow.com/questions/21767900/how-to-move-pandas-data-from-index-to-column-after-multiple-groupby
agency_name_columns = ['pug_agency_name', 'pub_agency_unit', 'agency_type_name']
agency_ori_columns = ['ori'] + agency_name_columns

agencies_df = crime_data_df.loc[:, agency_name_columns].groupby('pug_agency_name', as_index=False, dropna=False).first()
agency_oris = crime_data_df.loc[:, agency_ori_columns].groupby('ori', as_index=False, dropna=False).first()
agency_units = crime_data_df.loc[:, agency_ori_columns].groupby(['pug_agency_name', 'pub_agency_unit'], as_index=False, dropna=False).first()

# Print the number of rows produced by each grouping
print(len(agencies_df))
print(len(agency_oris))
print(len(agency_units))

# Restrict the per-ori grouping to city agencies, ordered by ori, and show a sample
city_agency_oris = agency_oris[agency_oris['agency_type_name'] == 'City'].sort_values(by='ori')
print(len(city_agency_oris))
city_agency_oris.head(20)

5313
7669
5780
5123


Unnamed: 0,ori,pug_agency_name,pub_agency_unit,agency_type_name
0,AK0010100,Anchorage,,City
1,AK0010200,Fairbanks,,City
2,AK0010300,Juneau,,City
3,AK0010600,Nome,,City
4,AK0010700,Petersburg,,City
5,AK0010800,Seward,,City
6,AK0011300,Bethel,,City
7,AK0011600,Kotzebue,,City
8,AK0011700,Palmer,,City
9,AK0012000,Soldotna,,City


In [None]:
# Export agencies dataframe to csv.
# This cell belongs to the agencies table, so it must not write to
# 'data/states.csv' (that file is produced by the states export cell).
# NOTE(review): agency_oris (one row per ori) is assumed to be the frame that
# feeds the agencies table - confirm before loading it into the database.
# index=False: agency_oris carries a plain positional index, and ori is already
# a column, so the index is not written.
agency_oris.to_csv('data/agencies.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

### Table: population_groups

In [60]:
# Create dataframe for population groups and rename index
pop_groups_df = crime_data_df.loc[:, ['population_group_code', 'population_group_description']].groupby('population_group_code').first()
pop_groups_final_df = pop_groups_df.rename(columns={'population_group_description': 'population_group'})

# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - population_group: {pop_groups_final_df['population_group'].map(lambda x: len(x)).max()}")

pop_groups_final_df

Max column size - population_group: 67


Unnamed: 0_level_0,population_group
population_group_code,Unnamed: 1_level_1
0,"Possessions (Puerto Rico, Guam, Virgin Islands..."
2,"Cities from 100,000 thru 249,999"
3,"Cities from 50,000 thru 99,999"
4,"Cities from 25,000 thru 49,999"
5,"Cities from 10,000 thru 24,999"
6,"Cities from 2,500 thru 9,999"
7,"Cities under 2,500"
1A,"Cities 1,000,000 or over"
1B,"Cities from 500,000 thru 999,999"
1C,"Cities from 250,000 thru 499,999"


In [20]:
# Export dataframe to csv 
# Export the renamed frame so the csv header is 'population_group',
# matching how the states table exports its renamed dataframe
pop_groups_final_df.to_csv('data/population_groups.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: race

In [62]:
# Create dataframe for race and rename index
# Build the race lookup: one row per distinct offender_race value,
# with the positional index named race_id
race_df = pd.DataFrame(crime_data_df['offender_race'].unique(), columns=['race'])
race_df.index.name = 'race_id'

# Longest race string, printed to help size the database column
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - race: {race_df['race'].map(len).max()}")

race_df

Max column size - agency_type: 41


Unnamed: 0_level_0,race
race_id,Unnamed: 1_level_1
0,White
1,Unknown
2,Multiple
3,Asian
4,American Indian or Alaska Native
5,Black or African American
6,Native Hawaiian or Other Pacific Islander
7,Not Specified


In [63]:
# Organize and clean up dataframe
# Order rows to match US census ordering.
# Rows are placed by race value, not by the positional ids that unique()
# produced, so the result does not depend on row order in the source data and
# this cell gives the same table every time it is run.
census_race_order = [
    'White',
    'Black or African American',
    'American Indian or Alaska Native',
    'Asian',
    'Native Hawaiian or Other Pacific Islander',
    'Multiple',
    'Unknown or Not Specified',
]

# 'Unknown' and 'Not Specified' in the source data are merged into the single
# 'Unknown or Not Specified' row; any other unlisted value is an error so that
# a new category cannot be dropped silently.
expected_source_labels = set(census_race_order[:-1]) | {'Unknown', 'Not Specified'}
unexpected_labels = set(crime_data_df['offender_race'].dropna().unique()) - expected_source_labels
if unexpected_labels:
    raise ValueError(f"Unexpected offender_race values: {sorted(unexpected_labels)}")

race_df = pd.DataFrame({'race': census_race_order})
race_df.index.name = 'race_id'
race_df

Unnamed: 0_level_0,race
race_id,Unnamed: 1_level_1
0,White
1,Black or African American
2,American Indian or Alaska Native
3,Asian
4,Native Hawaiian or Other Pacific Islander
5,Multiple
6,Unknown or Not Specified


In [64]:
# Export dataframe to csv 
race_df.to_csv('data/race.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv
