# Hate Crime Data Extraction and Transformation

1. Create dataframe from FBI hate crimes spreadsheet and explore
2. Create dataframes for needed database tables and export to csv files 

In [1]:
# Dependencies and Setup
import pandas as pd
import requests
from pathlib import Path

## Create/Explore FBI Hate Crime Dataframe

In [3]:
# Path to hate crime data
path = Path('../resources/fbi_hate_crime_data.csv')

# Load hate crime data
crime_data_df = pd.read_csv(path, index_col=0)

# Display sample data
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,8D,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S


In [4]:
crime_data_df = crime_data_df[crime_data_df['data_year'] > 2008]
crime_data_df.head()

Unnamed: 0_level_0,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141003,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141004,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Unknown,Not Specified,1,Intimidation,1.0,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,S,S
141005,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,White,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141006,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Multiple,Not Specified,4,Aggravated Assault,4.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
141007,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,1C,...,Asian,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S


In [5]:
crime_data_df.dtypes

data_year                         int64
ori                              object
pug_agency_name                  object
pub_agency_unit                  object
agency_type_name                 object
state_abbr                       object
state_name                       object
division_name                    object
region_name                      object
population_group_code            object
population_group_description     object
incident_date                    object
adult_victim_count              float64
juvenile_victim_count           float64
total_offender_count              int64
adult_offender_count            float64
juvenile_offender_count         float64
offender_race                    object
offender_ethnicity               object
victim_count                      int64
offense_name                     object
total_individual_victims        float64
location_name                    object
bias_desc                        object
victim_types                     object


## Create CSV Files for Database Tables

### Table: agency_types

In [6]:
# Create dataframe for agency_types and rename index
agency_types_df = pd.DataFrame(crime_data_df['agency_type_name'].unique(), columns=['agency_type'])
agency_types_df.index.name = 'agency_type_id'

# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - agency_type: {agency_types_df['agency_type'].map(lambda x: len(x)).max()}")

agency_types_df

Max column size - agency_type: 21


Unnamed: 0_level_0,agency_type
agency_type_id,Unnamed: 1_level_1
0,City
1,County
2,University or College
3,Other
4,Other State Agency
5,State Police
6,Tribal
7,Federal


In [7]:
# Export agency_types dataframe to csv 
agency_types_df.to_csv('data/agency_types.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: states

In [7]:
# # Create dataframe for states
states_df = crime_data_df.loc[:, ['state_abbr', 'state_name', 'division_name', 'region_name']].groupby('state_abbr').first()
states_renamed_df = states_df.rename(columns={'state_name': 'state', 'division_name': 'division', 'region_name': 'region'})

# Print maximum size of each column for database schema
print(f"Max column size - state_name: {states_df['state_name'].map(lambda x: len(x)).max()}")
print(f"Max column size - division_name: {states_df['division_name'].map(lambda x: len(x)).max()}")
print(f"Max column size - region_name: {states_df['region_name'].map(lambda x: len(x)).max()}")

# Display sample data
states_renamed_df.head()

Max column size - state_name: 20
Max column size - division_name: 18
Max column size - region_name: 16


Unnamed: 0_level_0,state,division,region
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,Pacific,West
AL,Alabama,East South Central,South
AR,Arkansas,West South Central,South
AZ,Arizona,Mountain,West
CA,California,Pacific,West


In [71]:
# Export states dataframe to csv 
states_renamed_df.to_csv('data/states.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: agencies

In [8]:
# Create agencies csv
# Code Reference for as_index:
# https://stackoverflow.com/questions/21767900/how-to-move-pandas-data-from-index-to-column-after-multiple-groupby
agencies_df = crime_data_df.loc[:, ['pug_agency_name', 'pub_agency_unit', 'agency_type_name']].groupby('pug_agency_name', as_index=False, dropna=False).first()
agency_oris = crime_data_df.loc[:, ['ori', 'pug_agency_name', 'pub_agency_unit', 'agency_type_name']].groupby('ori', as_index=False, dropna=False).first()
agency_units = crime_data_df.loc[:, ['ori', 'pug_agency_name', 'pub_agency_unit', 'agency_type_name']].groupby(['pug_agency_name', 'pub_agency_unit'], as_index=False, dropna=False).first()
test = agency_units.sort_values(by=['ori'])

# Print maximum size of each column for database schema
print(len(agencies_df))
print(len(agency_oris))
print(len(agency_units))

# Display sample data

#test = agency_units[agency_units['agency_type_name'] == 'City'].sort_values(by='ori')
test = agency_oris[agency_oris['agency_type_name'] == 'City'].sort_values(by='ori')
print(len(test))
test.head(20)

5313
7669
5780
5123


Unnamed: 0,ori,pug_agency_name,pub_agency_unit,agency_type_name
0,AK0010100,Anchorage,,City
1,AK0010200,Fairbanks,,City
2,AK0010300,Juneau,,City
3,AK0010600,Nome,,City
4,AK0010700,Petersburg,,City
5,AK0010800,Seward,,City
6,AK0011300,Bethel,,City
7,AK0011600,Kotzebue,,City
8,AK0011700,Palmer,,City
9,AK0012000,Soldotna,,City


In [None]:
# Export dataframe to csv 
states_df.to_csv('data/states.csv')

# Confirm that export completed
print('Dataframe exported to csv')

### Table: population_groups

In [9]:
# Create dataframe for population groups and rename index
pop_groups_df = crime_data_df.loc[:, ['population_group_code', 'population_group_description']].groupby('population_group_code').first()
pop_groups_final_df = pop_groups_df.rename(columns={'population_group_description': 'population_group'})

# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - population_group: {pop_groups_final_df['population_group'].map(lambda x: len(x)).max()}")

pop_groups_final_df

Max column size - population_group: 67


Unnamed: 0_level_0,population_group
population_group_code,Unnamed: 1_level_1
0,"Possessions (Puerto Rico, Guam, Virgin Islands..."
2,"Cities from 100,000 thru 249,999"
3,"Cities from 50,000 thru 99,999"
4,"Cities from 25,000 thru 49,999"
5,"Cities from 10,000 thru 24,999"
6,"Cities from 2,500 thru 9,999"
7,"Cities under 2,500"
1A,"Cities 1,000,000 or over"
1B,"Cities from 500,000 thru 999,999"
1C,"Cities from 250,000 thru 499,999"


In [20]:
# Export dataframe to csv 
pop_groups_df.to_csv('data/population_groups.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: race

In [10]:
# Create dataframe for race and rename index
race_df = pd.DataFrame(crime_data_df['offender_race'].unique(), columns=['race'])
race_df.index.name = 'race_id'
race_df

# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - race_id: {race_df['race'].map(lambda x: len(x)).max()}")

race_df

Max column size - race_id: 41


Unnamed: 0_level_0,race
race_id,Unnamed: 1_level_1
0,White
1,Unknown
2,Multiple
3,Asian
4,American Indian or Alaska Native
5,Black or African American
6,Native Hawaiian or Other Pacific Islander
7,Not Specified


In [11]:
# Organize and clean up dataframe 
# Order rows to match US census ordering
race_df = race_df.rename(index={5:1, 4: 2, 6: 4, 2: 5, 1: 6}).sort_index()
race_df.loc[6] = 'Unknown or Not Specified'
race_df = race_df.drop([7])
race_df

Unnamed: 0_level_0,race
race_id,Unnamed: 1_level_1
0,White
1,Black or African American
2,American Indian or Alaska Native
3,Asian
4,Native Hawaiian or Other Pacific Islander
5,Multiple
6,Unknown or Not Specified


In [64]:
# Export dataframe to csv 
race_df.to_csv('data/race.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: Victim type

In [130]:
# Create dataframe with index victim_type
victim_types_df = pd.DataFrame(crime_data_df['victim_types'].str.split(';')).explode('victim_types')
victim_types_df = victim_types_df.rename(columns={'victim_types': 'victim_type'})
victim_types_df = victim_types_df['victim_type'].unique()
victim_types_df = pd.DataFrame(victim_types_df, columns=['victim_type'])
victim_types_df.index.name = 'victim_type_id'
#victim_types_df = victim_types_df.rename(index={5:7, 7: 5, 6: 8, 8: 6}).sort_index()
#victim_types_df.loc[7] = 'Other or Unknown' 
#victim_types_df = victim_types_df.drop([8])
victim_types_df

Unnamed: 0_level_0,victim_type
victim_type_id,Unnamed: 1_level_1
0,Individual
1,Religious Organization
2,Government
3,Society/Public
4,Business
5,Other
6,Unknown
7,Financial Institution
8,Law Enforcement Officer


In [131]:
# Export dataframe to csv 
victim_types_df.to_csv('data/victim_types.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: Incident Victim Types

In [8]:
# Read victim_type csv to DataFrame
victim_type_df = pd.read_csv('data/victim_types.csv')
victim_type_df.head()

Unnamed: 0,victim_type_id,victim_type
0,0,Individual
1,1,Religious Organization
2,2,Government
3,3,Society/Public
4,4,Business


In [9]:
incident_victim_type = pd.DataFrame(crime_data_df['victim_types'].str.split(';')).explode('victim_types').rename(columns={'victim_types': 'victim_type'})
incident_victim_type['incident_id'] = incident_victim_type.index
incident_victim_type

Unnamed: 0_level_0,victim_type,incident_id
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1
141003,Individual,141003
141004,Individual,141004
141005,Individual,141005
141006,Individual,141006
141007,Individual,141007
...,...,...
1445630,Individual,1445630
1448544,Individual,1448544
1448545,Individual,1448545
1448546,Business,1448546


In [10]:
incident_victim_composite = incident_victim_type.merge(victim_type_df,on = 'victim_type', how = 'left')
incident_victim_composite = incident_victim_composite[['incident_id', 'victim_type_id']]
incident_victim_composite

Unnamed: 0,incident_id,victim_type_id
0,141003,0
1,141004,0
2,141005,0
3,141006,0
4,141007,0
...,...,...
90945,1445630,0
90946,1448544,0
90947,1448545,0
90948,1448546,4


In [11]:
incident_victim_composite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90950 entries, 0 to 90949
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   incident_id     90950 non-null  int64
 1   victim_type_id  90950 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [12]:
# Export dataframe to csv 
incident_victim_composite.to_csv('data/incident_victim_types.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Table: incidents

In [13]:
# create DataFrame and drop unneeded columns
incident_df = pd.DataFrame(crime_data_df)
incident_df['incident_id'] = incident_df.index
incident_df = incident_df[['incident_id', 'offender_race', 'pug_agency_name', 'ori', 'state_abbr', 'population_group_code', 'incident_date', 
                           'adult_victim_count', 'juvenile_victim_count','total_offender_count', 'adult_offender_count',
                           'juvenile_offender_count', 'victim_count', 'total_individual_victims']]

#in offender_race column change 'Unknown' and 'Not Specified' values to 'Unknown or Not Specified'
replace_values = {'Unknown': 'Unknown or Not Specified', 'Not Specified': 'Unknown or Not Specified'}
incident_df['offender_race'] = incident_df['offender_race'].replace(replace_values)
incident_df = incident_df.rename(columns={'pug_agency_name': 'agency'})
incident_df.head(10)

Unnamed: 0_level_0,incident_id,offender_race,agency,ori,state_abbr,population_group_code,incident_date,adult_victim_count,juvenile_victim_count,total_offender_count,adult_offender_count,juvenile_offender_count,victim_count,total_individual_victims
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
141003,141003,White,Anchorage,AK0010100,AK,1C,2009-02-17,,,1,,,1,1.0
141004,141004,Unknown or Not Specified,Anchorage,AK0010100,AK,1C,2009-06-30,,,0,,,1,1.0
141005,141005,White,Anchorage,AK0010100,AK,1C,2009-08-05,,,2,,,1,1.0
141006,141006,Multiple,Anchorage,AK0010100,AK,1C,2009-08-28,,,3,,,4,4.0
141007,141007,Asian,Anchorage,AK0010100,AK,1C,2009-08-28,,,2,,,1,1.0
141008,141008,White,Anchorage,AK0010100,AK,1C,2009-11-07,,,1,,,1,1.0
141009,141009,Multiple,Anchorage,AK0010100,AK,1C,2009-12-16,,,4,,,1,1.0
141010,141010,American Indian or Alaska Native,Bethel,AK0011300,AK,6,2009-05-04,,,1,,,1,1.0
141011,141011,Multiple,Bethel,AK0011300,AK,6,2009-10-05,,,2,,,1,1.0
136965,136965,White,Hoover,AL0011200,AL,3,2009-03-06,,,1,,,2,2.0


In [14]:
race_df = pd.read_csv('data/race.csv')
race_df = race_df.rename(columns={'race_id': 'offender_race_id',
                                  'race': 'offender_race'})
race_df

Unnamed: 0,offender_race_id,offender_race
0,0,White
1,1,Black or African American
2,2,American Indian or Alaska Native
3,3,Asian
4,4,Native Hawaiian or Other Pacific Islander
5,5,Multiple
6,6,Unknown or Not Specified
7,-1,All Races


In [15]:
merged_incident_df = incident_df.merge(race_df, on='offender_race', how = 'left')
merged_incident_df

Unnamed: 0,incident_id,offender_race,agency,ori,state_abbr,population_group_code,incident_date,adult_victim_count,juvenile_victim_count,total_offender_count,adult_offender_count,juvenile_offender_count,victim_count,total_individual_victims,offender_race_id
0,141003,White,Anchorage,AK0010100,AK,1C,2009-02-17,,,1,,,1,1.0,0
1,141004,Unknown or Not Specified,Anchorage,AK0010100,AK,1C,2009-06-30,,,0,,,1,1.0,6
2,141005,White,Anchorage,AK0010100,AK,1C,2009-08-05,,,2,,,1,1.0,0
3,141006,Multiple,Anchorage,AK0010100,AK,1C,2009-08-28,,,3,,,4,4.0,5
4,141007,Asian,Anchorage,AK0010100,AK,1C,2009-08-28,,,2,,,1,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89400,1445214,White,Cody,WY0150100,WY,6,2021-07-01,0.0,1.0,1,1.0,0.0,1,1.0,0
89401,1445630,Unknown or Not Specified,Riverton,WY0070200,WY,5,2021-09-11,1.0,0.0,0,,,1,1.0,6
89402,1448544,White,Green River,WY0190100,WY,5,2021-01-12,0.0,1.0,1,0.0,1.0,1,1.0,0
89403,1448545,White,Park,WY0150000,WY,8C,2021-10-06,1.0,0.0,1,1.0,0.0,1,1.0,0


In [16]:
agencies_df = pd.read_csv('data/agencies.csv')
agencies_df

Unnamed: 0,agency_id,agency
0,0,Anchorage
1,1,Bethel
2,2,Hoover
3,3,Mobile
4,4,Leesburg
...,...,...
5308,5308,Fall Creek
5309,5309,Monona
5310,5310,Two Rivers
5311,5311,Ronceverte


In [17]:
merged_incident_df = merged_incident_df.merge(agencies_df, on='agency', how = 'left')
merged_incident_df

Unnamed: 0,incident_id,offender_race,agency,ori,state_abbr,population_group_code,incident_date,adult_victim_count,juvenile_victim_count,total_offender_count,adult_offender_count,juvenile_offender_count,victim_count,total_individual_victims,offender_race_id,agency_id
0,141003,White,Anchorage,AK0010100,AK,1C,2009-02-17,,,1,,,1,1.0,0,0
1,141004,Unknown or Not Specified,Anchorage,AK0010100,AK,1C,2009-06-30,,,0,,,1,1.0,6,0
2,141005,White,Anchorage,AK0010100,AK,1C,2009-08-05,,,2,,,1,1.0,0,0
3,141006,Multiple,Anchorage,AK0010100,AK,1C,2009-08-28,,,3,,,4,4.0,5,0
4,141007,Asian,Anchorage,AK0010100,AK,1C,2009-08-28,,,2,,,1,1.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89400,1445214,White,Cody,WY0150100,WY,6,2021-07-01,0.0,1.0,1,1.0,0.0,1,1.0,0,4737
89401,1445630,Unknown or Not Specified,Riverton,WY0070200,WY,5,2021-09-11,1.0,0.0,0,,,1,1.0,6,1743
89402,1448544,White,Green River,WY0190100,WY,5,2021-01-12,0.0,1.0,1,0.0,1.0,1,1.0,0,1744
89403,1448545,White,Park,WY0150000,WY,8C,2021-10-06,1.0,0.0,1,1.0,0.0,1,1.0,0,338
