# Extract, Transform and Load data

In [2]:
# Import dependencies
import pandas as pd
import numpy as np

### Extract the hate crime data

In [3]:
# Read the data into a Pandas DataFrame
hcrime_df = pd.read_csv('../resources/fbi_hate_crime_data.csv')
hcrime_df.head()

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S


In [4]:
hcrime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226328 entries, 0 to 226327
Data columns (total 28 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   incident_id                   226328 non-null  int64  
 1   data_year                     226328 non-null  int64  
 2   ori                           226328 non-null  object 
 3   pug_agency_name               226328 non-null  object 
 4   pub_agency_unit               6563 non-null    object 
 5   agency_type_name              226328 non-null  object 
 6   state_abbr                    226328 non-null  object 
 7   state_name                    226328 non-null  object 
 8   division_name                 226328 non-null  object 
 9   region_name                   226328 non-null  object 
 10  population_group_code         225924 non-null  object 
 11  population_group_description  225924 non-null  object 
 12  incident_date                 226328 non-nul

In [5]:
# Remove data before 2009
crime_data_df = hcrime_df[hcrime_df['data_year'] > 2008]

# Display sample data
crime_data_df.head()

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
136923,141003,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,White,Not Specified,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136924,141004,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Unknown,Not Specified,1,Intimidation,1.0,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,S,S
136925,141005,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,White,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136926,141006,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Multiple,Not Specified,4,Aggravated Assault,4.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136927,141007,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Asian,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S


In [6]:
filtered_df = crime_data_df[crime_data_df["multiple_bias"]=="M"]
filtered_df["bias_desc"] =filtered_df["bias_desc"].str.split(';')
explode_df = filtered_df.explode("bias_desc", ignore_index=True)
explode_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["bias_desc"] =filtered_df["bias_desc"].str.split(';')


Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,137625,2009,CA0411300,Redwood City,,City,CA,California,Pacific,West,...,Unknown,Not Specified,2,Destruction/Damage/Vandalism of Property;Intim...,1.0,Residence/Home,Anti-Black or African American,Individual,M,M
1,137625,2009,CA0411300,Redwood City,,City,CA,California,Pacific,West,...,Unknown,Not Specified,2,Destruction/Damage/Vandalism of Property;Intim...,1.0,Residence/Home,"Anti-Multiple Races, Group",Individual,M,M
2,141419,2009,CO0070400,Longmont,,City,CO,Colorado,Mountain,West,...,White,Not Specified,2,Burglary/Breaking & Entering;Destruction/Damag...,0.0,Restaurant,Anti-Hispanic or Latino,Business,M,M
3,141419,2009,CO0070400,Longmont,,City,CO,Colorado,Mountain,West,...,White,Not Specified,2,Burglary/Breaking & Entering;Destruction/Damag...,0.0,Restaurant,Anti-Other Race/Ethnicity/Ancestry,Business,M,M
4,137918,2009,CT0015500,West Hartford,,City,CT,Connecticut,New England,Northeast,...,White,Not Specified,2,Destruction/Damage/Vandalism of Property;Intim...,2.0,Residence/Home,Anti-Gay (Male),Individual,M,M
5,137918,2009,CT0015500,West Hartford,,City,CT,Connecticut,New England,Northeast,...,White,Not Specified,2,Destruction/Damage/Vandalism of Property;Intim...,2.0,Residence/Home,Anti-Other Race/Ethnicity/Ancestry,Individual,M,M
6,138428,2009,KY0470400,West Point,,City,KY,Kentucky,East South Central,South,...,White,Not Specified,4,Intimidation;Simple Assault,4.0,Highway/Road/Alley/Street/Sidewalk;Jail/Prison...,Anti-Asian,Individual,M,M
7,138428,2009,KY0470400,West Point,,City,KY,Kentucky,East South Central,South,...,White,Not Specified,4,Intimidation;Simple Assault,4.0,Highway/Road/Alley/Street/Sidewalk;Jail/Prison...,Anti-Black or African American,Individual,M,M
8,140343,2009,SC0130500,Pageland,,City,SC,South Carolina,South Atlantic,South,...,Unknown,Not Specified,2,Burglary/Breaking & Entering;Destruction/Damag...,2.0,Highway/Road/Alley/Street/Sidewalk;Residence/Home,Anti-Gay (Male),Individual,M,M
9,140343,2009,SC0130500,Pageland,,City,SC,South Carolina,South Atlantic,South,...,Unknown,Not Specified,2,Burglary/Breaking & Entering;Destruction/Damag...,2.0,Highway/Road/Alley/Street/Sidewalk;Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,M,M


In [7]:
explode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2411 entries, 0 to 2410
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   incident_id                   2411 non-null   int64  
 1   data_year                     2411 non-null   int64  
 2   ori                           2411 non-null   object 
 3   pug_agency_name               2411 non-null   object 
 4   pub_agency_unit               99 non-null     object 
 5   agency_type_name              2411 non-null   object 
 6   state_abbr                    2411 non-null   object 
 7   state_name                    2411 non-null   object 
 8   division_name                 2411 non-null   object 
 9   region_name                   2411 non-null   object 
 10  population_group_code         2358 non-null   object 
 11  population_group_description  2358 non-null   object 
 12  incident_date                 2411 non-null   object 
 13  adu

In [8]:
filtered_df2 = crime_data_df[crime_data_df["multiple_bias"]=="S"]
final_df = filtered_df2.merge(explode_df,how ='outer')

In [9]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90701 entries, 0 to 90700
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   incident_id                   90701 non-null  int64  
 1   data_year                     90701 non-null  int64  
 2   ori                           90701 non-null  object 
 3   pug_agency_name               90701 non-null  object 
 4   pub_agency_unit               2958 non-null   object 
 5   agency_type_name              90701 non-null  object 
 6   state_abbr                    90701 non-null  object 
 7   state_name                    90701 non-null  object 
 8   division_name                 90701 non-null  object 
 9   region_name                   90701 non-null  object 
 10  population_group_code         90270 non-null  object 
 11  population_group_description  90270 non-null  object 
 12  incident_date                 90701 non-null  object 
 13  a

In [10]:
# Export the DataFrame as a CSV file. 
final_df.to_csv("resources/hate_crime.csv", index=False)

## Creating Categories for Biases

In [10]:
Sexual_orientation = ['Anti-Gay (Male)','Anti-Lesbian (Female)','Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)','Anti-Bisexual','Anti-Heterosexual']
Religion = ['Anti-Jewish','Anti-Arab', 'Anti-Protestant', 'Anti-Other Religion','Anti-Islamic (Muslim)', 'Anti-Catholic', 'Anti-Multiple Religions, Group','Anti-Atheism/Agnosticism','Anti-Buddhist', 'Anti-Sikh', 'Anti-Other Christian', 'Anti-Hindu','Anti-Eastern Orthodox (Russian, Greek, Other)',"Anti-Jehovah's Witness", 'Anti-Church of Jesus Christ']
Race_Ethnicity_Ancestry = ['Anti-Black or African American', 'Anti-White','Anti-Arab','Anti-Asian','Anti-Hispanic or Latino', 'Anti-Multiple Races, Group','Anti-Other Race/Ethnicity/Ancestry','Anti-American Indian or Alaska Native','Anti-Native Hawaiian or Other Pacific Islander']
Gender_identity = ['Anti-Gender Non-Conforming','Anti-Transgender']
Gender = ['Anti-Female','Anti-Male']
Disability = ['Anti-Physical Disability', 'Anti-Mental Disability']

# len(Sexual_orientation)
# len(Religion)
# len(Race_Ethnicity_Ancestry)
# len(Gender_identity)
# len(Gender)
# len(Disability)

In [11]:
data = {'category': "Sexual_orientation", 'bias': Sexual_orientation}
sexual_orientation_df = pd.DataFrame(data, columns = ['category','bias'])
sexual_orientation_df

Unnamed: 0,category,bias
0,Sexual_orientation,Anti-Gay (Male)
1,Sexual_orientation,Anti-Lesbian (Female)
2,Sexual_orientation,"Anti-Lesbian, Gay, Bisexual, or Transgender (M..."
3,Sexual_orientation,Anti-Bisexual
4,Sexual_orientation,Anti-Heterosexual


In [12]:
data = {'category': "Religion", 'bias': Religion}
Religion_df = pd.DataFrame(data, columns = ['category','bias'])
Religion_df

Unnamed: 0,category,bias
0,Religion,Anti-Jewish
1,Religion,Anti-Arab
2,Religion,Anti-Protestant
3,Religion,Anti-Other Religion
4,Religion,Anti-Islamic (Muslim)
5,Religion,Anti-Catholic
6,Religion,"Anti-Multiple Religions, Group"
7,Religion,Anti-Atheism/Agnosticism
8,Religion,Anti-Buddhist
9,Religion,Anti-Sikh


In [13]:
data = {'category': "Race_Ethnicity_Ancestry", 'bias': Race_Ethnicity_Ancestry}
Race_Ethnicity_Ancestry_df = pd.DataFrame(data, columns = ['category','bias'])
Race_Ethnicity_Ancestry_df

Unnamed: 0,category,bias
0,Race_Ethnicity_Ancestry,Anti-Black or African American
1,Race_Ethnicity_Ancestry,Anti-White
2,Race_Ethnicity_Ancestry,Anti-Arab
3,Race_Ethnicity_Ancestry,Anti-Asian
4,Race_Ethnicity_Ancestry,Anti-Hispanic or Latino
5,Race_Ethnicity_Ancestry,"Anti-Multiple Races, Group"
6,Race_Ethnicity_Ancestry,Anti-Other Race/Ethnicity/Ancestry
7,Race_Ethnicity_Ancestry,Anti-American Indian or Alaska Native
8,Race_Ethnicity_Ancestry,Anti-Native Hawaiian or Other Pacific Islander


In [14]:
data = {'category': "Gender_identity", 'bias': Gender_identity}
Gender_identity_df = pd.DataFrame(data, columns = ['category','bias'])
Gender_identity_df

Unnamed: 0,category,bias
0,Gender_identity,Anti-Gender Non-Conforming
1,Gender_identity,Anti-Transgender


In [15]:
data = {'category': "Gender", 'bias': Gender}
Gender_df = pd.DataFrame(data, columns = ['category','bias'])
Gender_df

Unnamed: 0,category,bias
0,Gender,Anti-Female
1,Gender,Anti-Male


In [16]:
data = {'category': "Disability", 'bias': Disability}
Disability_df = pd.DataFrame(data, columns = ['category','bias'])
Disability_df

Unnamed: 0,category,bias
0,Disability,Anti-Physical Disability
1,Disability,Anti-Mental Disability


In [17]:
bias_df = sexual_orientation_df.merge(Religion_df,how ='outer').merge(Race_Ethnicity_Ancestry_df, how ="outer").merge(Gender_identity_df, how ="outer").merge(Gender_df, how ="outer").merge(Disability_df, how ="outer")

In [18]:
categories = bias_df["category"].unique().tolist()
# Create dataframe for category
category_df = pd.DataFrame(categories, columns=['category'])
category_df.index.name = 'category_id'
category_df

Unnamed: 0_level_0,category
category_id,Unnamed: 1_level_1
0,Sexual_orientation
1,Religion
2,Race_Ethnicity_Ancestry
3,Gender_identity
4,Gender
5,Disability


In [20]:
#Saving to csv file
category_df.to_csv("C:/Users/tnord/OneDrive/Desktop/Bootcamp_Projects/Group_Project_3/Project-3/etl/data/bias_categories.csv", index=True)

## Biases table

In [22]:
# Read the data into a Pandas DataFrame
categories= pd.read_csv('data/bias_categories.csv')
categories

Unnamed: 0,category_id,category
0,0,Sexual_orientation
1,1,Religion
2,2,Race_Ethnicity_Ancestry
3,3,Gender_identity
4,4,Gender
5,5,Disability


In [23]:
bias = final_df["bias_desc"].unique().tolist()

In [24]:
# Create dataframe for bias
biases_df = pd.DataFrame(bias, columns=['bias_desc'])
biases_df.index.name = 'bias_id'
biases_df

Unnamed: 0_level_0,bias_desc
bias_id,Unnamed: 1_level_1
0,Anti-American Indian or Alaska Native
1,"Anti-Lesbian, Gay, Bisexual, or Transgender (M..."
2,Anti-White
3,Anti-Lesbian (Female)
4,Anti-Black or African American
5,"Anti-Multiple Races, Group"
6,Anti-Gay (Male)
7,Anti-Jewish
8,Anti-Protestant
9,Anti-Bisexual


In [25]:
# Merge dataframes to create bias table with bias_id, bias, category_id
bias = biases_df.merge( bias_df,left_on  = "bias_desc",right_on ="bias", how = 'right').merge(categories,on  = "category", how = 'right')
bias = bias[['bias_desc','category_id']]
bias.index.name = 'bias_id'
bias

Unnamed: 0_level_0,bias_desc,category_id
bias_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Anti-Gay (Male),0
1,Anti-Lesbian (Female),0
2,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",0
3,Anti-Bisexual,0
4,Anti-Heterosexual,0
5,Anti-Jewish,1
6,Anti-Arab,1
7,Anti-Protestant,1
8,Anti-Other Religion,1
9,Anti-Islamic (Muslim),1


In [27]:
#Saving to csv file
bias.to_csv("data/bias.csv", index=True)

### Incidents and locations

In [28]:
crime_data_df["location_name"] =crime_data_df["location_name"].str.split(';')
explode_df = crime_data_df.explode("location_name", ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_data_df["location_name"] =crime_data_df["location_name"].str.split(';')


In [29]:
loc_df = explode_df["location_name"].unique()
# Create dataframe for bias
loc_df = pd.DataFrame(loc_df, columns=['location_name'])
loc_df.index.name = 'location_id'
loc_df

Unnamed: 0_level_0,location_name
location_id,Unnamed: 1_level_1
0,Highway/Road/Alley/Street/Sidewalk
1,Residence/Home
2,Restaurant
3,Convenience Store
4,Church/Synagogue/Temple/Mosque
5,Field/Woods
6,Commercial/Office Building
7,Other/Unknown
8,Parking/Drop Lot/Garage
9,Liquor Store


In [30]:
#Saving to csv file
loc_df.to_csv("data/location.csv", index=True)

## Incident locations

In [31]:
# Read the data into a Pandas DataFrame
location_df = pd.read_csv('data/location.csv')
location_df.head()

Unnamed: 0,location_id,location_name
0,0,Highway/Road/Alley/Street/Sidewalk
1,1,Residence/Home
2,2,Restaurant
3,3,Convenience Store
4,4,Church/Synagogue/Temple/Mosque


In [32]:
inci_df = explode_df[['incident_id','location_name']]

In [33]:
incident_locations = inci_df.merge( location_df,on  = "location_name", how = 'right')

In [34]:
incident_locations = incident_locations[["incident_id","location_id"]]
incident_locations.head()

Unnamed: 0,incident_id,location_id
0,141003,0
1,141005,0
2,141006,0
3,141007,0
4,141008,0


In [40]:
#Saving to csv file
incident_locations.to_csv("data/incident_location.csv", index=False)

## Incident Biases

In [37]:
# Read the data into a Pandas DataFrame
bias = pd.read_csv('data/bias.csv')
bias.head()

Unnamed: 0,bias_id,bias_desc,category_id
0,0,Anti-Gay (Male),0
1,1,Anti-Lesbian (Female),0
2,2,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",0
3,3,Anti-Bisexual,0
4,4,Anti-Heterosexual,0


In [38]:
incident = final_df[['incident_id','bias_desc']]
incident_biases = incident.merge( bias, on  = "bias_desc", how = 'right')
incident_biases = incident_biases[["incident_id","bias_id"]]
incident_biases.head()

Unnamed: 0,incident_id,bias_id
0,136967,0
1,137125,0
2,137131,0
3,137151,0
4,137156,0


In [39]:
#Saving to csv file
incident_biases.to_csv("data/incident_biases.csv", index=False)