---
## ETL Project: Fatal Police Shootings
---
##### Authors: 
 - Cecilia Acosta
 - Luis Bonilla
 - Felipe Murillo
##### Created: June 16, 2020
###### Data Source: https://www.kaggle.com/kwullum/fatal-police-shootings-in-the-us/
##### Data Description: 
 - The Washington Post tracked information about fatal police shottings in the United States. It tracks data about specific incidents: description of victim, threat level, victim mental stability, if the suspect fled the scene of the crime, and if there was a body cam availble to record the incident.
 - To learn more about where these incidents occur, US census data on poverty rate, high school graduation rate, median household income, and racial demographics (by U.S. city) is provided
----

#### Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import os
from sqlalchemy import create_engine,func,inspect

# Request PID to log into Postgres
from config import POSTGRES_PASSWORD, POSTGRES_USERNAME

#### Map Data Files/Locations

In [2]:
# Provide data names and locations
medHouseIncome = os.path.join(".","data/MedianHouseholdIncome2015.csv")
#
povLevel = os.path.join(".","data/PercentagePeopleBelowPovertyLevel.csv")
#
highSchool = os.path.join(".","data/PercentOver25CompletedHighSchool.csv")
#
raceByCity = os.path.join(".","data/ShareRaceByCity.csv")
#
policeKillings = os.path.join(".","data/PoliceKillingsUS.csv")


### Extract data and Transform

#### Import Police Killings Data

In [3]:
# Import police killings CSV into a dataframe
policeKillings_df = pd.read_csv(policeKillings,encoding = "ISO-8859-1")

# Display dataframe
policeKillings_df.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


#### Import/transform high school completion rate data & create city/state table

In [4]:
# Import high school completion rates by city (CSV)
highSchool_df = pd.read_csv(highSchool,encoding = "ISO-8859-1")

highSchool_df.head()

Unnamed: 0,Geographic Area,City,percent_completed_hs
0,AL,Abanda CDP,21.2
1,AL,Abbeville city,69.1
2,AL,Adamsville city,78.9
3,AL,Addison town,81.4
4,AL,Akron town,68.6


In [5]:
# Create city & state table

# Initialize dataframe
city_df = pd.DataFrame(columns = ['City','Geographic Area','Combo'])

# Populate city name and state
city_df["City"] = highSchool_df["City"]
city_df["Geographic Area"] = highSchool_df["Geographic Area"]

# Combine city/state into a single string to give this combo a unique identifier
# Note: this is done because many states have cities with the same name
city_df['Combo']= city_df["City"] +" "+ city_df["Geographic Area"]
city_df.head(3)

Unnamed: 0,City,Geographic Area,Combo
0,Abanda CDP,AL,Abanda CDP AL
1,Abbeville city,AL,Abbeville city AL
2,Adamsville city,AL,Adamsville city AL


In [6]:
# Reset the index
city_df = city_df.reset_index()

# Rename the index to city_id
city_df = city_df.rename(columns={"index":"city_id"})

# Display City table
city_df.head(3)

Unnamed: 0,city_id,City,Geographic Area,Combo
0,0,Abanda CDP,AL,Abanda CDP AL
1,1,Abbeville city,AL,Abbeville city AL
2,2,Adamsville city,AL,Adamsville city AL


In [7]:
# Merge city and state for highSchool_df to faciliate merge with city table
highSchool_df['Combo']= highSchool_df["City"] +" "+ highSchool_df["Geographic Area"]

# Combine tables by city/state combos
m_highSchool_df=highSchool_df.merge(city_df, left_on="Combo", right_on="Combo")

In [8]:
# Show merged table
m_highSchool_df.head(3)

Unnamed: 0,Geographic Area_x,City_x,percent_completed_hs,Combo,city_id,City_y,Geographic Area_y
0,AL,Abanda CDP,21.2,Abanda CDP AL,0,Abanda CDP,AL
1,AL,Abbeville city,69.1,Abbeville city AL,1,Abbeville city,AL
2,AL,Adamsville city,78.9,Adamsville city AL,2,Adamsville city,AL


In [9]:
# Pull needed data only
new_highSchool_df = m_highSchool_df[["city_id","percent_completed_hs"]]

# Set city_id to index
new_highSchool_df = new_highSchool_df.set_index("city_id")

# Display new data frame
new_highSchool_df.head(3)

Unnamed: 0_level_0,percent_completed_hs
city_id,Unnamed: 1_level_1
0,21.2
1,69.1
2,78.9


#### Import/transform poverty rate data

In [10]:
# Import poverty rate by city CSV into dataframe
povLevel_df = pd.read_csv(povLevel,encoding = "ISO-8859-1")

povLevel_df.head()

Unnamed: 0,Geographic Area,City,poverty_rate
0,AL,Abanda CDP,78.8
1,AL,Abbeville city,29.1
2,AL,Adamsville city,25.5
3,AL,Addison town,30.7
4,AL,Akron town,42.0


In [11]:
# Merge city and state for highSchool_df to faciliate merge with city table
povLevel_df['Combo']= povLevel_df["City"] +" "+ povLevel_df["Geographic Area"]

# Combine tables by city/state combos
m_povLevel_df=povLevel_df.merge(city_df, left_on="Combo", right_on="Combo")

In [12]:
# Only select desired data
new_povLevel_df = m_povLevel_df[["city_id","poverty_rate"]]

# Set city_id to index
new_povLevel_df = new_povLevel_df.set_index("city_id")

# Display new data frame
new_povLevel_df.head()

Unnamed: 0_level_0,poverty_rate
city_id,Unnamed: 1_level_1
0,78.8
1,29.1
2,25.5
3,30.7
4,42.0


#### Import/transform median income by city

In [13]:
# Import median income by city CSV into dataframe
medHouseIncome_df = pd.read_csv(medHouseIncome,encoding = "ISO-8859-1")

medHouseIncome_df

Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda CDP,11207
1,AL,Abbeville city,25615
2,AL,Adamsville city,42575
3,AL,Addison town,37083
4,AL,Akron town,21667
...,...,...,...
29317,WY,Woods Landing-Jelm CDP,
29318,WY,Worland city,41523
29319,WY,Wright town,77114
29320,WY,Yoder town,37500


In [14]:
# Merge city and state for highSchool_df to faciliate merge with city table
medHouseIncome_df['Combo']= medHouseIncome_df["City"] +" "+ medHouseIncome_df["Geographic Area"]

# Combine tables by city/state combos
m_medHouseIncome_df=medHouseIncome_df.merge(city_df, how="left", left_on="Combo", right_on="Combo")

In [15]:
# We have 2 NaN Values
m_medHouseIncome_df.sort_values("city_id")

Unnamed: 0,Geographic Area_x,City_x,Median Income,Combo,city_id,City_y,Geographic Area_y
0,AL,Abanda CDP,11207,Abanda CDP AL,0.0,Abanda CDP,AL
1,AL,Abbeville city,25615,Abbeville city AL,1.0,Abbeville city,AL
2,AL,Adamsville city,42575,Adamsville city AL,2.0,Adamsville city,AL
3,AL,Addison town,37083,Addison town AL,3.0,Addison town,AL
4,AL,Akron town,21667,Akron town AL,4.0,Akron town,AL
...,...,...,...,...,...,...,...
29319,WY,Wright town,77114,Wright town WY,29326.0,Wright town,WY
29320,WY,Yoder town,37500,Yoder town WY,29327.0,Yoder town,WY
29321,WY,Y-O Ranch CDP,,Y-O Ranch CDP WY,29328.0,Y-O Ranch CDP,WY
5676,HI,Hawaii,69515,Hawaii HI,,,


In [16]:
# Drop entries with NAN city_id's
m_medHouseIncome_df= m_medHouseIncome_df.dropna(axis=0, subset=['city_id'])

In [17]:
# Select desired data only
new_medHouseIncome_df = m_medHouseIncome_df[["city_id","Median Income"]]

# Convert city_id into integer
new_medHouseIncome_df["city_id"]= new_medHouseIncome_df["city_id"].astype(np.int64)

# Rename columns
new_medHouseIncome_df = new_medHouseIncome_df.rename(columns={"Median Income":"median_income"})

# Set Index to city_id
new_medHouseIncome_df = new_medHouseIncome_df.set_index("city_id")

# Display dataframe
new_medHouseIncome_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,median_income
city_id,Unnamed: 1_level_1
0,11207
1,25615
2,42575
3,37083
4,21667
...,...
29324,
29325,41523
29326,77114
29327,37500


#### Import/transform city demographics

In [18]:
# Import race by city CSV into dataframe
raceByCity_df = pd.read_csv(raceByCity,encoding = "ISO-8859-1")
raceByCity_df

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,67.2,30.2,0,0,1.6
1,AL,Abbeville city,54.4,41.4,0.1,1,3.1
2,AL,Adamsville city,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,99.1,0.1,0,0.1,0.4
4,AL,Akron town,13.2,86.5,0,0,0.3
...,...,...,...,...,...,...,...
29263,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0
29264,WY,Worland city,89.9,0.3,1.3,0.6,16.6
29265,WY,Wright town,94.5,0.1,1.4,0.2,6.2
29266,WY,Yoder town,97.4,0,0,0,4


In [19]:
# Merge city and state for raceByCity_df to faciliate merge with city table
raceByCity_df['Combo']= raceByCity_df["City"] +" "+ raceByCity_df["Geographic area"]

In [20]:
# Combine tables by city/state combos
m_raceByCity_df=raceByCity_df.merge(city_df,how='left',left_on="Combo", right_on="Combo")

In [21]:
# Drop entries with NAN city_id's
m_raceByCity_df= m_raceByCity_df.dropna(axis=0, subset=['city_id'])

In [22]:
# Reorder dataframe
new_raceByCity_df = m_raceByCity_df[["city_id","share_white","share_black","share_native_american","share_asian","share_hispanic"]]

# Convert city_id into integer
new_raceByCity_df["city_id"]= new_raceByCity_df["city_id"].astype(np.int64)

# Rename columns to be more representative
new_raceByCity_df = new_raceByCity_df.rename(columns=
                                          {"share_white":"white",
                                           "share_black":"black",
                                           "share_native_american":"native_american",
                                           "share_asian":"asian",
                                           "share_hispanic":"hispanic"
                                          })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [23]:
# Set city_id to be index
new_raceByCity_df = new_raceByCity_df.set_index("city_id")

# Show new datframe
new_raceByCity_df.head()

Unnamed: 0_level_0,white,black,native_american,asian,hispanic
city_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,67.2,30.2,0.0,0.0,1.6
1,54.4,41.4,0.1,1.0,3.1
2,52.3,44.9,0.5,0.3,2.3
3,99.1,0.1,0.0,0.1,0.4
4,13.2,86.5,0.0,0.0,0.3


#### Transform city table

In [24]:
# Display City table
city_df.head(3)

Unnamed: 0,city_id,City,Geographic Area,Combo
0,0,Abanda CDP,AL,Abanda CDP AL
1,1,Abbeville city,AL,Abbeville city AL
2,2,Adamsville city,AL,Adamsville city AL


In [25]:
# Save information about city type: city, CDP, town or village
city_split = city_df["City"].str.rsplit(" ", 1, expand = True)
city_name = city_split[0]
city_type = city_split[1]

In [26]:
# Create city, city type, state table
city_df["city"] = city_name
city_df["city type"] = city_type

# Remove unwanted columns
del city_df["City"]
del city_df["Combo"]

# Set city_id as index
city_df=city_df.set_index("city_id")

In [27]:
# Rename column to something more meaningful
city_df= city_df.rename(columns={"Geographic Area":"state"})

In [28]:
# Display dataframe
city_df.head()

Unnamed: 0_level_0,state,city,city type
city_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AL,Abanda,CDP
1,AL,Abbeville,city
2,AL,Adamsville,city
3,AL,Addison,town
4,AL,Akron,town


#### Design the Entity-Relationship Diagram

![Relational Diagram](data/QuickDBD-Police_Killings.png)

#### Load data into PostgreSQL

In [29]:
# Define Postgres database name
POSTGRES_DBNAME = 'police_db'

# Setup connection to Postgres
postgres_str = ('postgresql://{username}:{password}@localhost:5432/{dbname}'.format(
    username=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    dbname=POSTGRES_DBNAME))

# Create the connection
engine = create_engine(postgres_str)

In [30]:
# Confirm database is empty tables
engine.table_names()

['race_by_city',
 'police_killings',
 'cities',
 'hs_completion_by_city',
 'poverty_level_by_city',
 'median_income_by_city']

In [43]:
# Create police killings table
policeKillings_df.to_sql(name="police_killings", con=engine, if_exists='append', index=False)

In [44]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM police_killings", con=engine).head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [45]:
# Create city table
city_df.to_sql(name="cities", con=engine, if_exists='append', index=True)

In [46]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM cities", con=engine).head()

Unnamed: 0,city_id,state,city,city type
0,0,AL,Abanda,CDP
1,1,AL,Abbeville,city
2,2,AL,Adamsville,city
3,3,AL,Addison,town
4,4,AL,Akron,town


In [47]:
# Create race by city table
new_raceByCity_df.to_sql(name="race_by_city", con=engine, if_exists='append', index=True)

In [48]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM race_by_city", con=engine).head()

Unnamed: 0,city_id,white,black,native_american,asian,hispanic
0,0,67.2,30.2,0.0,0.0,1.6
1,1,54.4,41.4,0.1,1.0,3.1
2,2,52.3,44.9,0.5,0.3,2.3
3,3,99.1,0.1,0.0,0.1,0.4
4,4,13.2,86.5,0.0,0.0,0.3


In [49]:
# Create high school completion table
new_highSchool_df.to_sql(name="hs_completion_by_city", con=engine, if_exists='append', index=True)

In [50]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM hs_completion_by_city", con=engine).head()

Unnamed: 0,city_id,percent_completed_hs
0,0,21.2
1,1,69.1
2,2,78.9
3,3,81.4
4,4,68.6


In [51]:
# Create high school completion table
new_povLevel_df.to_sql(name="poverty_level_by_city", con=engine, if_exists='append', index=True)

In [52]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM poverty_level_by_city", con=engine).head()

Unnamed: 0,city_id,poverty_rate
0,0,78.8
1,1,29.1
2,2,25.5
3,3,30.7
4,4,42.0


In [53]:
# Create median income by household per city table
new_medHouseIncome_df.to_sql(name="median_income_by_city", con=engine, if_exists='append', index=True)

In [54]:
# Confirm dataframe was copied obver to PostGres
pd.read_sql_query("SELECT * FROM median_income_by_city", con=engine).head()

Unnamed: 0,city_id,median_income
0,0,11207
1,1,25615
2,2,42575
3,3,37083
4,4,21667
