In [4]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from statistics import mean

# Study data files
census_metadata_path = "../ETL/ETL-Results/census_data.csv"
health_metadata_path = "../ETL/ETL-Results/four_mort_measures.csv"

# Read the mouse data and the study results
health_metadata = pd.read_csv(health_metadata_path)
census_metadata = pd.read_csv(census_metadata_path)

# Combine the data into a single dataset
race_data_df = pd.merge(health_metadata, census_metadata, how = "inner", right_on = ["State Abbr", "County Name"], 
                        left_on = ["State", "County Name"])

# Display the data table for preview
race_data_df.columns


Index(['Facility ID', 'Facility Name', 'Address', 'City', 'State_x',
       'ZIP Code', 'County Name', 'Measure ID', 'Measure Name', 'Denominator',
       'Score', 'Lower Estimate', 'Higher Estimate', 'Start Date', 'End Date',
       'State_y', 'State Abbr', 'Household Median Income',
       'Family's Median Income', 'Total Population', 'Percent Poverty',
       'Percent Veteran', 'Percent Married', 'Percent Bachelor',
       'Percent One Race White', 'Percent One Race Black+',
       'Percent One Race American Indian+', 'Percent One Race Asian',
       'Percent One Race Hawaiian+', 'Percent One Race Some Other',
       'Percent Two Race Or More', 'State Code', 'County Code'],
      dtype='object')

In [31]:
race_data_df = race_data_df.rename(columns={"State_y":"State", "Score": "Mortality Percentage"})
race_merged = race_data_df[['Facility Name', 'County Name','City', 'State Abbr', 'Total Population','Measure Name', 'Mortality Percentage','Percent One Race White', 'Percent One Race Black+',
       'Percent One Race American Indian+', 'Percent One Race Asian',
       'Percent One Race Hawaiian+', 'Percent One Race Some Other',
       'Percent Two Race Or More']]
race_merged.head()

Unnamed: 0,Facility Name,County Name,City,State Abbr,Total Population,Measure Name,Mortality Percentage,Percent One Race White,Percent One Race Black+,Percent One Race American Indian+,Percent One Race Asian,Percent One Race Hawaiian+,Percent One Race Some Other,Percent Two Race Or More
0,MISSION COMMUNITY HOSPITAL,LOS ANGELES,PANORAMA CITY,CA,10081570,Death rate for COPD patients,5.1,51.3,8.1,0.7,14.6,0.3,21.0,4.0
1,CEDARS-SINAI MEDICAL CENTER,LOS ANGELES,LOS ANGELES,CA,10081570,Death rate for COPD patients,5.2,51.3,8.1,0.7,14.6,0.3,21.0,4.0
2,CENTINELA HOSPITAL MEDICAL CENTER,LOS ANGELES,INGLEWOOD,CA,10081570,Death rate for heart failure patients,5.3,51.3,8.1,0.7,14.6,0.3,21.0,4.0
3,"L A DOWNTOWN MEDICAL CENTER, LLC",LOS ANGELES,LOS ANGELES,CA,10081570,Death rate for COPD patients,5.3,51.3,8.1,0.7,14.6,0.3,21.0,4.0
4,"WEST COVINA MEDICAL CENTER, INC",LOS ANGELES,WEST COVINA,CA,10081570,Death rate for COPD patients,5.5,51.3,8.1,0.7,14.6,0.3,21.0,4.0


In [20]:
# “The Census Bureau identifies two types of urban areas: Urbanized Areas (UAs) of 50,000 or more people; Urban Clusters (UCs) 
# of at least 2,500 and less than 50,000 people. ‘Rural’ encompasses all population, housing, and territory not included 
# within an urban area.” -- https://www.washingtonpost.com/politics/the-federal-definition-of-rural--times-15/2013/06/08/a39e46a8-cd4a-11e2-ac03-178510c9cc0a_story.html

# The Census recognizes that "densely settled communities outside the boundaries of large incorporated municipalities were just 
# as ''urban'' as the densely settled population inside those boundaries." Their definition does not follow city or county 
# boundaries, and so it is sometimes difficult to determine whether a particular area is considered urban or rural.
# https://www.hrsa.gov/rural-health/about-us/definition/index.html#:~:text=All%20counties%20that%20are%20not,
# as%20either%20Metro%20or%20Micro.

In [21]:
hist_agg = race_merged.groupby(['County Name', 'State Abbr']).agg({'Percent One Race Black+':'mean'})
hist_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Percent One Race Black+
County Name,State Abbr,Unnamed: 2_level_1
ABBEVILLE,SC,27.6
ACADIA,LA,17.4
ACCOMACK,VA,28.6
ADA,ID,1.4
ADAIR,KY,3.1


In [22]:
hist_agg['Percent One Race Black+'].mean()

8.943256653134867

In [23]:
hist_agg_white = race_merged.groupby(['County Name', 'State Abbr']).agg({'Percent One Race White':'mean'})
hist_agg_white.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Percent One Race White
County Name,State Abbr,Unnamed: 2_level_1
ABBEVILLE,SC,69.9
ACADIA,LA,79.3
ACCOMACK,VA,68.2
ADA,ID,90.5
ADAIR,KY,94.9


In [24]:
hist_agg_white['Percent One Race White'].mean()

83.03157419936852

In [35]:
race_merged[race_merged['Percent One Race Black+'] > hist_agg['Percent One Race Black+'].mean()] 
race_merged[race_merged['Percent One Race White'] > hist_agg_white['Percent One Race White'].mean()] 
race_merged[(race_merged['Percent One Race White'] > hist_agg_white['Percent One Race White'].mean()) & (race_merged['Percent One Race Black+'] > hist_agg['Percent One Race Black+'].mean())]
race_dropped = race_merged[~((race_merged['Percent One Race White'] > hist_agg_white['Percent One Race White'].mean()) & (race_merged['Percent One Race Black+'] > hist_agg['Percent One Race Black+'].mean()))]

In [41]:
white_df = race_dropped[race_dropped['Percent One Race White'] > hist_agg_white['Percent One Race White'].mean()]
black_df = race_dropped[race_dropped['Percent One Race Black+'] > hist_agg['Percent One Race Black+'].mean()] 

In [42]:
black_df['Total Population'].dtypes

dtype('int64')

In [43]:
black_rural_df = black_df[black_df['Total Population'] <= 50,000]
black_rural_df

TypeError: '(304      False
305      False
306      False
307      False
308      False
         ...  
13196    False
13198    False
13206    False
13212    False
13215    False
Name: Total Population, Length: 5058, dtype: bool, 0)' is an invalid key