In [1]:

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from statistics import mean

# Locations of the ETL outputs consumed by this notebook
census_metadata_path = "../ETL/ETL-Results/census_data.csv"
health_metadata_path = "../ETL/ETL-Results/four_mort_measures.csv"

# Load the census extract and the mortality-measures extract
census_metadata = pd.read_csv(census_metadata_path)
health_metadata = pd.read_csv(health_metadata_path)

# Inner-join the two frames on (state, county). The left frame's "State" column
# is matched against the right frame's "State Abbr" column; both frames also
# carry a "State" column, which pandas suffixes as State_x / State_y.
race_data_df = pd.merge(
    health_metadata,
    census_metadata,
    how="inner",
    left_on=["State", "County Name"],
    right_on=["State Abbr", "County Name"],
)

# Show the merged column names as a quick sanity check
race_data_df.columns

Index(['Facility ID', 'Facility Name', 'Address', 'City', 'State_x',
       'ZIP Code', 'County Name', 'Measure ID', 'Measure Name', 'Denominator',
       'Score', 'Lower Estimate', 'Higher Estimate', 'Start Date', 'End Date',
       'State_y', 'State Abbr', 'Household Median Income',
       'Family's Median Income', 'Total Population', 'Percent Poverty',
       'Percent Veteran', 'Percent Married', 'Percent Bachelor',
       'Percent One Race White', 'Percent One Race Black+',
       'Percent One Race American Indian+', 'Percent One Race Asian',
       'Percent One Race Hawaiian+', 'Percent One Race Some Other',
       'Percent Two Race Or More', 'State Code', 'County Code'],
      dtype='object')

In [2]:
# Replace the merge-suffixed / generic column names with descriptive ones
race_data_df = race_data_df.rename(
    columns={
        "State_y": "State",
        "Score": "Mortality Percentage",
    }
)

# Keep only the columns needed for the race / mortality comparison
race_merged = race_data_df[
    [
        'Facility Name',
        'County Name',
        'City',
        'State Abbr',
        'Total Population',
        'Measure Name',
        'Mortality Percentage',
        'Percent One Race White',
        'Percent One Race Black+',
    ]
]

In [3]:
# Mean "Percent One Race Black+" for every (county, state) pair
hist_agg = (
    race_merged
    .groupby(['County Name', 'State Abbr'])[['Percent One Race Black+']]
    .mean()
)

# Mean "Percent One Race White" for every (county, state) pair
hist_agg_white = (
    race_merged
    .groupby(['County Name', 'State Abbr'])[['Percent One Race White']]
    .mean()
)

In [4]:
# Collapse to one row per (county, state, measure, population) combination by
# averaging the mortality score and the two race-share columns, then turn the
# group keys back into ordinary columns.
race_merged = (
    race_merged
    .groupby(['County Name', 'State Abbr', 'Measure Name', 'Total Population'])
    [['Mortality Percentage', 'Percent One Race White', 'Percent One Race Black+']]
    .mean()
    .reset_index()
)

In [5]:
# Cut-offs: the mean of the per-county race shares computed earlier.
black_share_cutoff = hist_agg['Percent One Race Black+'].mean()
white_share_cutoff = hist_agg_white['Percent One Race White'].mean()

# Row masks for "above the mean share" on each race column.
above_white_cutoff = race_merged['Percent One Race White'] > white_share_cutoff
above_black_cutoff = race_merged['Percent One Race Black+'] > black_share_cutoff

# Rows that exceed BOTH cut-offs. Only the last expression of a cell is
# rendered, so the two single-condition filters that used to precede this line
# were computed and thrown away; they are expressed as the masks above instead.
race_merged[above_white_cutoff & above_black_cutoff]

Unnamed: 0,County Name,State Abbr,Measure Name,Total Population,Mortality Percentage,Percent One Race White,Percent One Race Black+
86,ALCORN,MS,Death rate for COPD patients,37090,7.80,84.4,12.1
87,ALCORN,MS,Death rate for heart attack patients,37090,15.90,84.4,12.1
88,ALCORN,MS,Death rate for heart failure patients,37090,12.70,84.4,12.1
89,ALCORN,MS,Death rate for pneumonia patients,37090,18.80,84.4,12.1
280,BALDWIN,AL,Death rate for COPD patients,212830,8.90,86.2,9.3
...,...,...,...,...,...,...,...
6289,UNION,KY,Death rate for heart failure patients,14638,12.70,83.2,13.3
6353,VANDERBURGH,IN,Death rate for COPD patients,181291,9.75,85.0,9.5
6354,VANDERBURGH,IN,Death rate for heart attack patients,181291,13.55,85.0,9.5
6355,VANDERBURGH,IN,Death rate for heart failure patients,181291,12.05,85.0,9.5


In [6]:
# Masks for rows whose race share is above the mean of the per-county values
white_above_mean = race_merged['Percent One Race White'] > hist_agg_white['Percent One Race White'].mean()
black_above_mean = race_merged['Percent One Race Black+'] > hist_agg['Percent One Race Black+'].mean()

# Drop the rows that are above the mean on both columns at once,
# i.e. keep a row when at least one of the two masks is False.
race_dropped = race_merged[~white_above_mean | ~black_above_mean]


In [7]:
# Means of the per-county race shares, used as the group cut-offs
white_share_cutoff = hist_agg_white['Percent One Race White'].mean()
black_share_cutoff = hist_agg['Percent One Race Black+'].mean()

# Rows whose White share exceeds its cut-off
white_df = race_dropped.loc[race_dropped['Percent One Race White'] > white_share_cutoff]
# Rows whose Black+ share exceeds its cut-off
black_df = race_dropped.loc[race_dropped['Percent One Race Black+'] > black_share_cutoff]



In [8]:
# Stack the above-mean-White rows on top of the above-mean-Black+ rows
white_black_df = pd.concat([white_df, black_df], axis=0)

# Confirm which columns the combined frame carries
white_black_df.columns

Index(['County Name', 'State Abbr', 'Measure Name', 'Total Population',
       'Mortality Percentage', 'Percent One Race White',
       'Percent One Race Black+'],
      dtype='object')

In [9]:
# COPD-only slices of the two race groups
black_COPD_df = black_df.loc[black_df['Measure Name'].eq('Death rate for COPD patients')]
white_COPD_df = white_df.loc[white_df['Measure Name'].eq('Death rate for COPD patients')]

In [10]:
def get_urban_rural(x, threshold=50000):
    """Label a population count as 'RURAL' or 'URBAN'.

    Parameters
    ----------
    x : number
        Population value to classify.
    threshold : number, default 50000
        Largest population still labelled 'RURAL'. The default reproduces the
        cut-off that was previously hard-coded in the body.

    Returns
    -------
    str
        'RURAL' when ``x <= threshold``, otherwise 'URBAN'. Any value for which
        the comparison is false (including NaN) is therefore labelled 'URBAN'.
    """
    return 'RURAL' if x <= threshold else 'URBAN'

In [11]:
# Tag every row as RURAL or URBAN from its county population
white_black_df = white_black_df.assign(
    urban_rural_category=white_black_df['Total Population'].apply(get_urban_rural)
)

# Preview the rows classified as URBAN
white_black_df.loc[white_black_df['urban_rural_category'] == 'URBAN']

Unnamed: 0,County Name,State Abbr,Measure Name,Total Population,Mortality Percentage,Percent One Race White,Percent One Race Black+,urban_rural_category
8,ADA,ID,Death rate for COPD patients,456849,8.366667,90.5,1.4,URBAN
9,ADA,ID,Death rate for heart attack patients,456849,12.400000,90.5,1.4,URBAN
10,ADA,ID,Death rate for heart failure patients,456849,11.566667,90.5,1.4,URBAN
11,ADA,ID,Death rate for pneumonia patients,456849,12.900000,90.5,1.4,URBAN
26,ADAMS,IL,Death rate for COPD patients,66085,8.300000,92.9,4.1,URBAN
...,...,...,...,...,...,...,...,...
6907,WYANDOTTE,KS,Death rate for pneumonia patients,164861,12.400000,58.8,22.3,URBAN
6966,YORK,SC,Death rate for COPD patients,265872,10.600000,74.2,19.2,URBAN
6967,YORK,SC,Death rate for heart attack patients,265872,14.200000,74.2,19.2,URBAN
6968,YORK,SC,Death rate for heart failure patients,265872,15.200000,74.2,19.2,URBAN


In [12]:
# Bind a second name to the same DataFrame object for the rural/urban analysis.
# This is an alias, not a copy: a change made through either name is visible
# through the other.
rural_urban_df = white_black_df
# Display the frame (last expression of the cell)
rural_urban_df

Unnamed: 0,County Name,State Abbr,Measure Name,Total Population,Mortality Percentage,Percent One Race White,Percent One Race Black+,urban_rural_category
8,ADA,ID,Death rate for COPD patients,456849,8.366667,90.5,1.4,URBAN
9,ADA,ID,Death rate for heart attack patients,456849,12.400000,90.5,1.4,URBAN
10,ADA,ID,Death rate for heart failure patients,456849,11.566667,90.5,1.4,URBAN
11,ADA,ID,Death rate for pneumonia patients,456849,12.900000,90.5,1.4,URBAN
12,ADAIR,KY,Death rate for pneumonia patients,19222,14.300000,94.9,3.1,RURAL
...,...,...,...,...,...,...,...,...
6942,YAZOO,MS,Death rate for pneumonia patients,28565,12.900000,36.9,59.6,RURAL
6966,YORK,SC,Death rate for COPD patients,265872,10.600000,74.2,19.2,URBAN
6967,YORK,SC,Death rate for heart attack patients,265872,14.200000,74.2,19.2,URBAN
6968,YORK,SC,Death rate for heart failure patients,265872,15.200000,74.2,19.2,URBAN


In [15]:
def mortality_ttest(measure_name, urban_rural_df):
    """Compare rural and urban mortality for one measure with Welch's t-test.

    Parameters
    ----------
    measure_name : str
        Value of the 'Measure Name' column selecting the rows to test.
    urban_rural_df : pandas.DataFrame
        Frame with 'Measure Name', 'urban_rural_category' ('RURAL'/'URBAN')
        and 'Mortality Percentage' columns.

    Returns
    -------
    The result object of ``scipy.stats.ttest_ind`` (statistic, pvalue), with
    the rural sample as the first argument and the urban sample as the second.
    """
    # Use the frame that was passed in. The previous body ignored this
    # parameter and read the notebook-level `rural_urban_df` instead, so the
    # result did not depend on the argument at all.
    measure_rows = urban_rural_df[urban_rural_df['Measure Name'] == measure_name]

    rural_mortality = measure_rows.loc[
        measure_rows['urban_rural_category'] == 'RURAL', 'Mortality Percentage'
    ]
    urban_mortality = measure_rows.loc[
        measure_rows['urban_rural_category'] == 'URBAN', 'Mortality Percentage'
    ]

    # equal_var=False -> Welch's t-test (no equal-variance assumption)
    return st.ttest_ind(rural_mortality, urban_mortality, equal_var=False)

# Run the rural-vs-urban t-test for each mortality measure and print the result
for measure_name in (
    'Death rate for COPD patients',
    'Death rate for pneumonia patients',
    'Death rate for heart failure patients',
    'Death rate for heart attack patients',
):
    print(mortality_ttest(measure_name, rural_urban_df))

Ttest_indResult(statistic=-1.1290733610434243, pvalue=0.259033234502412)
Ttest_indResult(statistic=1.3873500405319792, pvalue=0.16551189293129867)
Ttest_indResult(statistic=6.020884974265895, pvalue=2.145756436292528e-09)
Ttest_indResult(statistic=2.1995850041302556, pvalue=0.028577996648351322)
