Extract the subset of the data we want to look at.

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Apply seaborn's default theme to every figure drawn after this point.
sns.set()

# Read in the data

In [None]:
# Load the raw table; the path is resolved relative to the notebook's working directory.
raw_df = pd.read_csv("../data/division.csv")

In [None]:
# Peek at the first rows to check that the file parsed as expected.
raw_df.head()

In [None]:
# Summary statistics of the numeric columns.
raw_df.describe()

# Reformat the data

Let us change the structure of the data to make it easier to handle.

In [None]:
# Columns that identify a row's location; later cells use them as the index.
place_cols = ['FIPS', 'State', 'Division']

# Life-expectancy columns are discovered by name pattern instead of being listed by hand.
le_prefix, le_suffix = 'Life Expectancy (', ')'
le_cols = [
    col for col in raw_df.columns
    if col.startswith(le_prefix) and col.endswith(le_suffix)
]

# Population columns: the total, then a '#' and a '%' column per group
# (presumably count and share — confirm against the data dictionary), then '% Female'.
pop_groups = [
    'Black',
    'American Indian & Alaska Native',
    'Asian',
    'Native Hawaiian/Other Pacific Islander',
    'Hispanic',
    'Non-Hispanic White',
]
pop_cols = (
    ['Population']
    + [f'{marker} {label}' for label in pop_groups for marker in ('#', '%')]
    + ['% Female']
)

In [None]:
# Keep only the location identifiers; the per-topic frames below are joined back onto this.
place_df = raw_df[place_cols]

In [None]:
# Life expectancy per group, indexed by place.
# NOTE(review): the short labels are applied positionally, so this assumes the
# matched columns appear in exactly this order in the CSV — confirm against the header.
short_names = ['AIAN', 'Asian', 'Black', 'Hispanic', 'White']
le_values = raw_df[le_cols].copy()
le_values.columns = short_names
le_df = place_df.join(le_values).set_index(place_cols)
le_df.head()

In [None]:
# Population figures per group, indexed by place.
# Columns with no matching life-expectancy group are dropped.
unused_pop_cols = [
    "# Native Hawaiian/Other Pacific Islander",
    "% Native Hawaiian/Other Pacific Islander",
    "% Female",
]
# Shorten the long labels so they line up with the group names used in le_df.
short_pop_names = {
    '# American Indian & Alaska Native': '# AIAN',
    '% American Indian & Alaska Native': '% AIAN',
    '# Non-Hispanic White': '# White',
    '% Non-Hispanic White': '% White',
}
pop_values = (
    raw_df[pop_cols]
    .drop(columns=unused_pop_cols)
    .rename(columns=short_pop_names)
)
pop_df = place_df.join(pop_values).set_index(place_cols)
pop_df.head()

In [None]:
def to_group_df(group):
    """Build a long-format frame for a single group.

    Takes the "# <group>" and "% <group>" columns from ``pop_df``, joins the
    matching ``le_df[group]`` column on the shared index, and relabels the three
    columns as 'population', 'percentage' and 'life expectancy'.

    Parameters
    ----------
    group : str
        Short group label; must be a column of ``le_df`` and have
        "# <group>" / "% <group>" columns in ``pop_df``.

    Returns
    -------
    pandas.DataFrame
        The three relabelled columns plus a constant 'group' column.
    """
    counts = pop_df.loc[:, [f"# {group}", f"% {group}"]]
    combined = counts.join(le_df[group])
    combined.columns = ['population', 'percentage', 'life expectancy']
    # Tag every row so frames for different groups can be stacked and still told apart.
    return combined.assign(group=group)

In [None]:
# Spot-check the helper on a single group before applying it to all of them.
aian_df = to_group_df('AIAN')
aian_df.head()

In [None]:
# Stack the per-group frames into one long table, ordered by the place index.
groups = ['AIAN', 'Asian', 'Black', 'Hispanic', 'White']
group_frames = [to_group_df(label) for label in groups]
df = pd.concat(group_frames).sort_index()
df.head()

# Summarize data by state

Looking at county-level data is a bit overwhelming here, so let us summarize to the state level.

In [None]:
# One record per (State, group): total population and the population-weighted
# mean life expectancy over the rows that have no missing values.
records = []
for (state, label), rows in df.reset_index().groupby(["State", "group"]):
    complete = rows.dropna()
    total = complete['population'].sum()
    weights = complete['population'].div(total)
    records.append({
        'State': state,
        'group': label,
        'population': total,
        # Comes out as 0 when no complete rows remain for this pair.
        'life expectancy': complete['life expectancy'].mul(weights).sum(),
    })
state_df = pd.DataFrame(records)

In [None]:
# Preview the state-level summary.
state_df.head()

There is something strange happening with the model for the AIAN group.

In [None]:
# Distribution of state-level life expectancy per group.
# Only rows with a positive value are plotted.
has_estimate = state_df['life expectancy'] > 0
sns.boxplot(data=state_df[has_estimate], x='group', y='life expectancy')

So let us exclude that group for the moment.

In [None]:
# Strip-style scatter of state-level life expectancy per group (AIAN excluded),
# with the groups ordered from lowest to highest mean.
fig, ax = plt.subplots()
tdf = state_df[state_df['group'] != 'AIAN']
# Keep only rows with a positive life-expectancy value.
tdf = tdf[tdf['life expectancy'] > 0]
# Select the numeric column before averaging: the frame also holds the string
# column 'State', and groupby().mean() raises on non-numeric columns in pandas >= 2.0.
col_order = tdf.groupby('group')['life expectancy'].mean().sort_values()
for group in col_order.index:
    ttdf = tdf[tdf['group'] == group]
    ax.scatter(ttdf['group'], ttdf['life expectancy'], alpha=0.7)
ax.set(xlabel='group', ylabel='life expectancy');

# Group summaries

In [None]:
# Collapse the state-level rows into one population-weighted record per group.
results = []
for label, rows in state_df.reset_index().groupby("group"):
    tdf = rows.dropna()
    group_pop = tdf['population'].sum()
    shares = tdf['population'].div(group_pop)
    results.append({
        'group': label,
        'population': group_pop,
        'life expectancy': tdf['life expectancy'].mul(shares).sum(),
    })
nat_group_df = pd.DataFrame(results)

In [None]:
# Small frame (one row per group), so display it in full.
nat_group_df

In [None]:
# Population-weighted mean of the per-group values, giving a single overall figure.
group_pop = nat_group_df['population']
nat_group_df['life expectancy'].mul(group_pop / group_pop.sum()).sum()


In [None]:
# Build the input explicitly instead of reusing `tdf`: by this point that name
# has been rebound by the per-group loop that builds nat_group_df and holds only
# the rows of the last group it visited.
state_group_df = state_df[
    (state_df['group'] != 'AIAN') & (state_df['life expectancy'] > 0)
]
# One column per (measure, group); describe() then summarises each across states.
state_group_df.set_index(['State', 'group']).unstack().describe()

In [None]:
# State-level rows for the 'Asian' group, ordered from lowest to highest life expectancy.
state_df[state_df['group'] == 'Asian'].sort_values("life expectancy")

In [None]:
# Number of states whose 'Asian' life expectancy exceeds 84.6.
a_tdf = state_df.loc[state_df['group'] == 'Asian']
int(a_tdf['life expectancy'].gt(84.6).sum())

In [None]:
# Number of states whose 'Hispanic' life expectancy exceeds 84.6.
h_tdf = state_df.loc[state_df['group'] == 'Hispanic']
int(h_tdf['life expectancy'].gt(84.6).sum())

In [None]:
# 'Hispanic' rows ordered from lowest to highest life expectancy.
h_tdf.sort_values('life expectancy')

# Further questions

Compare our state summaries to the ones from the Robert Wood Johnson Foundation / University of Wisconsin.

In [None]:
# Load the published state-level table and keep only the columns used below.
rwjf_cols = ['FIPS', 'State', 'Life Expectancy', 'Population']
rwjf_state_df = pd.read_csv("../data/state.csv")[rwjf_cols]

In [None]:
# Preview the published state-level table.
rwjf_state_df.head()

The **mean** of the life expectancy values from the original data is pretty close to what is shown in _Our World in Data_.

In [None]:
# Unweighted mean: every row of the state table counts equally, regardless of population.
print("Life Expectancy as mean of states", rwjf_state_df['Life Expectancy'].mean())

If we compute the population-weighted mean, then our result is about 0.5 years higher.

In [None]:
# Population-weighted mean of the published state values: each state contributes
# in proportion to its share of the summed 'Population' column.
pop_total = rwjf_state_df['Population'].sum()
pop_frac = rwjf_state_df['Population'] / pop_total
# Computed once and reused (the original built an unused dict and repeated the sum).
rwjf_weighted_le = (rwjf_state_df['Life Expectancy'] * pop_frac).sum()
print("Life Expectancy as weighted mean of states", rwjf_weighted_le)

The state summaries we get by summarizing the higher-resolution data are a bit different, and about 1 year higher on average.

In [None]:
# One record per state: total population and population-weighted mean life
# expectancy across all groups, using only rows with no missing values.
results = []
# Group by the bare column name rather than a one-element list: with a list
# grouper pandas >= 2.0 yields 1-tuples as keys, which would put tuples such as
# ('Alabama',) into the 'State' column instead of plain strings.
for name, gdf in df.reset_index().groupby("State"):
    tdf = gdf.dropna()
    pop_total = tdf['population'].sum()
    pop_frac = tdf['population'] / pop_total
    result = {'State': name,
              'population': pop_total,
              'life expectancy': (tdf['life expectancy'] * pop_frac).sum()}
    results.append(result)
state_summary_df = pd.DataFrame(results)

In [None]:
# Preview the recomputed per-state summary.
state_summary_df.head()

In [None]:
# Difference between our recomputed state values and the published ones.
# NOTE(review): both frames carry a default integer index, so the subtraction
# pairs rows by position, not by state name — this assumes the two tables list
# the same states in the same order; confirm before trusting the numbers.
state_summary_df['life expectancy'].sub(rwjf_state_df['Life Expectancy']).describe()

In [None]:
# Population-weighted mean of the recomputed state values, giving one overall figure.
pop_total = state_summary_df['population'].sum()
pop_frac = state_summary_df['population'] / pop_total
# Computed once and reused (the original built an unused dict and repeated the sum).
summary_weighted_le = (state_summary_df['life expectancy'] * pop_frac).sum()
print("Life Expectancy", summary_weighted_le)