Extract the subset of the data we want to look at.

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Apply seaborn's default theme so the figures drawn below pick up its styling.
sns.set()

# Read in the data

In [None]:
# Load the state-level table. The cells below read its 'group',
# 'life expectancy' and 'income' columns.
state_df = pd.read_csv("../data/viz/state_2022.csv")
# Trailing expression: the notebook renders the first rows as a preview.
state_df.head()

In [None]:
# Load the OECD comparison table and key it by its 'cty' column so that rows
# can be looked up by label (the cells below look up 'JPN').
oecd_df = pd.read_csv("../data/viz/oecd.csv").set_index('cty')
# Trailing expression: the notebook renders the first rows as a preview.
oecd_df.head()

In [None]:
# Reference levels drawn as horizontal lines in the plots below.
# Mean of the 'life expectancy' column over every row of the OECD table.
oecd_mean = oecd_df['life expectancy'].mean()
# Life expectancy of the row labelled 'JPN' (index is the 'cty' column).
jpn_val = oecd_df.loc['JPN', 'life expectancy']

# Plot

In [None]:
# Grab the current colour palette once; the plotting cells index into it
# (palette[6] for scatter points, palette[0] for the regression lines).
palette = sns.color_palette()

In [None]:
# Scatter of state-level life expectancy, one categorical column per group,
# with the OECD mean and the JPN value drawn as horizontal reference lines.
fig, ax = plt.subplots()

# The 'AIAN' group is left out of this figure, and rows with a non-positive
# life expectancy are dropped (presumably placeholders for missing values --
# TODO confirm against the CSV).
tdf = state_df[state_df['group'] != 'AIAN']
tdf = tdf[tdf['life expectancy'] > 0]

# Average only the column we sort by. A bare ``groupby('group').mean()``
# tries to average every column and raises TypeError on non-numeric columns
# in pandas >= 2.0.
col_order = tdf.groupby('group')[['life expectancy']].mean().sort_values('life expectancy')

# Draw the groups in ascending order of mean life expectancy; on a
# categorical x axis the order of the scatter calls sets the column order.
for group in col_order.index:
    ttdf = tdf[tdf['group'] == group]
    ax.scatter(ttdf['group'], ttdf['life expectancy'], color=palette[6], alpha=0.5)

# Reference lines and their labels. Annotation coordinates are in data units
# and were placed by hand for this figure.
ax.axhline(oecd_mean, alpha=0.5)
ax.annotate("Cohort\nmean", (1.2, oecd_mean - 3.5))

ax.axhline(jpn_val, alpha=0.5)
ax.annotate("JPN", (1.2, jpn_val))

# Number of state rows in which the 'Asian' group exceeds each reference.
a_tdf = state_df[state_df['group'] == 'Asian']
asian_over_cohort = int((a_tdf['life expectancy'] > oecd_mean).sum())
asian_count = int((a_tdf['life expectancy'] > jpn_val).sum())
ax.annotate(f"> cohort in\n{asian_over_cohort} states", (2.6, 95))
ax.annotate(f"> JPN in\n{asian_count} states", (2.5, 90))

# Same two counts for the 'Hispanic' group.
h_tdf = state_df[state_df['group'] == 'Hispanic']
hispanic_over_cohort = int((h_tdf['life expectancy'] > oecd_mean).sum())
hispanic_count = int((h_tdf['life expectancy'] > jpn_val).sum())
ax.annotate(f"> cohort in\n{hispanic_over_cohort} states", (1.7, 95))
# The trailing semicolon keeps the notebook from echoing the Annotation object.
ax.annotate(f"> JPN in\n{hispanic_count} states", (1.5, 90));

# Life Expectancy vs. Income

In [None]:
# Load the regression table. The plotting cell below filters it by 'group'
# and reads its 'x', 'y', 'r2' and 'slope' columns (presumably one fitted
# line per group -- verify against the script that writes this file).
reg_df = pd.read_csv("../data/viz/regression_2022.csv")

In [None]:
# Life expectancy against income: one panel per group, ordered left to right
# by the group's mean life expectancy, each with the regression line from
# reg_df and the OECD mean / JPN reference lines.
tdf = state_df[state_df['life expectancy'] > 0]

# Average only the column we sort by. A bare ``groupby('group').mean()``
# tries to average every column and raises TypeError on non-numeric columns
# in pandas >= 2.0.
col_order = tdf.groupby('group')[['life expectancy']].mean().sort_values('life expectancy')

fig, axs = plt.subplots(1, len(col_order), sharex=True, sharey=True, figsize=(12, 3))
for i, group in enumerate(col_order.index):
    ax = axs[i]

    # State-level points for this group; income is shown in thousands.
    ttdf = tdf[tdf['group'] == group]
    ax.scatter(ttdf['income'] / 1000, ttdf['life expectancy'], color=palette[6], alpha=0.4)
    ax.set_title(group)

    # Regression rows for this group, kept under their own name instead of
    # rebinding the scatter subset. 'x' is plotted on the same axis as
    # income / 1000, so it is presumably already in thousands -- TODO confirm.
    fit_df = reg_df[reg_df['group'] == group]
    ax.plot(fit_df['x'], fit_df['y'], color=palette[0])

    ax.axhline(oecd_mean, alpha=0.5)
    ax.axhline(jpn_val, alpha=0.5)

    # Label the panel with the fit statistics taken from the first regression
    # row. Guarded so that a group with no rows in reg_df gets an unlabelled
    # panel instead of an IndexError from ``iloc[0]``.
    if not fit_df.empty:
        fit_stats = fit_df.iloc[0]
        ax.annotate(f"$r^2={fit_stats['r2']:.2f}$", (95, 100))
        ax.annotate(f"$slope={fit_stats['slope']:.2f}$", (95, 97))

    # Axis labels and reference-line labels go on the first panel only.
    if i == 0:
        ax.set_ylabel('life expectancy')
        ax.set_xlabel('income (thousands)')
        ax.annotate("JPN", (110, jpn_val))
        ax.annotate("Cohort\nmean", (110, oecd_mean - 3.5))