#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 08
**CH08B How is life expectancy related to the average income of a country?**

using the worldbank-lifeexpectancy dataset

version 1.0 2021-05-05

In [None]:
import os
import sys
import warnings
from datetime import datetime

import matplotlib.font_manager
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from helper_functions import *
from mizani.formatters import log_format, percent_format
from mizani.transforms import log_trans
from scipy.stats import norm
from skimpy import skim

# Silence every Python warning for the rest of the session so rendered
# output stays uncluttered (this also hides deprecation and pandas warnings).
warnings.filterwarnings(action="ignore")
# Use seaborn's "whitegrid" theme for all figures drawn below.
sns.set_style(style="whitegrid")

Read clean data

In [None]:
# Download the cleaned worldbank-lifeexpectancy data from OSF into a DataFrame.
xc = pd.read_csv(filepath_or_buffer="https://osf.io/sh9mu/download")

Select the 2017 cross-section

In [None]:
# Keep only the 2017 rows. The explicit .copy() makes `xc` an independent
# frame, so the column assignments in later cells (gdptot, lngdppc, e3, ...)
# do not write into a slice of the full panel -- the situation pandas flags
# with SettingWithCopyWarning, which the global warnings filter would hide.
xc = xc.loc[lambda x: x["year"] == 2017].copy()

Total GDP and log transformations

In [None]:
# Total GDP is GDP per capita multiplied by population.
xc["gdptot"] = xc["gdppc"].mul(xc["population"])
# Natural logs of both GDP measures, used by the level-log regressions below.
xc["lngdppc"] = np.log(xc["gdppc"])
xc["lngdptot"] = np.log(xc["gdptot"])

In [None]:
# Summary statistics for the outcome and the GDP measures (levels and logs).
summary_columns = ["lifeexp", "gdppc", "gdptot", "lngdppc", "lngdptot"]
skim(xc.filter(items=summary_columns))

### Figure 8.3 The distribution of GDP per capita

(a) Histogram of GDP per capita

In [None]:
# Percent-scaled histogram of GDP per capita: bins of width 3 over 0-120.
fig1 = sns.histplot(
    data=xc,
    x="gdppc",
    stat="percent",
    binwidth=3,
    binrange=(0, 120),
    color="blue",
    edgecolor="white",
)
fig1.set_xlabel("GDP per capita (thousand US dollars)", size=12)
fig1.set_ylabel("Percent", size=12)
fig1.set_xlim(0, 120)
# Show y ticks as whole-number percentages (values are already on a 0-100 scale).
fig1.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100, decimals=0))
fig1.set_yticks(np.arange(0, 21, 4))
add_margin(fig1, x=0.01, y=0)
plt.show()

(b) Histogram of ln(GDP per capita)

In [None]:
# Percent-scaled histogram of ln(GDP per capita): bins of width 0.15 over 0-5.
fig1 = sns.histplot(
    data=xc,
    x="lngdppc",
    stat="percent",
    binwidth=0.15,
    binrange=(0, 5),
    color="blue",
    edgecolor="white",
)
fig1.set_xlabel("ln(GDP per capita (thousand US dollars))", size=12)
fig1.set_ylabel("Percent", size=12)
fig1.set_xlim(0, 5)
# Show y ticks as whole-number percentages (values are already on a 0-100 scale).
fig1.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100, decimals=0))
fig1.set_yticks(np.arange(0, 11, 2))
add_margin(fig1, x=0.01, y=0)
plt.show()

 LEVEL-LEVEL REGRESSION

In [None]:
# Level-level OLS: life expectancy regressed on GDP per capita.
reg3 = smf.ols("lifeexp ~ gdppc", data=xc)
# Fit and display the regression table as the cell output.
reg3.fit().summary()


### Figure 8.4 Life expectancy and GDP per capita

In [None]:
# Scatterplot of life expectancy against GDP per capita (levels, no fitted line).
# NOTE(review): `size=11` is seaborn's semantic size mapping, not the matplotlib
# marker area; `s=11` (as used in the regplot cells) may have been intended --
# confirm before changing. It is kept exactly as in the original here.
figscatter0 = sns.scatterplot(
    xc, x="gdppc", y="lifeexp", color="blue", size=11, legend=None
)
plt.ylabel("Life expectancy  (years)", size=12)
plt.xlabel("GDP per capita (thousand US dollars)", size=12)
plt.xlim(0, 120)
plt.ylim(50, 100)
plt.yticks(np.arange(50, 101, 5))
# Pass the Axes of THIS plot to add_margin. The original passed `fig1`, which
# is the Axes of the histogram drawn in an earlier cell, not the scatterplot.
add_margin(figscatter0, x=0.01, y=0.01)
plt.show()

In [None]:
# Scatter of life expectancy vs. GDP per capita with the fitted linear
# regression line overlaid (no confidence band).
figscatter1 = sns.regplot(
    data=xc,
    x="gdppc",
    y="lifeexp",
    ci=None,
    line_kws={"color": "red"},
    scatter_kws={"color": "blue", "s": 11},
)
figscatter1.set_xlabel("GDP per capita (thousand US dollars)", size=12)
figscatter1.set_ylabel("Life expectancy  (years)", size=12)
figscatter1.set_xlim(0, 120)
figscatter1.set_ylim(50, 100)
figscatter1.set_yticks(np.arange(50, 101, 5))
add_margin(figscatter1, x=0.005, y=0.005)
plt.show()

LOG GDP PER CAPITA

In [None]:
# Level-log OLS: life expectancy regressed on ln(GDP per capita).
reg4 = smf.ols("lifeexp ~ lngdppc", data=xc)
# Fit and display the regression table as the cell output.
reg4.fit().summary()


### Figure 8.5 Life expectancy and GDP per capita

(a) Life expectancy and ln(GDP per capita)

In [None]:
# Scatter and linear fit of life expectancy on ln(GDP per capita).
figscatter2 = sns.regplot(
    xc,
    x="lngdppc",
    y="lifeexp",
    ci=None,
    scatter_kws={"color": "blue", "s": 11},
    line_kws={"color": "red"},
)
# The x axis shows the log values themselves (ticks from -0.5 to 4.5), so the
# label must say ln(...). The original label described the level variable.
# The wording matches the ln(GDP per capita) histogram label used earlier.
plt.xlabel("ln(GDP per capita (thousand US dollars))", size=12)
plt.ylabel("Life expectancy  (years)", size=12)
plt.xlim(-0.5, 4.8)
plt.ylim(50, 85)
plt.xticks(np.arange(-0.5, 4.6, 0.5))
plt.yticks(np.arange(50, 86, 5))
add_margin(figscatter2, x=0.05, y=0.01)
plt.show()

(b) Life expectancy and ln(GDP per capita),
(labels are thousand dollars)

In [None]:
# Same relationship with x = GDP per capita in levels: regplot fits
# y ~ log(x) (logx=True) and the axis is log-scaled, so tick labels stay in
# thousand-dollar units.
gdppc_ticks = (1, 2, 5, 10, 20, 50, 100)
sns.regplot(
    data=xc,
    x="gdppc",
    y="lifeexp",
    logx=True,
    ci=None,
    line_kws={"color": "red"},
    scatter_kws={"color": "blue", "s": 11},
)
plt.ylabel("Life expectancy  (years)", size=12)
plt.xlabel("GDP per capita (thousand US dollars)", size=12)
plt.xscale("log")
plt.ylim(50, 85)
plt.xticks(gdppc_ticks, labels=gdppc_ticks)
plt.yticks(np.arange(50, 86, 5))
plt.show()

### TOTAL GDP

Level-level regression

In [None]:
# Level-level OLS for the TOTAL GDP section: life expectancy on total GDP.
# The original formula used `gdppc`, which only repeated reg3 and did not match
# this section or the gdptot scatterplot that follows.
reg1 = smf.ols(formula="lifeexp ~ gdptot", data=xc)
# Fit and display the regression table as the cell output.
reg1.fit().summary()


### Figure 8.6 Life expectancy and total GDP

(a) Life expectancy and total GDP

In [None]:
# Scatter and linear fit of life expectancy on total GDP (levels).
figscatter1 = sns.regplot(
    xc,
    x="gdptot",
    y="lifeexp",
    ci=None,
    scatter_kws={"color": "blue", "s": 11},
    line_kws={"color": "red"},
)
plt.ylabel("Life expectancy  (years)", size=12)
# gdptot = gdppc * population. With gdppc in thousand US dollars, the 0-24000
# axis range below implies billions of dollars (assumes population is recorded
# in millions -- confirm against the dataset documentation). The original
# label said "thousand" and lacked the space before the parenthesis.
plt.xlabel("Total GDP (billion US dollars)", size=12)
plt.xlim(0, 24000)
plt.ylim(50, 85)
# Ticks run to the top of the 50-85 axis, as in the other life-expectancy plots.
plt.yticks(np.arange(50, 86, 5))
add_margin(figscatter1, x=0.01, y=0)
plt.show()

In [None]:
# Level-log OLS: life expectancy regressed on ln(total GDP).
reg2 = smf.ols("lifeexp ~ lngdptot", data=xc)
# Fit and display the regression table as the cell output.
reg2.fit().summary()


(b) Life expectancy and ln total GDP

In [None]:
# Life expectancy against total GDP on a log-scaled x axis; regplot fits
# y ~ log(x) because logx=True.
sns.regplot(
    xc,
    x="gdptot",
    y="lifeexp",
    ci=None,
    logx=True,
    scatter_kws={"color": "blue", "s": 11},
    line_kws={"color": "red"},
)
# The plotted variable is total GDP, not GDP per capita as the original label
# said. Unit: gdptot = gdppc (thousand USD) * population, presumably billions
# of dollars -- assumes population is in millions; confirm against the dataset.
plt.xlabel("Total GDP, billion US dollars (ln scale)", size=12)
plt.ylabel("Life expectancy  (years)", size=12)
plt.xscale("log")
plt.ylim(50, 85)
# Default log-axis ticks are kept; the per-capita tick list from Figure 8.5
# does not fit the range of total GDP.
plt.yticks(np.arange(50, 86, 5))
plt.show()

### GDP PER CAPITA PIECEWISE LINEAR SPLINE


In [None]:
# Knot of the piecewise linear spline, expressed in the units of `gdppc` ...
cutoff: int = 50
# ... and the same knot on the ln(GDP per capita) scale used by the regression.
cutoff_ln = np.log(cutoff)

In [None]:
# Piecewise linear spline in ln(GDP per capita) with a single knot at
# `cutoff_ln`. `lspline` and `cutoff_ln` are looked up by the formula in this
# notebook's namespace.
reg5 = smf.ols("lifeexp ~ lspline(lngdppc,cutoff_ln)", data=xc)
# Fit and display the regression table as the cell output.
reg5.fit().summary()


In [None]:
# Fit the spline model once and reuse the result. The original called
# reg5.fit() separately for the residuals and for the predictions, estimating
# the same model twice.
reg5_fit = reg5.fit()
xc["e3"] = reg5_fit.resid  # residuals of the spline regression
xc["sppred"] = reg5_fit.predict()  # in-sample fitted values, plotted in Figure 8.7a


### Figure 8.7 Life expectancy and GDP per capita: scatterplot and nonlinear regression

(a) Piecewise linear spline

In [None]:
# Scatter of the data plus the spline predictions, drawn on ln(GDP per capita)
# but labelled with the corresponding level values.
gdppc_ticks = (1, 2, 5, 10, 20, 50, 100)
fig, ax = plt.subplots(figsize=(6.4, 4.8))
p1 = sns.scatterplot(data=xc, x="lngdppc", y="lifeexp", color="blue", ax=ax)
p2 = sns.lineplot(data=xc, x="lngdppc", y="sppred", color="red", ax=ax)
ax.set_xlabel("GDP per capita (thousand US dollars)", size=12)
ax.set_ylabel("Life expectancy  (years)", size=12)
ax.set_ylim(50, 85)
# Ticks sit at ln(value) positions and show the level value as their label.
plt.xticks(np.log(list(gdppc_ticks)), labels=gdppc_ticks)
ax.set_yticks(np.arange(50, 86, 5))
# Dotted vertical line marking the spline knot.
ax.axvline(np.log(cutoff), linestyle=":", linewidth=1, color="green")
plt.show()

### QUADRATIC IN LEVEL-LOG REGRESSION

In [None]:
# Squared ln(GDP per capita) for the quadratic level-log specification.
xc["lngdppc_sq"] = xc["lngdppc"] ** 2
reg6 = smf.ols("lifeexp ~ lngdppc+lngdppc_sq", data=xc)
# Fit and display the regression table as the cell output.
reg6.fit().summary()

In [None]:
# Store the residuals of the quadratic regression as a column of xc.
reg6_resid = reg6.fit().resid
xc["e6"] = reg6_resid

(b) Quadratic function

In [None]:
# Scatter with a second-order polynomial fit (order=2) in ln(GDP per capita);
# the axis is labelled with the corresponding level values.
gdppc_ticks = (1, 2, 5, 10, 20, 50, 100)
sns.regplot(
    data=xc,
    x="lngdppc",
    y="lifeexp",
    order=2,
    ci=None,
    line_kws={"color": "red"},
    scatter_kws={"color": "blue", "s": 11},
)
plt.ylabel("Life expectancy  (years)", size=12)
plt.xlabel("GDP per capita (thousand US dollars)", size=12)
plt.ylim(50, 85)
# Ticks sit at ln(value) positions and show the level value as their label.
plt.xticks(np.log(list(gdppc_ticks)), labels=gdppc_ticks)
plt.yticks(np.arange(50, 86, 5))
plt.show()

### WEIGHTED AND UNWEIGHTED REGRESSION

In [None]:
# Unweighted level-log OLS: every country counts equally.
reg7 = smf.ols("lifeexp ~ lngdppc", data=xc)
# Fit and display the regression table as the cell output.
reg7.fit().summary()


In [None]:
# Same specification estimated by weighted least squares, with each country's
# population as its weight. This rebinds `reg7` from the unweighted model.
reg7 = smf.wls("lifeexp ~ lngdppc", data=xc, weights=xc["population"])
# Fit and display the regression table as the cell output.
reg7.fit().summary()


### Figure 8.9 Life expectancy and log GDP per capita: unweighted and weighted regressions

(a) Unweighted

In [None]:
# Unweighted fit: equal-sized markers, y ~ log(x) regression line, log-scaled x.
gdppc_ticks = (1, 2, 5, 10, 20, 50, 100)
fig_unw = sns.regplot(
    data=xc,
    x="gdppc",
    y="lifeexp",
    logx=True,
    ci=None,
    line_kws={"color": "red"},
    scatter_kws={"color": "blue", "s": 11},
)
fig_unw.set_xlabel("GDP per capita, thousand US dollars (ln scale)", size=12)
fig_unw.set_ylabel("Life expectancy  (years)", size=12)
fig_unw.set_xscale("log")
fig_unw.set_ylim(50, 85)
plt.xticks(gdppc_ticks, labels=gdppc_ticks)
fig_unw.set_yticks(np.arange(50, 86, 5))
add_margin(fig_unw, x=0, y=0.008)
plt.show()

(b) Weighted

In [None]:
# Weighted panel: marker area is each country's population (semi-transparent
# so overlapping bubbles stay visible). The Axes is bound to `fig_unw` again,
# exactly as in the original cell.
gdppc_ticks = (1, 2, 5, 10, 20, 50, 100)
fig_unw = sns.regplot(
    data=xc,
    x="gdppc",
    y="lifeexp",
    logx=True,
    ci=None,
    line_kws={"color": "red"},
    scatter_kws={"color": "blue", "s": xc["population"], "alpha": 0.5},
)
fig_unw.set_xlabel("GDP per capita, thousand US dollars (ln scale)", size=12)
fig_unw.set_ylabel("Life expectancy  (years)", size=12)
fig_unw.set_xscale("log")
fig_unw.set_ylim(50, 85)
plt.xticks(gdppc_ticks, labels=gdppc_ticks)
fig_unw.set_yticks(np.arange(50, 86, 5))
add_margin(fig_unw, x=0, y=0.008)
# Country labels placed at hand-picked data coordinates.
for country, position in (("USA", (65, 79)), ("China", (10, 81)), ("India", (7, 64))):
    fig_unw.annotate(country, position)
plt.show()