#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 08
**CH08B How is life expectancy related to the average income of a country?**

using the worldbank-lifeexpectancy dataset

version 1.0 2021-05-05

In [None]:
import os
import sys
import warnings
from datetime import datetime

import matplotlib.font_manager
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from helper_functions import *
from mizani.formatters import log_format, percent_format
from mizani.transforms import log_trans
from plotnine import *
from scipy.stats import norm

# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed in the notebook output.
warnings.filterwarnings("ignore")

Read clean data

In [None]:
# Download the cleaned dataset straight from its OSF URL into a DataFrame.
xc = pd.read_csv(filepath_or_buffer="https://osf.io/sh9mu/download")

In [None]:
# Bare last expression: Jupyter renders the freshly loaded frame as the cell output.
xc

Select the 2017 cross-section

In [None]:
# Keep only the 2017 observations.
# The explicit .copy() makes the result an independent frame: later cells add
# columns to `xc`, and assigning into a filtered selection is the chained-assignment
# pattern pandas warns about (the warning is hidden by the global "ignore" filter).
xc = xc.loc[lambda x: x["year"] == 2017].copy()

Create total GDP, then take the natural log of GDP per capita and of total GDP

In [None]:
# Total GDP is per-capita GDP times population; the two log columns are the
# natural logs of per-capita and total GDP respectively.
xc["gdptot"] = xc["gdppc"].mul(xc["population"])
xc["lngdppc"] = xc["gdppc"].pipe(np.log)
xc["lngdptot"] = xc["gdptot"].pipe(np.log)

In [None]:
# Summary statistics for the outcome and the four GDP measures.
xc[["lifeexp", "gdppc", "gdptot", "lngdppc", "lngdptot"]].describe()


### Figure 8.3 The distribution of GDP per capita

(a) Histogram of GDP per capita

In [None]:
# Figure 8.3a: relative-frequency histogram of GDP per capita (bin width 3).
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="stat(count)/sum(stat(count))"))
    + geom_histogram(
        na_rm=True,
        show_legend=False,
        boundary=0,
        binwidth=3,
        fill="blue",
        color="white",
        alpha=0.8,
        size=0.25,
    )
    + labs(y="Percent", x="GDP per capita (thousand US dollars)")
    + expand_limits(x=0.01, y=0.01)
    + scale_x_continuous(
        breaks=np.arange(0, 121, 20),
        limits=(0, 120),
        expand=(0.01, 0.01),
    )
    + scale_y_continuous(
        breaks=np.arange(0, 0.201, 0.04),
        limits=(0, 0.2),
        expand=(0.0, 0.0),
        labels=percent_format(),
    )
    + theme_bw()
)


(b) Histogram of ln(GDP per capita)

In [None]:
# Figure 8.3b: relative-frequency histogram of ln(GDP per capita) (bin width 0.15).
(
    ggplot(data=xc, mapping=aes(x="lngdppc", y="stat(count)/sum(stat(count))"))
    + geom_histogram(
        na_rm=True,
        show_legend=False,
        boundary=0,
        binwidth=0.15,
        fill="blue",
        color="white",
        alpha=0.8,
        size=0.25,
    )
    + labs(y="Percent", x="ln(GDP per capita (thousand US dollars))")
    + expand_limits(x=0.01, y=0.01)
    + scale_x_continuous(
        breaks=np.arange(0, 5.1, 0.5),
        limits=(0, 5),
        expand=(0.01, 0.01),
    )
    + scale_y_continuous(
        labels=percent_format(),
        breaks=np.arange(0, 0.11, 0.02),
        limits=(0, 0.1),
        expand=(0.0, 0.0),
    )
    + theme_bw()
)

 LEVEL-LEVEL REGRESSION

In [None]:
# Level-level OLS: life expectancy on GDP per capita.
reg3 = smf.ols("lifeexp ~ gdppc", data=xc)
reg3.fit().summary()


### Figure 8.4 Life expectancy and GDP per capita

In [None]:
# Figure 8.4: scatterplot of life expectancy against GDP per capita with a
# linear ("lm") fit line drawn in red.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(color="blue")
    + geom_smooth(method="lm", se=False, color="red")
    + coord_cartesian(ylim=(50, 100), xlim=(0, 120))
    + expand_limits(x=0.01, y=0.01)
    + scale_x_continuous(
        breaks=np.arange(0, 121, 20),
        limits=(0, 120),
        expand=(0.01, 0.01),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 101, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        title="Relationship between GDP per capita (thousand US dollars)\nand Life expectancy (years)",
        x="GDP per capita (thousand US dollars)",
        y="Life expectancy  (years)",
    )
    + theme_bw()
)

LOG GDP PER CAPITA

In [None]:
# Level-log OLS: life expectancy on ln(GDP per capita).
reg4 = smf.ols("lifeexp ~ lngdppc", data=xc)
reg4.fit().summary()


### Figure 8.5 Life expectancy and GDP per capita

(a) Life expectancy and ln(GDP per capita)

In [None]:
# Figure 8.5a: life expectancy against the ln(GDP per capita) column,
# with a linear fit line.
(
    ggplot(data=xc, mapping=aes(x="lngdppc", y="lifeexp"))
    + geom_point(color="blue")
    + geom_smooth(method="lm", se=False, color="red")
    + coord_cartesian(ylim=(50, 85), xlim=(-0.5, 4.8))
    + scale_x_continuous(breaks=np.arange(-0.5, 4.6, 0.5))
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy  (years)",
        x="ln(GDP per capita, thousand US dollars) ",
    )
    + theme_bw()
)


(b) Life expectancy and GDP per capita on a ln scale
(axis labels show thousand US dollars)

In [None]:
# Figure 8.5b: same relationship, but plotting GDP per capita in levels on a
# log-transformed x axis so tick labels read in thousand dollars.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(color="blue")
    + geom_smooth(method="lm", se=False, color="red")
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        labels=log_format(),
        breaks=(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
        trans=log_trans(),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy  (years)",
        x="GDP per capita, thousand US dollars (ln scale) ",
    )
    + theme_bw()
)

### TOTAL GDP

Level-level regression

In [None]:
# Level-level OLS for the TOTAL GDP section: life expectancy on total GDP.
# The regressor must be `gdptot` (not `gdppc`): the per-capita level regression
# is already estimated elsewhere in the notebook, and the figure that follows
# this cell plots life expectancy against total GDP.
reg1 = smf.ols(formula="lifeexp ~ gdptot", data=xc)
reg1.fit().summary()


### Figure 8.6 Life expectancy and total GDP

(a) Life expectancy and total GDP

In [None]:
# Figure 8.6a: life expectancy against total GDP in levels, with a linear fit.
(
    ggplot(xc, aes(x="gdptot", y="lifeexp"))
    + geom_point(color="blue")
    + geom_smooth(color="red", method="lm", se=False)
    + coord_cartesian(xlim=(0, 24000), ylim=(50, 85))
    + expand_limits(x=0.01, y=0.01)
    + scale_x_continuous(
        expand=(0.01, 0.01),
        limits=(0, 24000),
        # np.arange excludes its stop value, so stop just past the upper limit
        # to get a tick at 24000 as well.
        breaks=np.arange(0, 24001, 4000),
    )
    + scale_y_continuous(
        expand=(0.01, 0.01),
        limits=(50, 85),
        # Stop at 86 so the 85 tick is included, as in the other figures.
        breaks=np.arange(50, 86, 5),
    )
    + labs(x="Total GDP  (billion US dollars)", y="Life expectancy  (years)")
    + theme_bw()
)


In [None]:
# Level-log OLS: life expectancy on ln(total GDP).
reg2 = smf.ols("lifeexp ~ lngdptot", data=xc)
reg2.fit().summary()


(b) Life expectancy and ln total GDP

In [None]:
# Figure 8.6b: life expectancy against total GDP, drawn on a log-transformed
# x axis, with a linear fit line.
(
    ggplot(xc, aes(x="gdptot", y="lifeexp"))
    + geom_point(color="blue")
    + geom_smooth(color="red", method="lm", se=False)
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        trans=log_trans(),
        breaks=(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 10000),
        labels=log_format(),
    )
    + scale_y_continuous(expand=(0.01, 0.01), breaks=(50, 55, 60, 65, 70, 75, 80, 85))
    # Axis label previously ended with an unbalanced extra ")".
    + labs(x="Total GDP (in ln scale)", y="Life expectancy  (years)")
    + theme_bw()
)

### GDP PER CAPITA PIECEWISE LINEAR SPLINE


In [None]:
# Spline knot in levels (same units as `gdppc`) and its natural log, which is
# the value referenced inside the spline regression formula.
cutoff, cutoff_ln = 50, np.log(50)

In [None]:
# Piecewise linear spline in ln(GDP per capita) with a single knot at cutoff_ln.
reg5 = smf.ols("lifeexp ~ lspline(lngdppc,cutoff_ln)", data=xc)
reg5.fit().summary()


In [None]:
# Fit the spline model once and reuse the results object for both columns,
# instead of re-estimating the same regression for each assignment.
reg5_results = reg5.fit()
xc["e3"] = reg5_results.resid  # residuals of the spline regression
xc["sppred"] = reg5_results.predict()  # in-sample fitted values


### Figure 8.7 Life expectancy and GDP per capita: scatterplot and nonlinear regression

(a) Piecewise linear spline

In [None]:
# Figure 8.7a: scatterplot with the spline predictions (`sppred`) drawn as a
# red line and the knot marked by a dotted green vertical line.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(color="blue")
    + geom_line(mapping=aes(x="gdppc", y="sppred"), data=xc, size=1, color="red")
    + geom_vline(xintercept=cutoff, linetype="dotted", size=0.5, color="green")
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        labels=log_format(),
        breaks=(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
        trans=log_trans(),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy  (years)",
        x="GDP per capita, thousand US dollars (ln scale) ",
    )
    + theme_bw()
)

### QUADRATIC IN LEVEL-LOG REGRESSION

In [None]:
# Add the squared log regressor, then estimate the quadratic level-log model.
xc["lngdppc_sq"] = xc["lngdppc"] ** 2
reg6 = smf.ols("lifeexp ~ lngdppc+lngdppc_sq", data=xc)
reg6.fit().summary()

In [None]:
# Store the residuals of the quadratic regression alongside the data.
reg6_resid = reg6.fit().resid
xc["e6"] = reg6_resid

(b) Quadratic function

In [None]:
# Figure 8.7b: scatterplot with a second-degree polynomial "lm" smoother.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(color="blue")
    + stat_smooth(
        method="lm",
        formula="y ~ poly(x,2)",
        se=False,
        size=1,
        color="red",
    )
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        labels=log_format(),
        breaks=(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
        trans=log_trans(),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy  (years)",
        x="GDP per capita, thousand US dollars (ln scale) ",
    )
    + theme_bw()
)

### WEIGHTED AND UNWEIGHTED REGRESSION

In [None]:
# Unweighted benchmark: OLS of life expectancy on ln(GDP per capita).
reg7 = smf.ols("lifeexp ~ lngdppc", data=xc)
reg7.fit().summary()


In [None]:
# Same specification estimated by WLS with population as the weight.
# Note: this rebinds `reg7`, replacing the unweighted model object.
reg7 = smf.wls("lifeexp ~ lngdppc", data=xc, weights=xc["population"])
reg7.fit().summary()


### Figure 8.9 Life expectancy and log GDP per capita: unweighted and weighted regressions

(a) Unweighted

In [None]:
# Figure 8.9a: unweighted linear fit on the log-scaled GDP per capita axis.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(color="blue")
    + stat_smooth(method="lm", se=False, size=1, color="red")
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        labels=log_format(),
        breaks=(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
        trans=log_trans(),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy  (years)",
        x="GDP per capita, thousand US dollars (ln scale) ",
    )
    + theme_bw()
)

(b) Weighted

In [None]:
# Figure 8.9b: point size mapped to population, linear fit weighted by
# population, and text labels placed at fixed plot coordinates.
(
    ggplot(data=xc, mapping=aes(x="gdppc", y="lifeexp"))
    + geom_point(
        mapping=aes(size="population"),
        data=xc,
        show_legend=False,
        alpha=0.6,
        color="blue",
    )
    + scale_fill_identity()
    + scale_color_identity()
    + geom_smooth(
        mapping=aes(weight="population"),
        method="lm",
        se=False,
        size=0.7,
        color="red",
    )
    + scale_size(range=(1, 20))
    + coord_cartesian(ylim=(50, 85))
    + scale_x_continuous(
        labels=log_format(),
        breaks=(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
        trans=log_trans(),
    )
    + scale_y_continuous(
        breaks=np.arange(50, 86, 5),
        expand=(0.01, 0.01),
    )
    + labs(
        y="Life expectancy (years)",
        x="GDP per capita, thousand US dollars (ln scale) ",
    )
    + theme_bw()
    + annotate("text", label="USA", x=70, y=80, size=10)
    + annotate("text", label="China", x=10, y=82, size=10)
    + annotate("text", label="India", x=7, y=63, size=10)
)