# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import math
import seaborn as sns

## Load Clean Data

### Population

In [2]:
pop = pd.read_csv('../data/pop_clean.csv', index_col=0)

pop.head()

Unnamed: 0,Afghanistan,Angola,Albania,Andorra,United Arab Emirates,Argentina,Armenia,Antigua and Barbuda,Australia,Austria,...,Uzbekistan,St. Vincent and the Grenadines,Venezuela,Vietnam,Vanuatu,Samoa,Yemen,South Africa,Zambia,Zimbabwe
1800,3280000.0,1570000.0,400000.0,2650.0,40200.0,534000.0,413000.0,37000.0,200000.0,3000000.0,...,1920000.0,25800.0,1000000.0,4000000.0,27800.0,47300.0,2590000.0,1450000.0,747000.0,1090000.0
1801,3280000.0,1570000.0,402000.0,2650.0,40200.0,520000.0,413000.0,37000.0,205000.0,3020000.0,...,1920000.0,25800.0,978000.0,4100000.0,27800.0,47300.0,2590000.0,1450000.0,758000.0,1090000.0
1802,3280000.0,1570000.0,404000.0,2650.0,40200.0,506000.0,413000.0,37000.0,211000.0,3040000.0,...,1920000.0,25800.0,957000.0,4200000.0,27800.0,47300.0,2590000.0,1460000.0,770000.0,1090000.0
1803,3280000.0,1570000.0,405000.0,2650.0,40200.0,492000.0,413000.0,37000.0,216000.0,3050000.0,...,1920000.0,25800.0,936000.0,4310000.0,27800.0,47300.0,2590000.0,1460000.0,782000.0,1090000.0
1804,3280000.0,1570000.0,407000.0,2650.0,40200.0,479000.0,413000.0,37000.0,222000.0,3070000.0,...,1920000.0,25800.0,916000.0,4410000.0,27800.0,47300.0,2590000.0,1470000.0,794000.0,1090000.0


In [3]:
countries_pop = pop.columns.tolist()

### Life Expectency

In [4]:
le = pd.read_csv('../data/le_clean.csv', index_col=0)

le.head()

Unnamed: 0,Afghanistan,Angola,Albania,United Arab Emirates,Argentina,Armenia,Antigua and Barbuda,Australia,Austria,Azerbaijan,...,Uzbekistan,St. Vincent and the Grenadines,Venezuela,Vietnam,Vanuatu,Samoa,Yemen,South Africa,Zambia,Zimbabwe
1800,28.2,27.0,35.4,30.7,33.2,34.0,33.5,34.0,34.4,29.2,...,26.9,26.0,32.2,32.0,24.3,25.4,23.4,33.5,32.6,33.7
1801,28.2,27.0,35.4,30.7,33.2,34.0,33.5,34.0,34.4,29.2,...,26.9,26.0,32.2,32.0,24.3,25.4,23.4,33.5,32.6,33.7
1802,28.2,27.0,35.4,30.7,33.2,34.0,33.5,34.0,34.4,29.2,...,26.9,26.0,32.2,32.0,24.3,25.4,23.4,33.5,32.6,33.7
1803,28.2,27.0,35.4,30.7,33.2,34.0,33.5,34.0,34.4,29.2,...,26.9,26.0,32.2,32.0,24.3,25.4,23.4,33.5,32.6,33.7
1804,28.2,27.0,35.4,30.7,33.2,34.0,33.5,34.0,34.4,29.2,...,26.9,26.0,32.2,32.0,24.3,25.4,23.4,33.5,32.6,33.7


In [5]:
countries_le = le.columns.tolist()

### GNI per Capita

In [6]:
gni = pd.read_csv('../data/gni_clean.csv', index_col=0)

gni.head()

Unnamed: 0,Afghanistan,Angola,Albania,United Arab Emirates,Argentina,Armenia,Antigua and Barbuda,Australia,Austria,Azerbaijan,...,Uzbekistan,St. Vincent and the Grenadines,Venezuela,Vietnam,Vanuatu,Samoa,Yemen,South Africa,Zambia,Zimbabwe
1800,207.0,517.0,207.0,738.0,794.0,111.0,650.0,763.0,1940.0,33.0,...,199.0,546.0,459.0,226.0,416.0,373.0,197.0,800.0,213.0,443.0
1801,207.0,519.0,207.0,740.0,797.0,111.0,650.0,769.0,1950.0,33.0,...,199.0,546.0,457.0,226.0,417.0,373.0,198.0,791.0,214.0,444.0
1802,207.0,522.0,207.0,743.0,799.0,111.0,650.0,773.0,1960.0,33.0,...,199.0,546.0,454.0,226.0,417.0,373.0,198.0,782.0,215.0,444.0
1803,207.0,524.0,207.0,746.0,802.0,111.0,650.0,777.0,1970.0,33.0,...,199.0,546.0,452.0,226.0,417.0,373.0,199.0,773.0,215.0,445.0
1804,207.0,525.0,207.0,749.0,805.0,111.0,650.0,782.0,1970.0,33.0,...,199.0,546.0,450.0,226.0,417.0,373.0,199.0,765.0,215.0,445.0


In [7]:
countries_gni = gni.columns.tolist()

## Countries in Dataset

In [8]:
countries = list(set(countries_pop) & set(countries_le) & set(countries_gni))

print(len(countries))

184


## Normalized Change

To compare changes in life expectancy, population, and gni per capita for different countries and account for their starting values:  I'm going to use a 'normalized change'.

I'm going to take the difference between a final value and an initial value and divide by the initial value. If the normalized change in life expectancy is $1.20$, the life expectancy in that country has increased by $120\%$. If the initial life expectancy was $40$ years, people are now living $40 \times 1.2 = 48$ years longer with a new life expectancy of $40 + 48 = 88$ years.

$$ \text{normalized change} 
= 
\frac{ \langle\text{final value}\rangle - \langle\text{initial value}\rangle }{ \langle\text{initial value} \rangle }
$$

where

$$
\langle \text{final value} \rangle = \text{average life expectancy for the last twenty years of data}
$$
$$
\langle \text{initial value} \rangle = \text{average life expantancy for the first twenty years of data}
$$

I decided to average over twenty years of data to account for any short term historic events (like local wars) in a small subset of countries.

In [9]:
def norm_compare(x):
    i_value = x.head(20).mean() # average value for first 20 years
    f_value = x.tail(20).mean() # average value for last 20 years
    return (f_value - i_value) / i_value

In [11]:
norm_change=[]

for c in countries:
    norm_change.append([c, norm_compare(pop[c]), norm_compare(le[c]), norm_compare(gni[c])])

In [15]:
df = pd.DataFrame(data = norm_change, columns=['country', 'norm_pop', 'norm_le', 'norm_gni'])

In [14]:
df.set_index('country')

Unnamed: 0_level_0,norm_pop,norm_le,norm_gni
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Armenia,3.340879,1.503382,70.247748
Burundi,40.772785,1.498571,0.651286
Lebanon,14.513574,1.887542,10.456235
Grenada,3.136054,1.632643,22.665523
Brunei,186.047201,1.853938,91.732586
...,...,...,...
Malaysia,139.916217,1.759477,40.617647
Dominican Republic,104.391950,1.800669,63.722753
Czech Republic,0.984601,1.523143,62.115032
Qatar,292.404255,1.750487,143.231747


In [16]:
df.to_csv('../data/norm_change.csv')