# Correlation between life expectancy and fertility globally 1960-2015

## Outline
- load fertility, life expectancy and population from gapminder 
- slice columns 1960-2015
- merge them together via stack and unstack 
- create an individual scatter plot and 3D bar chart for each year
- concatenate them together into respective GIFs

### Loading and cleaning up the data

In [None]:
import pandas as pd
import numpy as np
import pylab as plt
from matplotlib.lines import Line2D
import seaborn as sns 
import imageio
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

In [1]:
# Read the three Gapminder tables, using the first column of each file as the
# row index, plus the semicolon-separated country/continent lookup table.
fert = pd.read_csv("gapminder_total_fertility.csv", index_col=0)
life = pd.read_excel("gapminder_lifeexpectancy.xlsx", index_col=0)
pop = pd.read_excel("gapminder_population.xlsx", index_col=0)
cont = pd.read_csv("continents.csv", sep=";")

In [2]:
# Report the (rows, columns) shape of every loaded table.
for label, frame in (('life', life), ('fert', fert), ('pop', pop), ('cont', cont)):
    print(f'{label} df: {frame.shape!s}')

life df: (260, 217)
fert df: (260, 216)
pop df: (275, 81)
cont df: (194, 2)


In [3]:
# Show the column labels of every table so their dtypes can be compared.
for label, frame in (('life', life), ('fert', fert), ('pop', pop), ('cont', cont)):
    print(f'{label} columns: {frame.columns!s}')

life columns: Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016],
           dtype='int64', length=217)
fert columns: Index(['1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808',
       '1809',
       ...
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015'],
      dtype='object', length=216)
pop columns: Int64Index([1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900,
            1910, 1920, 1930, 1940, 1950, 1951, 1952, 1953, 1954, 1955, 1956,
            1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
            1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
            1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
            1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
            2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008

In [None]:
# The fertility CSV is read with string column labels ('1800', '1801', ...)
# while the other tables use integers, so convert the labels to int to allow
# all three tables to be sliced with the same list of years.
colNames=[int(x) for x in fert.columns]
# Assign the labels directly: the `inplace` argument of DataFrame.set_axis was
# deprecated in pandas 1.5 and removed in pandas 2.0.
fert.columns=colNames

In [4]:
# Confirm that the fertility column labels are now integers.
print(f'fert columns: {fert.columns!s}')

fert columns: Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015],
           dtype='int64', length=216)


In [5]:
# Key the continent lookup by country so it can later be joined on the index.
cont = cont.set_index(keys='country')

In [None]:
# Years of interest: 1960 through 2015 inclusive.
targetCols = list(range(1960, 2016))

# Keep only those year columns in each table.
fert2 = fert[targetCols]
life2 = life[targetCols]
pop2 = pop[targetCols]

In [6]:
# Report the shape of each table after restricting it to the target years.
for label, frame in (('life', life2), ('fert', fert2), ('pop', pop2)):
    print(f'{label} df: {frame.shape!s}')

life df: (260, 56)
fert df: (260, 56)
pop df: (275, 56)


### Merging fertility, population and life expectancy together via stack and unstack 

In [7]:
# Stack each table so the year columns become an inner index level, giving one
# long Series per indicator.
sfert2, slife2, spop2 = (table.stack() for table in (fert2, life2, pop2))

In [8]:
# Align the three stacked Series on their shared index, one column per
# indicator, then stack again so the indicator name becomes the innermost
# index level.
d = dict(fertility=sfert2, lifeExp=slife2, population=spop2)
df1 = pd.DataFrame(d)
df2 = df1.stack()

### Creating individual plots for 1960-2015
- Note: the population data is min-max normalized (and scaled by 5000) so that the plots properly show how a country's population relates to the correlation between life expectancy and fertility.

#### Normalization formula (scaled by 5000):
$$5000 \cdot \frac{x-x_{min}}{x_{max}-x_{min}}$$

In [9]:
import os

# Make sure the folder for the per-year frames exists; plt.savefig does not
# create missing directories.
os.makedirs('scatt4gif', exist_ok=True)

# Move the year level of df2's index into the columns. This does not depend on
# the loop variable, so it is computed once instead of on every iteration.
byYear=df2.unstack(1)

# Save one scatter plot of life expectancy against fertility per year.
for i in targetCols:
    # Select year i and spread the innermost index level into columns
    # ('fertility', 'lifeExp', 'population' are used below).
    df3=byYear[i].unstack(1)
    # Min-max normalize the population and scale by 5000; used as the marker
    # size (`s`) of the scatter plot.
    df3['popNorm']=((df3['population']-df3['population'].min())/
                    (df3['population'].max()-df3['population'].min()))*5000
    # Attach the continent of every country (join on the country index).
    df4=pd.merge(cont, df3, left_index=True, right_index=True)
    # One colour per continent, in order of first appearance.
    colLab=df4['continent'].unique()
    colVal=sns.color_palette('hls', 8)
    colMap=dict(zip(colLab, colVal))
    df4.plot.scatter('fertility', 'lifeExp', s=df4['popNorm'], alpha=0.5, figsize=(12,8),
                     c=df4['continent'].map(colMap))
    plt.title('Life Expectancy vs Fertility in ' + str(i))
    plt.ylabel('life expectancy')
    # Fixed axis limits keep every frame of the animation on the same scale.
    plt.axis([0,10,0,90])
    # Build exactly one legend handle per mapped continent. The previous
    # range(len(colLab)-1) produced one handle too few, so the last continent
    # was missing from the legend.
    customLines=[Line2D([0],[0], color=colour, lw=4) for colour in colMap.values()]
    plt.legend(customLines, list(colMap.keys()), loc='best')
    plt.savefig(f'scatt4gif/scatter{i}.png')
    plt.close()

In [10]:
# Read the per-year scatter frames in chronological order and write them out
# as a single animated GIF.
framePaths = [f'scatt4gif/scatter{year}.png' for year in targetCols]
images = [imageio.imread(path) for path in framePaths]
imageio.mimsave('scatter.gif', images, fps=10)

In [9]:
import os

# Make sure the folder for the per-year frames exists; plt.savefig does not
# create missing directories.
os.makedirs('bar3Dgif', exist_ok=True)

# Move the year level of df2's index into the columns. This does not depend on
# the loop variable, so it is computed once instead of on every iteration.
byYear=df2.unstack(1)

# Save one 3D bar chart per year: fertility on x, life expectancy on y and
# normalized population as the bar height.
for i in targetCols:
    # Select year i and spread the innermost index level into columns
    # ('fertility', 'lifeExp', 'population' are used below).
    df3=byYear[i].unstack(1)
    # Min-max normalize the population and scale by 5000; used as bar height.
    df3['popNorm']=((df3['population']-df3['population'].min())/
                    (df3['population'].max()-df3['population'].min()))*5000
    # Attach the continent of every country (join on the country index).
    df4=pd.merge(cont, df3, left_index=True, right_index=True)
    fig = plt.figure(figsize=(12, 8))
    ax = plt.axes(projection='3d')
    numBar=len(df4)
    # Bar anchor positions: each bar starts at z=0 at (fertility, lifeExp).
    xPos=df4['fertility']
    yPos=df4['lifeExp']
    zPos=np.zeros(numBar)
    # Bar extents: a 1x1 footprint with the normalized population as height.
    xSize=np.ones(numBar)
    ySize=np.ones(numBar)
    zSize=df4['popNorm']
    # One colour per continent, in order of first appearance.
    colLab=df4['continent'].unique()
    colVal=sns.color_palette('hls', 8)
    colMap=dict(zip(colLab, colVal))
    ax.bar3d(xPos, yPos, zPos, xSize, ySize, zSize, color=df4['continent'].map(colMap))
    ax.set_xlabel('fertility')
    ax.set_ylabel('life expectancy')
    ax.set_zlabel('population')
    # Fixed x/y limits keep every frame of the animation on the same scale.
    ax.set_xlim(0, 10)
    ax.set_ylim(0,90)
    ax.set_title('Life Expectancy vs Fertility in ' + str(i))
    # Build exactly one legend handle per mapped continent. The previous
    # range(len(colLab)-1) produced one handle too few, so the last continent
    # was missing from the legend.
    customLines=[Line2D([0],[0], color=colour, lw=4) for colour in colMap.values()]
    ax.legend(customLines, list(colMap.keys()), loc='best')
    plt.savefig(f'bar3Dgif/bar3D{i}.png')
    plt.close()

In [10]:
import imageio

# Read the per-year 3D bar frames in chronological order and write them out
# as a single animated GIF.
framePaths = [f'bar3Dgif/bar3D{year}.png' for year in targetCols]
images = [imageio.imread(path) for path in framePaths]
imageio.mimsave('bar3D.gif', images, fps=10)