## Linear Regression on Continents' Average Life Expectancy

#### Life Expectancy:

Life expectancy at birth indicates the number of years a newborn infant would live if prevailing patterns of mortality at the time of its birth were to stay the same throughout its life.

#### Development Relevance:
Mortality rates for different age groups (infants, children, and adults) and overall mortality indicators (life expectancy at birth or survival to a given age) are important indicators of health status in a country. Because data on the incidence and prevalence of diseases are frequently unavailable, mortality rates are often used to identify vulnerable populations. And they are among the indicators most frequently used to compare socioeconomic development across countries.

#### Limitations:
Annual data series from United Nations Population Division's World Population Prospects are interpolated data from 5-year period data. Therefore they may not reflect real events as much as observed data.


Steps:
 1. gather continent-wise data
 2. format and visualize
 3. split data
 4. fit model
 5. calculate  $r^{2}$ , mean absolute error, root mean squared error


In [237]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import sklearn as sk
import seaborn as sns
import pycountry #country codes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale #data scaling
from sklearn import decomposition #PCA
from sklearn import linear_model #linear model

#library for plots
import plotly
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


# Series_Code values used to select the life-expectancy rows for each group
# (total, male, female) from the population dataset.
# NOTE(review): these look like World Bank indicator codes — confirm against
# the data source.
TOTAL_SC = "SP.DYN.LE00.IN"
MALE_SC = "SP.DYN.LE00.MA.IN"
FEMALE_SC = "SP.DYN.LE00.FE.IN"

In [238]:
# load data
def loadData():
    """Read the population CSV and return it as a cleaned DataFrame.

    The file is read from ``../Datasets/Population Data 1960-2050.csv``.
    Every cell equal to ``'..'`` is replaced with NaN, and the first column
    of the file is dropped.

    Returns:
        pandas.DataFrame: the cleaned table without its leading column.
    """
    raw = pd.read_csv('../Datasets/Population Data 1960-2050.csv', low_memory=False)
    with_nans = raw.replace('..', np.nan)
    # Keep all rows, skip the leading column.
    return with_nans.iloc[:, 1:]

In [239]:
# missing data check
def checkForMissingData(df):
    """Display a heatmap showing where ``df`` has missing values.

    The null mask is transposed before plotting, so the columns of ``df``
    run along the vertical axis of the heatmap.

    Args:
        df: DataFrame to inspect.
    """
    plt.figure()
    null_mask = df.isnull().transpose()
    colorbar_options = {'label': 'Missing Data'}
    sns.heatmap(null_mask, cmap="YlGnBu", cbar_kws=colorbar_options)
    plt.title("Missing data across dataset")
    plt.show()

In [240]:
# add country code
def getCountryNonCountryData(pop_data):
    """Split the population table into country rows and all other rows.

    A row counts as a country when its ``Country_Code`` is one of the
    alpha-3 codes listed by ``pycountry.countries``.

    Args:
        pop_data: DataFrame with a ``Country_Code`` column.

    Returns:
        tuple: ``(country_data, non_country_data)``. ``country_data`` keeps
        every column of ``pop_data``; ``non_country_data`` has its first
        three columns removed.
    """
    # A set is enough for membership tests; ordering of the codes is irrelevant.
    iso3_codes = {country.alpha_3 for country in pycountry.countries}

    # Build the mask once and reuse it for both halves of the split.
    is_country = pop_data["Country_Code"].isin(iso3_codes)

    country_data = pop_data[is_country]
    # Drop the first three columns for non-nation rows (per the original
    # note these hold continent codes, which are NA for such rows anyway).
    non_country_data = pop_data[~is_country].iloc[:, 3:]
    return country_data, non_country_data

In [241]:
def getAvgLifeExpectency(country_data, seriescode):
    """Return the rows of ``country_data`` whose ``Series_Code`` matches.

    Args:
        country_data: DataFrame with a ``Series_Code`` column.
        seriescode: series code to keep (e.g. a life-expectancy series).

    Returns:
        pandas.DataFrame: the matching rows, with their original index.
    """
    matches_series = country_data["Series_Code"] == seriescode
    return country_data[matches_series]

In [242]:
#remove unwanted columns
def formatAvgLifeExpectency(df):
    """Drop descriptive columns and make the remaining data columns numeric.

    ``Continent_Name``, ``Country_Number``, ``Series_Name`` and
    ``Series_Code`` are removed. Every column from the fourth one onward in
    the reduced frame is converted with ``pd.to_numeric``, and any leading or
    trailing ``[`` / ``]`` characters are stripped from all column labels.

    Args:
        df: DataFrame containing the four columns listed above.

    Returns:
        pandas.DataFrame: a new, reformatted frame; ``df`` is left untouched.
    """
    unwanted = ['Continent_Name', 'Country_Number', 'Series_Name', 'Series_Code']
    trimmed = df.drop(columns=unwanted)

    # The first three remaining columns are identifiers; the rest hold values.
    value_columns = trimmed.columns[3:]
    trimmed[value_columns] = trimmed[value_columns].apply(pd.to_numeric)

    trimmed.columns = trimmed.columns.str.strip("[]")
    return trimmed

In [246]:
#pivot data
def pivotAvgLifeExpectency(df, seriescode):
    """Reshape wide per-country rows into one long (year, value) table.

    Every column of ``df`` other than ``Continent_Code``, ``Country_Code``
    and ``Country_Name`` is treated as a year column; its label must be
    convertible to ``int``. For each country code (in order of first
    appearance) one block of rows is produced, one row per year column,
    taking the values from that country's first row in ``df``.

    When ``seriescode`` equals ``TOTAL_SC``, ``MALE_SC`` or ``FEMALE_SC`` the
    result is also written to the matching CSV under ``../Datasets``.

    Args:
        df: wide DataFrame with ``Country_Code``, ``Country_Name`` and year
            columns.
        seriescode: series code that decides which CSV (if any) is written.

    Returns:
        pandas.DataFrame: columns ``year``, ``avg_life_exp``,
        ``country_code`` and ``Country_Name``. The index restarts at 0 for
        each country. An empty frame with these columns is returned (and no
        file is written) when ``df`` has no countries.

    Raises:
        KeyError: if ``Country_Code`` or ``Country_Name`` is missing.
        ValueError: if a year column label cannot be converted to ``int``.
    """
    id_columns = ('Continent_Code', 'Country_Code', 'Country_Name')
    output_columns = ['year', 'avg_life_exp', 'country_code', 'Country_Name']

    # Take the years from the data itself so every country is handled the
    # same way, whatever range the dataset happens to cover.
    year_columns = [col for col in df.columns if col not in id_columns]
    years = pd.Index(year_columns).astype(int)

    frames = []
    # sort=False keeps countries in order of first appearance, which makes the
    # output order reproducible from run to run.
    for code, rows in df.groupby('Country_Code', sort=False):
        frames.append(pd.DataFrame({
            'year': years,
            'avg_life_exp': rows[year_columns].iloc[0].to_numpy(),
            'country_code': code,
            # Filled for every country, not just the first one processed.
            'Country_Name': rows['Country_Name'].iloc[0],
        }))

    if not frames:
        # Nothing to pivot: hand back an empty table and leave the CSVs alone.
        return pd.DataFrame(columns=output_columns)

    combined_df = pd.concat(frames)

    csv_paths = {
        TOTAL_SC: r'../Datasets/LifeExpectencyCountryTotal2020_2060.csv',
        MALE_SC: r'../Datasets/LifeExpectencyCountryMale2020_2060.csv',
        FEMALE_SC: r'../Datasets/LifeExpectencyCountryFemale2020_2060.csv',
    }
    csv_path = csv_paths.get(seriescode)
    if csv_path is not None:
        combined_df.to_csv(csv_path, index=False)

    return combined_df
        


In [247]:
# plot data
def plotSliderMap(df, typename):
    """Render a world choropleth of life expectancy with a year slider.

    One choropleth trace is built per distinct value of ``df['year']`` (in
    ascending order). Only the first trace starts visible, matching the
    slider's initial position; each slider step shows exactly one trace and
    is labelled with that trace's actual year.

    Args:
        df: DataFrame with ``year``, ``country_code`` and ``avg_life_exp``
            columns.
        typename: text inserted into the figure title (e.g. ``'Total'``).
    """
    years = sorted(df['year'].unique())

    data_slider = []
    for position, year in enumerate(years):
        df_year = df[df['year'] == year]
        data_slider.append(dict(
            type='choropleth',
            # Values are passed as strings, as before; only the two columns
            # the trace actually uses are converted.
            locations=df_year['country_code'].astype(str),
            z=df_year['avg_life_exp'].astype(str),
            colorscale="greens",
            # Without this every trace is drawn at once and the map does not
            # match the slider until it is first moved.
            visible=(position == 0),
        ))

    steps = []
    for position, year in enumerate(years):
        visibility = [False] * len(years)
        visibility[position] = True
        steps.append(dict(method='restyle',
                          args=['visible', visibility],
                          label='Year {}'.format(year)))  # label shown for this step

    sliders = [dict(active=0, pad={"t": 1}, steps=steps)]

    layout = dict(title_text='Average Life Expectency '+ typename+ ' 1960-2060',
                  geo=dict(scope='world',
                           showcountries=True,
                           projection={'type': 'equirectangular'}),
                  sliders=sliders)

    fig = dict(data=data_slider, layout=layout)
    plotly.offline.iplot(fig)

        
    

In [249]:
def main():
    """Run the pipeline: load, filter, format, pivot and plot each series.

    The total, male and female life-expectancy series are processed stage by
    stage (all filtered, then all formatted, then all pivoted) and finally
    plotted in that order.
    """
    pop_data = loadData()

    # Only the country rows are used below; the non-country rows are ignored.
    country_data, _non_country_data = getCountryNonCountryData(pop_data)

    series_codes = (TOTAL_SC, MALE_SC, FEMALE_SC)
    series_labels = ('Total', 'Male', 'Female')

    # Select the life-expectancy rows for each series.
    selected = [getAvgLifeExpectency(country_data, code) for code in series_codes]

    # Drop descriptive columns and convert the values to numbers.
    formatted = [formatAvgLifeExpectency(frame) for frame in selected]

    # Reshape to long form (this stage also writes the CSV files).
    pivoted = [
        pivotAvgLifeExpectency(frame, code)
        for frame, code in zip(formatted, series_codes)
    ]

    # Plot one slider map per series.
    for frame, label in zip(pivoted, series_labels):
        plotSliderMap(frame, label)


if __name__ == "__main__":
    main()