In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [39]:
who = pd.read_csv('Life Expectancy Data.csv') #2000/2015
who.head(17) 

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
5,Afghanistan,2010,Developing,58.8,279.0,74,0.01,79.679367,66.0,1989,16.7,102,66.0,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2
6,Afghanistan,2009,Developing,58.6,281.0,77,0.01,56.762217,63.0,2861,16.2,106,63.0,9.42,63.0,0.1,445.893298,284331.0,18.6,18.7,0.434,8.9
7,Afghanistan,2008,Developing,58.1,287.0,80,0.03,25.873925,64.0,1599,15.7,110,64.0,8.33,64.0,0.1,373.361116,2729431.0,18.8,18.9,0.433,8.7
8,Afghanistan,2007,Developing,57.5,295.0,82,0.02,10.910156,63.0,1141,15.2,113,63.0,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
9,Afghanistan,2006,Developing,57.3,295.0,84,0.03,17.171518,64.0,1990,14.7,116,58.0,7.43,58.0,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1


In [45]:
#who.loc[who[' BMI ']<5, 'Country']

### who.columns

In [4]:
#there are spaces in column names, gets tricky when I want to call them. rename them:
who.rename(columns={'Life expectancy ':'Life expectancy', 'Measles ':'Measles', ' BMI ':'BMI',
                    'under-five deaths ':'under-five deaths', 'Diphtheria ':'Diphtheria',
                    ' HIV/AIDS':'HIV/AIDS', ' thinness  1-19 years':'thinness  1-19 years',
                    ' thinness 5-9 years':'thinness 5-9 years','percentage expenditure':'% health financing',
                    'Total expenditure':'Total health financing'}, inplace=True)

In [5]:
who.shape #(2938, 22) #life expectancy is the target. 20 predictors (drop countries? population?)
who.dtypes #everything is already in ordinal except for Status (categorical)

Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
% health financing                 float64
Hepatitis B                        float64
Measles                              int64
BMI                                float64
under-five deaths                    int64
Polio                              float64
Total health financing             float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
Population                         float64
thinness  1-19 years               float64
thinness 5-9 years                 float64
Income composition of resources    float64
Schooling                          float64
dtype: object

In [6]:
who.Status.value_counts() #looks great, only 2 categories (get_dummies)

Developing    2426
Developed      512
Name: Status, dtype: int64

In [7]:
who.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', '% health financing', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio',
       'Total health financing', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
       'thinness  1-19 years', 'thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [8]:
print(who['BMI'].min(), who['BMI'].max())
print(who['Income composition of resources'].min(), who['Income composition of resources'].max()) #percentager

1.0 87.3
0.0 0.948


In [9]:
#deal with missing values
who.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
% health financing                   0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total health financing             226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [10]:
print('Proportions of missing values in:')
print(f"Alcohol {round((194/2938)*100, 2)}%")
print(f"Hepatitis B {round((553/2938)*100, 2)}%")
print(f"Total health financing {round((226/2938)*100, 2)}%")
print(f"GDP {round((448/2938)*100, 2)}%")
print(f"Population {round((652/2938)*100, 2)}%")
print(f"Income composition of resources {round((167/2938)*100, 2)}%")
print(f"Schooling {round((163/2938)*100, 2)}%")

Proportions of missing values in:
Alcohol 6.6%
Hepatitis B 18.82%
Total health financing 7.69%
GDP 15.25%
Population 22.19%
Income composition of resources 5.68%
Schooling 5.55%


In [11]:
who.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', '% health financing', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio',
       'Total health financing', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
       'thinness  1-19 years', 'thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [12]:
#drop Total health financing, I have a column with the percentage of it already
#drop Countries and Population (Population is related)
#drop Hepatitis B (almost 20% of Nans)
#also drop Year, no need
#in the end: drop rows with Nans

In [13]:
who.drop(columns=['Total health financing','Country','Population','Hepatitis B','Year'], inplace=True)

In [14]:
who.shape #(2938, 18)

(2938, 17)

In [15]:
who_clean = who.dropna(axis=0)

In [16]:
who_clean.shape #2304

(2304, 17)

In [17]:
who_corr = who.corr()
heatmap = who_corr.style.background_gradient(cmap='RdYlGn',axis=None).set_precision(2)
heatmap #nice correlations

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,% health financing,Measles,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Life expectancy,1.0,-0.7,-0.2,0.4,0.38,-0.16,0.57,-0.22,0.47,0.48,-0.56,0.46,-0.48,-0.47,0.72,0.75
Adult Mortality,-0.7,1.0,0.079,-0.2,-0.24,0.031,-0.39,0.094,-0.27,-0.28,0.52,-0.3,0.3,0.31,-0.46,-0.45
infant deaths,-0.2,0.079,1.0,-0.12,-0.086,0.5,-0.23,1.0,-0.17,-0.18,0.025,-0.11,0.47,0.47,-0.15,-0.19
Alcohol,0.4,-0.2,-0.12,1.0,0.34,-0.052,0.33,-0.11,0.22,0.22,-0.049,0.35,-0.43,-0.42,0.45,0.55
% health financing,0.38,-0.24,-0.086,0.34,1.0,-0.057,0.23,-0.088,0.15,0.14,-0.098,0.9,-0.25,-0.25,0.38,0.39
Measles,-0.16,0.031,0.5,-0.052,-0.057,1.0,-0.18,0.51,-0.14,-0.14,0.031,-0.076,0.22,0.22,-0.13,-0.14
BMI,0.57,-0.39,-0.23,0.33,0.23,-0.18,1.0,-0.24,0.28,0.28,-0.24,0.3,-0.53,-0.54,0.51,0.55
under-five deaths,-0.22,0.094,1.0,-0.11,-0.088,0.51,-0.24,1.0,-0.19,-0.2,0.038,-0.11,0.47,0.47,-0.16,-0.21
Polio,0.47,-0.27,-0.17,0.22,0.15,-0.14,0.28,-0.19,1.0,0.67,-0.16,0.21,-0.22,-0.22,0.38,0.42
Diphtheria,0.48,-0.28,-0.18,0.22,0.14,-0.14,0.28,-0.2,0.67,1.0,-0.16,0.2,-0.23,-0.22,0.4,0.43


In [18]:
who_clean.dtypes

Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
% health financing                 float64
Measles                              int64
BMI                                float64
under-five deaths                    int64
Polio                              float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
thinness  1-19 years               float64
thinness 5-9 years                 float64
Income composition of resources    float64
Schooling                          float64
dtype: object

In [19]:
who = pd.get_dummies(who_clean, drop_first=True)
who.Status_Developing.value_counts()

1    1881
0     423
Name: Status_Developing, dtype: int64

In [20]:
who.head()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,% health financing,Measles,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Status_Developing
0,65.0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,65.0,0.1,584.25921,17.2,17.3,0.479,10.1,1
1,59.9,271.0,64,0.01,73.523582,492,18.6,86,58.0,62.0,0.1,612.696514,17.5,17.5,0.476,10.0,1
2,59.9,268.0,66,0.01,73.219243,430,18.1,89,62.0,64.0,0.1,631.744976,17.7,17.7,0.47,9.9,1
3,59.5,272.0,69,0.01,78.184215,2787,17.6,93,67.0,67.0,0.1,669.959,17.9,18.0,0.463,9.8,1
4,59.2,275.0,71,0.01,7.097109,3013,17.2,97,68.0,68.0,0.1,63.537231,18.2,18.2,0.454,9.5,1


In [21]:
who.dtypes

Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
% health financing                 float64
Measles                              int64
BMI                                float64
under-five deaths                    int64
Polio                              float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
thinness  1-19 years               float64
thinness 5-9 years                 float64
Income composition of resources    float64
Schooling                          float64
Status_Developing                    uint8
dtype: object

In [22]:
who.describe()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,% health financing,Measles,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Status_Developing
count,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0,2304.0
mean,69.338455,161.506944,31.644965,4.623472,939.44686,2413.818576,38.097222,43.99566,82.679688,82.578125,1.976302,7617.916013,4.861545,4.903125,0.63099,12.126172,0.816406
std,9.712894,127.875825,129.477633,4.026808,2202.564497,11312.768375,19.816297,176.302342,23.096802,23.381987,5.626433,14535.338226,4.528618,4.621498,0.21317,3.306716,0.387237
min,36.3,1.0,0.0,0.01,0.0,0.0,1.4,0.0,3.0,2.0,0.1,1.68135,0.1,0.1,0.0,0.0,0.0
25%,63.2,69.0,0.0,0.895,35.841481,0.0,18.975,0.0,78.0,79.0,0.1,454.891745,1.6,1.6,0.491,10.1,1.0
50%,72.2,137.0,2.0,3.965,155.315287,15.0,43.5,3.0,93.0,93.0,0.1,1759.357808,3.3,3.3,0.683,12.4,1.0
75%,76.0,224.0,19.0,7.5925,641.192963,353.25,55.9,24.0,97.0,97.0,0.9,5945.671748,7.1,7.2,0.787,14.425,1.0
max,89.0,723.0,1800.0,17.87,19479.91161,212183.0,77.1,2500.0,99.0,99.0,50.6,119172.7418,27.7,28.6,0.948,20.7,1.0


In [23]:
#split in train and test set
y = who['Life expectancy']
X = who.drop(['Life expectancy'], axis = 1)

# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2, random_state=29)

In [24]:
regr = LinearRegression()
model = regr.fit(X_std, y)
acc = model.score(X_test,y_test)*100
print(f"Model (Linear Regression) Test Accuracy {round(acc, 2)}%")

Model (Linear Regression) Test Accuracy 82.19%
