In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../../DATA/led.csv')

In [4]:
df.head().transpose()

Unnamed: 0,0,1,2,3,4
Country,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan
Year,2015,2014,2013,2012,2011
Status,Developing,Developing,Developing,Developing,Developing
Lifeexpectancy,65,59.9,59.9,59.5,59.2
AdultMortality,263,271,268,272,275
infantdeaths,62,64,66,69,71
Alcohol,0.01,0.01,0.01,0.01,0.01
percentageexpenditure,71.2796,73.5236,73.2192,78.1842,7.09711
HepatitisB,65,62,64,67,68
Measles,1154,492,430,2787,3013


In [5]:
df['Mortality'] = 1 - (1-df['infantdeaths']/1000) * (1-df['under-fivedeaths']/1000) * (1-df['AdultMortality']/1000) 

In [9]:
df['Immunization'] = df.apply(lambda X: min(X['HepatitisB']/100, X['Polio']/100, X['Diphtheria']/100), axis=1)

In [10]:
df['Disease'] = 1 - (1-df['Measles']/1000) * (1-df['HIV/AIDS']/1000)

In [11]:
df.columns

Index(['Country', 'Year', 'Status', 'Lifeexpectancy', 'AdultMortality',
       'infantdeaths', 'Alcohol', 'percentageexpenditure', 'HepatitisB',
       'Measles', 'BMI', 'under-fivedeaths', 'Polio', 'Totalexpenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness1-19years',
       'thinness5-9years', 'Incomecompositionofresources', 'Schooling',
       'Mortality', 'Immunization', 'Disease'],
      dtype='object')

In [12]:
df = df[['Country', 'Year', 'Lifeexpectancy', 'Mortality', 'Totalexpenditure', 'Immunization', 'Disease']]

In [13]:
df.columns

Index(['Country', 'Year', 'Lifeexpectancy', 'Mortality', 'Totalexpenditure',
       'Immunization', 'Disease'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,Country,Year,Lifeexpectancy,Mortality,Totalexpenditure,Immunization,Disease
0,Afghanistan,2015,65.0,0.366072,8.16,0.06,1.153985
1,Afghanistan,2014,59.9,0.376338,8.18,0.58,0.492051
2,Afghanistan,2013,59.9,0.37716,8.13,0.62,0.430057
3,Afghanistan,2012,59.5,0.385264,8.52,0.67,2.786821
4,Afghanistan,2011,59.2,0.391807,7.87,0.68,3.012799


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 7 columns):
Country             2938 non-null object
Year                2938 non-null int64
Lifeexpectancy      2928 non-null float64
Mortality           2928 non-null float64
Totalexpenditure    2712 non-null float64
Immunization        2385 non-null float64
Disease             2938 non-null float64
dtypes: float64(5), int64(1), object(1)
memory usage: 160.8+ KB


In [19]:
df['Year'].unique()

array([2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005,
       2004, 2003, 2002, 2001, 2000])

In [22]:
df = df.groupby('Country').mean().drop('Year', axis=1)

In [23]:
df

Unnamed: 0_level_0,Lifeexpectancy,Mortality,Totalexpenditure,Immunization,Disease
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,58.19375,0.398749,8.252500,0.482500,2.362114
Albania,75.15625,0.046623,5.945625,0.977500,0.053470
Algeria,73.61875,0.146842,4.604000,0.780000,1.943781
Angola,49.01875,0.467314,3.919333,0.561111,3.555669
AntiguaandBarbuda,75.05625,0.127500,4.791333,0.966000,0.000125
...,...,...,...,...,...
Venezuela(BolivarianRepublicof),73.38750,0.179755,4.998667,0.538125,0.165083
VietNam,74.77500,0.182990,5.977333,0.875385,4.232583
Yemen,63.86250,0.281984,5.005333,0.545625,2.761011
Zambia,53.90625,0.407646,5.824000,0.481818,6.476690


In [25]:
from sklearn.preprocessing import StandardScaler

In [29]:
scaler = StandardScaler()

In [30]:
df_scaled = scaler.fit_transform(df)

In [34]:
df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

In [35]:
df_scaled

Unnamed: 0_level_0,Lifeexpectancy,Mortality,Totalexpenditure,Immunization,Disease
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,-1.204107,1.301881,1.008619,-1.270617,0.008165
Albania,0.647432,-1.072585,-0.037475,1.140599,-0.285843
Algeria,0.479607,-0.396782,-0.645859,0.178548,-0.045110
Angola,-2.205603,1.764229,-0.956333,-0.887691,0.160165
AntiguaandBarbuda,0.636517,-0.527210,-0.560909,1.084580,-0.292636
...,...,...,...,...,...
Venezuela(BolivarianRepublicof),0.454364,-0.174841,-0.466890,-0.999660,-0.271629
VietNam,0.605817,-0.153027,-0.023096,0.643180,0.246370
Yemen,-0.585335,0.514512,-0.463867,-0.963126,0.058965
Zambia,-1.672108,1.361877,-0.092628,-1.273938,0.532159


In [36]:
df_scaled.corr()

Unnamed: 0,Lifeexpectancy,Mortality,Totalexpenditure,Immunization,Disease
Lifeexpectancy,1.0,-0.797103,0.290713,0.456931,-0.200518
Mortality,-0.797103,1.0,-0.277774,-0.448449,0.58291
Totalexpenditure,0.290713,-0.277774,1.0,-0.069219,-0.158519
Immunization,0.456931,-0.448449,-0.069219,1.0,-0.224033
Disease,-0.200518,0.58291,-0.158519,-0.224033,1.0


In [42]:
from sklearn.linear_model import LinearRegression

In [37]:
def find_correlation(df, var1, var2, *args):
    "Find the correlation between two variables conditional on other variables."
    features = [var1, var2]
    features.extend(list(args))
    df_new = df[features]
    df_new = df_new.dropna()
    X = df_new.drop(var1, axis=1)
    y = df_new[[var1]]
    
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)
    scaled_y = scaler.fit_transform(y)
    
    model = LinearRegression()
    model.fit(scaled_X, scaled_y)
    return model.coef_[0,0]

In [39]:
df.columns

Index(['Lifeexpectancy', 'Mortality', 'Totalexpenditure', 'Immunization',
       'Disease'],
      dtype='object')

In [43]:
find_correlation(df, 'Immunization', 'Totalexpenditure')

-0.06921858672161625

In [45]:
find_correlation(df, 'Immunization', 'Totalexpenditure', 'Lifeexpectancy')

0.034064132280311556