DSE200x Mini Project
# Q: What Does Influence Life Expectancy the Most?
What does help us to live longer or to die earlier?
Among hundreds of indicators in the data set, let's find the ones most tightly bound to life expectancy.

For the Life Expectancy - I took `Life expectancy at birth, total (years)` indicator, average for both genders.<br>
For the measure of statistical relationship I took the coefficient of determination, by countries and years, no aggregation. <br>

For each of 1344 indicators available I calculated as follows:
1. Filtered `country-year-value` data for the life expectancy
1. Filtered `country-year-value` data for the indicator being tested
1. Inner-joined both by equality of country and year
1. Calculated the correlation
1. Stored the indicator name, provided its correlation is at least 0.5 in absolute value. See the full list at the bottom.

Then I listed the indicators sorted by absolute value of correlation, large to small.
The calculation takes roughly half an hour.

In [None]:
# Soft part:
#  imported libraries
#  helper data processing functions
#  helper visualization functions

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from math import copysign

def get_indicator(df, indicatorCode):
    """Return the country/year/value rows of ``df`` for a single indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``IndicatorCode``, ``CountryCode``, ``Year`` and ``Value``
        columns.
    indicatorCode
        Value matched (by equality) against the ``IndicatorCode`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``CountryCode``, ``Year`` and
        ``Value``; the original row index is kept.
    """
    wanted = df['IndicatorCode'] == indicatorCode
    return df.loc[wanted, ['CountryCode', 'Year', 'Value']]

def get_two_indicators(df, ind1, ind2):
    """Pair up the values of two indicators by country and year.

    Both indicators are extracted with ``get_indicator`` and inner-joined on
    ``CountryCode`` and ``Year``, so only country/year combinations present
    for both indicators survive.

    Returns
    -------
    pandas.DataFrame
        Columns ``CountryCode``, ``Year``, ``Value1`` (from ``ind1``) and
        ``Value2`` (from ``ind2``).
    """
    left, right = (get_indicator(df, code) for code in (ind1, ind2))
    joined = pd.merge(
        left,
        right,
        how='inner',
        on=['CountryCode', 'Year'],
        suffixes=('1', '2'),
        copy=False,
    )
    return joined[['CountryCode', 'Year', 'Value1', 'Value2']]

def countries_coorelation(df, iX, iY):
    """Correlate two indicators separately within each country.

    The two indicators are paired by country and year via
    ``get_two_indicators``; for every country with more than two paired
    observations the correlation between the paired values is computed
    with ``Series.corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Indicator data accepted by ``get_two_indicators``.
    iX, iY
        Indicator codes of the two series to compare.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying country with columns ``Correlation``,
        ``CountryCode`` and ``Count`` (number of paired observations).
        Rows are ordered by country code and indexed from 1.
    """
    data = get_two_indicators(df, iX, iY)
    # Collect all rows first and build the frame once: growing a DataFrame
    # with ``.loc[i] = ...`` re-allocates on every insert.  groupby also
    # yields countries in sorted order, so the output is reproducible.
    rows = [
        (group.Value1.corr(group.Value2), country, len(group))
        for country, group in data.groupby('CountryCode')
        if len(group) > 2  # skip countries with too few paired points
    ]
    return pd.DataFrame(
        rows,
        columns=['Correlation', 'CountryCode', 'Count'],
        index=pd.RangeIndex(1, len(rows) + 1),  # keep the 1-based row labels
    )
    
def years_coorelation(df, iX, iY):
    """Correlate two indicators separately within each year.

    The two indicators are paired by country and year via
    ``get_two_indicators``; for every year with more than two paired
    observations the correlation between the paired values is computed
    with ``Series.corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Indicator data accepted by ``get_two_indicators``.
    iX, iY
        Indicator codes of the two series to compare.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying year with columns ``Correlation``, ``Year``
        and ``Count`` (number of paired observations).  Rows are ordered by
        year and indexed from 1.
    """
    data = get_two_indicators(df, iX, iY)
    # Group on the loop's own year value (the previous code compared against
    # an undefined name) and build the result frame in a single step.
    rows = [
        (group.Value1.corr(group.Value2), year, len(group))
        for year, group in data.groupby('Year')
        if len(group) > 2  # skip years with too few paired points
    ]
    return pd.DataFrame(
        rows,
        columns=['Correlation', 'Year', 'Count'],
        index=pd.RangeIndex(1, len(rows) + 1),  # keep the 1-based row labels
    )
    
def indicatorName(i):
    """Look up the ``IndicatorName`` for series code ``i`` in the ``ser`` table.

    The first matching row is used; an ``IndexError`` is raised when no row
    has ``SeriesCode == i``.
    """
    names = ser.loc[ser['SeriesCode'] == i, 'IndicatorName']
    return names.iloc[0]

def coutryName(code):
    """Look up the ``ShortName`` for country ``code`` in the ``countries`` table.

    The first row whose ``CountryCode`` equals ``code`` is used; an
    ``IndexError`` is raised when there is no such row.
    """
    # Filter on the ``code`` argument (the body previously referenced an
    # undefined name and could never run).
    return countries[countries.CountryCode == code].iloc[0].ShortName

def scatter_color(df, iX, iY):
    """Scatter-plot indicator ``iX`` against ``iY`` with a fitted straight line.

    Points are the country/year pairs returned by ``get_two_indicators`` and
    are coloured by year (with a colour bar).  A degree-1 ``np.polyfit`` line
    is drawn in red, and the title shows the share of variance of ``iY``
    reproduced by that line together with the correlation of the two series.

    Returns
    -------
    The object returned by ``plt.scatter``.
    """
    pairs = get_two_indicators(df, iX, iY)
    xs, ys = pairs.Value1, pairs.Value2

    # Point cloud on figure 1, one colour per year.
    canvas = plt.figure(1)
    canvas.set_size_inches(14, 5)
    points = plt.scatter(xs, ys, c=pairs.Year, marker=',', s=6, alpha=0.25)
    plt.xlabel(indicatorName(iX))
    plt.ylabel(indicatorName(iY))
    plt.colorbar(points)

    # Straight-line fit, drawn across the full range of x values.
    slope, intercept = np.polyfit(xs, ys, 1)
    span = np.array([min(xs), max(xs)])
    plt.plot(span, span * slope + intercept, 'r-', linewidth=4)

    # Variance of the fitted values relative to the variance of y.
    fitted = xs * slope + intercept
    r_sq = fitted.var() / ys.var()
    caption = 'Determination {:.0f}% , corr={:.2f}'.format(r_sq * 100, xs.corr(ys))
    plt.title(caption, loc='right')

    plt.grid()

    return points

def demo_indicators(iX, iY):
    """Show the two standard charts for a pair of indicators.

    First the year-coloured scatter plot from ``scatter_color``; then a
    20-bin histogram of the per-country correlations from
    ``countries_coorelation``, each expressed as a signed squared
    correlation in percent (range -100..100).  Both use the global ``ind``
    frame.
    """
    def _signed_r2(r):
        # Squared correlation that keeps the sign of the correlation.
        return copysign(r ** 2, r)

    # Chart 1: all country/year points.
    scatter_color(ind, iX, iY)
    plt.show()

    # Chart 2: distribution of per-country fit quality (NaN rows ignored).
    per_country = countries_coorelation(ind, iX, iY)
    signed_pct = per_country.Correlation.dropna().apply(_signed_r2) * 100
    plt.hist(signed_pct, bins=20)
    plt.xlim(-100, 100)
    plt.xlabel('determination, %')
    plt.ylabel('# conutries')
    plt.grid()
    plt.show()

In [None]:
# load data
# Paths are relative to the working directory (../input/).
# Indicators.csv: only the four columns used by the helper functions are read.
ind = pd.read_csv('../input/Indicators.csv', usecols=['CountryCode','IndicatorCode','Year','Value'])
# Series.csv: used by indicatorName() to map SeriesCode -> IndicatorName.
ser = pd.read_csv('../input/Series.csv')
# Country.csv: used to map CountryCode -> ShortName.
countries = pd.read_csv('../input/Country.csv')

In [None]:
# Core of the calculation - looping through indicators to find those matching the best

def best_matching_indicators(i1):
    """Print every indicator's correlation with indicator ``i1``.

    For each indicator code in the global ``ind`` frame, its values are
    inner-joined with those of ``i1`` on country and year and the
    correlation of the paired values is computed.  The complete list of
    ``(correlation, indicator code)`` tuples is printed, strongest absolute
    correlation first, followed by one line per indicator whose absolute
    correlation exceeds 0.5 (code and correlation rounded to 2 decimals).

    Parameters
    ----------
    i1
        Indicator code to compare all other indicators against.
    """
    keys = ['CountryCode', 'Year']
    cols = keys + ['Value']
    data1 = ind.loc[ind.IndicatorCode == i1, cols]

    corrs = []
    # A single groupby pass hands us each indicator's rows; filtering the
    # whole frame once per indicator code is what made this loop slow.
    for i2, rows in ind.groupby('IndicatorCode'):
        data = pd.merge(data1, rows[cols], on=keys, suffixes=('i1', 'i2'), copy=False)
        corrs.append((data.Valuei1.corr(data.Valuei2), i2))

    def strongest_first(entry):
        # NaN (no overlap / constant data) cannot be ordered by value, and
        # using it directly as a sort key leaves the order undefined; push
        # such entries to the end instead.
        cor = entry[0]
        if pd.isna(cor):
            return (True, 0.0)
        return (False, -abs(cor))

    corrs.sort(key=strongest_first)
    print(corrs)
    for cr, i in corrs:
        if abs(cr) > 0.5:  # False for NaN, so undefined correlations are skipped
            print(i, round(cr, 2))

# best_matching_indicators('SP.DYN.LE00.IN') # Life expectancy at birth, total (years)

# The Best Matchers
|   | Corr | Determination | Indicator Code * | Indicator Name |
|---|--------------|-----|--------------------|--------------------|
| <img src="https://image.flaticon.com/icons/svg/190/190205.svg" height=30 width=30 /> | -0.88 | 77% | SP.DYN.CBRT.IN | Birth rate, crude (per 1,000 people) |
| <img src="https://image.flaticon.com/icons/svg/0/422.svg" height=30 width=30 /> | +0.87 | 76% | SE.SEC.NENR | School enrollment, secondary (% net) |
| <img src="https://image.flaticon.com/icons/svg/498/498227.svg" height=30 width=30 /> | -0.75 | 56% | NV.AGR.TOTL.ZS | Agriculture, value added (% of GDP) |
| <img src="https://image.flaticon.com/icons/svg/263/263924.svg" height=30 width=30 />  | -0.75 | 56% | EG.USE.CRNW.ZS | Combustible renewables and waste (% of total energy) |
| <img src="https://image.flaticon.com/icons/svg/134/134954.svg" height=30 width=30>  | +0.72 | 52% | IT.MLT.MAIN.P2 | Fixed telephone subscriptions (per 100 people) |
| <img src="https://image.flaticon.com/icons/svg/138/138281.svg" height=30 width=30> | +0.64 | 41% | NE.CON.PRVT.PC.KD | Household final consumption expenditure per capita (constant 2005 US$) |
and more

Note the first five indicators correlate better than wealth, the sixth row.<br>
Wealth only explains 41% of difference in life expectancy.

\* Boringly correlating indicators are filtered out (such as life expectancy by categories, mortality rates, etc.) <br>
\* Icons made by Freepik, Roundicons, Smashicons from [www.flaticon.com](http://www.flaticon.com)

## Birth rate, crude (per 1,000 people)
Surprisingly and sadly, birth rate correlates negatively with life expectancy nearly as strongly as the various mortality rates (see the full list at the bottom).

The histogram of correlations is also impressive. Out of 247 countries:
* 100 countries fit nearly perfect, 80% to 100%
* majority of others are 50% to 80%
* a handful of countries have weak positive correlation, below 30%
* there are no countries with positive moderate or strong correlation

Both domain and range are wide, making the tendency even more convincing.

Open questions for possible further investigation:
* what makes the exceptional countries with a moderate positive relationship different?
* for each birth rate value there is still a gap of about 20 years between top and bottom. What makes those countries different?

In [None]:
# Birth rate, crude (per 1,000 people) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('SP.DYN.CBRT.IN', 'SP.DYN.LE00.IN')

## School enrollment, secondary (% net)
The biggest surprise of the paper. It is the top indicator among those which cannot be attributed to sickness or mortality, either directly or indirectly.<br>
There is a number of similar indicators fitting about the same, e.g. different level school enrollments. I will describe only this one.

The indicator is available for 203 countries only. Out of these:
* 110, the majority fit perfect 80% to 100%
* most other fit at about 75%
* about 20 have weak positive or negative relationship, -25% to +25%
* few have negative correlation

Gives a fresh look at the life-long learning perspective.<br>
An admission level of 80% or more assures at least 65 years of life expectancy, no matter whether it is the 1970s or the 2010s.<br>
And vice versa - admission of 20% or less guarantees for expectancy under 65.

In [None]:
# Number of distinct country codes that have any secondary school enrollment (SE.SEC.NENR) data.
print( "Countries involved: {}".format(len(set(ind[ind.IndicatorCode == 'SE.SEC.NENR'].CountryCode))) )

In [None]:
# School enrollment, secondary (% net) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('SE.SEC.NENR', 'SP.DYN.LE00.IN')

## Agriculture, value added (% of GDP)
Looking at the spread width and not-as-good correlation,
 I suggest this is merely a coincidence of two global trends:
 1. steady reduction of the agriculture fraction in world GDP
 1. steady increase in the worldwide average of life expectancy
 
 As both naturally change slowly, they are doomed to fit statistically.
 
Relative abundance of old years dots in the lower-left prompts the smaller contribution of agriculture is no guarantee for a better life expectancy.<br>
50% of agriculture and more assures life expectancy under 55, however in recent years there is a handful of countries reaching 50% (see yellow dots)<br>
On the other hand lower agriculture contribution says little - expectancy can be anywhere between 40 and 75 years.

In [None]:
# Agriculture, value added (% of GDP) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('NV.AGR.TOTL.ZS', 'SP.DYN.LE00.IN')

## Combustible renewables and waste (% of total energy)
Wide spread of 45 to 75 years at almost entire domain of indicator values.<br>
Country histogram is dreadful - comparable number of countries have strong negative and strong positive correlation.

Overall the indicator is not that definitive, and mostly controversial.<br>
This is rather an artifact of several global trends coinciding.

Or, further investigation is needed to find the criterion that tells positively related countries from negatively related ones.

In [None]:
# Combustible renewables and waste (% of total energy) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('EG.USE.CRNW.ZS', 'SP.DYN.LE00.IN')

## Fixed telephone subscriptions (per 100 people)
Graph and histogram say indicator is positively and strongly bound to the life expectancy,
 however it is far from linear.
 
 Roughly, all the countries with at least 10 phone landlines per 100 people have at least 65 years life expectancy.<br>
 Fewer than 10 lines tell nothing - it can be anywhere between 40 and 80 years.
 
 Interestingly enough the tendency is still there in 2010s (yellow dots)<br>
 Did anyone hear about the age of wireless phones?
 
 Open question: is it only about calling for ambulance in time?

In [None]:
# Fixed telephone subscriptions (per 100 people) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('IT.MLT.MAIN.P2', 'SP.DYN.LE00.IN')

## Wholesale Price Index (2010 = 100)
Correlation and country histogram look good

In [None]:
# Wholesale Price Index (2010 = 100) vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('FP.WPI.TOTL', 'SP.DYN.LE00.IN')

## Household final consumption expenditure per capita (constant 2005 US \$ )
Love is not the only thing money cannot buy you :)

Almost anything above 3,000\$ per capita annually is all the same game - between 70 and 80 years.<br>
Also yellow/purple points in the high wage area shows it is years passing rather than wealth level that improves the result.

On the other hand area of under 3,000\$ and over 70 years is densely populated.<br>
Sort of being poor is not being condemned.

In [None]:
# Household final consumption expenditure per capita vs. life expectancy at birth: scatter plot, then per-country histogram.
demo_indicators('NE.CON.PRVT.PC.KD', 'SP.DYN.LE00.IN')

Please take a look below at the per-country examples for how diverse the dependency is.<br>
Though country-wise histogram looks good enough.

In [None]:
# Per-country scatter plots of household consumption per capita (x) against
# life expectancy (y), one panel per entry of the list below.
# row and column sharing
f, sub_plots = plt.subplots(4, 4, sharex='all', sharey='all')
f.set_size_inches(14,14)
# Flatten the 4x4 grid of axes into a plain list, row by row, so panels can
# be consumed one at a time with pop(0).
sub_plots = [ x for a in sub_plots for x in a ]

# 16 single-code lists, one per panel; each entry is a list so that
# isin() and the "+"-joined title below work for any number of codes.
for cList in [
    ['WLD'],['ARE'],['TUR'],['SYR'],
    ['BGR'],['BRB'],['BGD'],['BHR'],
    ['AUS'],['USA'],['UKR'],['CHE'],
    ['GBR'],['MKD'],['RUS'],['JPN']]:
    # Restrict the indicator data to the selected code(s) before pairing.
    d = get_two_indicators(
        ind[ind.CountryCode.isin(cList)],
        'NE.CON.PRVT.PC.KD', 'SP.DYN.LE00.IN')
    # Take the next unused panel (this empties sub_plots as the loop runs).
    p = sub_plots.pop(0)
    # Points are coloured by year, as in scatter_color().
    p.scatter(d.Value1, d.Value2, c=d.Year, marker=',', s=4)
    # Panel title: short name(s) looked up in the countries table.
    p.set_title("+".join(countries[countries.CountryCode.isin(cList)].ShortName), loc='left')

# All Indicators Correlating at Least 50%
Either positively or negatively

| Indicator Code | Correlation with life expectancy |
|----------|----------|
| SP.DYN.TO65.FE.ZS | 0.99 |
| SP.DYN.TO65.MA.ZS | 0.97 |
| SP.DYN.AMRT.FE | -0.95 |
| SH.DYN.MORT | -0.94 |
| SP.DYN.IMRT.IN | -0.94 |
| SP.DYN.AMRT.MA | -0.92 |
| SP.DYN.CBRT.IN | -0.88 |
| SE.SEC.NENR.FE | 0.87 |
| SE.SEC.NENR.MA | 0.87 |
| SE.SEC.NENR | 0.87 |
| SE.SEC.ENRR.FE | 0.87 |
| SE.SEC.ENRR | 0.86 |
| SP.DYN.CDRT.IN | -0.85 |
| SP.DYN.TFRT.IN | -0.85 |
| SE.SEC.ENRR.MA | 0.85 |
| SE.SEC.CMPT.LO.FE.ZS | 0.84 |
| SH.STA.MMRT.NE | -0.83 |
| SE.PRM.PRSL.FE.ZS | 0.83 |
| SE.PRM.TENR.FE | 0.82 |
| SE.SEC.CMPT.LO.ZS | 0.82 |
| SE.PRM.CMPT.FE.ZS | 0.82 |
| SE.PRM.CMPT.ZS | 0.81 |
| SE.PRM.TENR | 0.81 |
| SE.PRM.NENR | 0.81 |
| SE.PRM.PRSL.ZS | 0.8 |
| SE.SEC.CMPT.LO.MA.ZS | 0.8 |
| SE.PRM.NENR.FE | 0.79 |
| SP.DYN.CONU.ZS | 0.79 |
| SE.PRM.PRSL.MA.ZS | 0.79 |
| SE.PRM.TENR.MA | 0.78 |
| SE.PRM.CMPT.MA.ZS | 0.77 |
| SP.ADO.TFRT | -0.77 |
| SP.POP.0014.TO.ZS | -0.77 |
| SE.PRM.ENRL.TC.ZS | -0.76 |
| SP.POP.DPND.YG | -0.76 |
| SE.ADT.LITR.ZS | 0.76 |
| SE.PRM.PRS5.FE.ZS | 0.76 |
| SE.ADT.LITR.FE.ZS | 0.76 |
| SE.PRM.NENR.MA | 0.75 |
| NV.AGR.TOTL.ZS | -0.75 |
| SE.ADT.LITR.MA.ZS | 0.75 |
| EG.USE.CRNW.ZS | -0.75 |
| SP.RUR.TOTL.ZS | -0.74 |
| SP.URB.TOTL.IN.ZS | 0.74 |
| SP.POP.1564.TO.ZS | 0.74 |
| SE.PRM.PRS5.ZS | 0.74 |
| SE.PRM.TCHR.FE.ZS | 0.74 |
| SP.POP.DPND | -0.73 |
| SE.PRM.PRS5.MA.ZS | 0.73 |
| SE.PRE.ENRR.FE | 0.72 |
| SE.SEC.PROG.FE.ZS | 0.72 |
| IT.MLT.MAIN.P2 | 0.72 |
| SE.PRE.ENRR.MA | 0.72 |
| SE.PRE.ENRR | 0.71 |
| SE.SEC.ENRL.GC.FE.ZS | 0.71 |
| SE.TER.ENRR.MA | 0.71 |
| SE.SEC.PROG.ZS | 0.7 |
| SE.ENR.PRSC.FM.ZS | 0.7 |
| SE.TER.ENRR | 0.7 |
| SE.PRM.REPT.FE.ZS | -0.69 |
| SE.ENR.SECO.FM.ZS | 0.69 |
| SE.TER.ENRR.FE | 0.68 |
| SH.STA.STNT.ZS | -0.68 |
| SE.SEC.ENRL.FE.ZS | 0.68 |
| SE.SEC.PROG.MA.ZS | 0.68 |
| SH.STA.STNT.MA.ZS | -0.68 |
| SE.ENR.PRIM.FM.ZS | 0.68 |
| SE.SEC.TCHR.FE.ZS | 0.67 |
| SP.POP.65UP.TO.ZS | 0.67 |
| SE.PRM.REPT.ZS | -0.67 |
| SE.PRM.REPT.MA.ZS | -0.65 |
| SP.POP.DPND.OL | 0.65 |
| NE.CON.PRVT.PC.KD | 0.64 |
| NY.ADJ.NNTY.PC.KD | 0.64 |
| SH.STA.STNT.FE.ZS | -0.63 |
| SH.MED.PHYS.ZS | 0.63 |
| NY.GNP.PCAP.KD | 0.63 |
| SE.PRM.ENRR.FE | 0.62 |
| SE.PRM.ENRL.FE.ZS | 0.6 |
| EG.USE.COMM.FO.ZS | 0.58 |
| FP.WPI.TOTL | 0.58 |
| NY.GDP.PCAP.KD | 0.58 |
| SE.ENR.TERT.FM.ZS | 0.57 |
| NY.ADJ.NNTY.PC.CD | 0.57 |
| IT.NET.USER.P2 | 0.57 |
| NV.SRV.TETC.ZS | 0.57 |
| SE.XPD.PRIM.ZS | -0.57 |
| SE.SEC.ENRL.TC.ZS | -0.56 |
| EG.USE.ELEC.KH.PC | 0.56 |
| NV.MNF.FBTO.ZS.UN | -0.55 |
| NY.GNP.PCAP.CD | 0.54 |
| TX.VAL.MANF.ZS.UN | 0.54 |
| SH.STA.MALN.ZS | -0.54 |
| SE.SEC.REPT.FE.ZS | -0.53 |
| SH.STA.MALN.MA.ZS | -0.53 |
| SE.PRM.ENRR | 0.53 |
| SE.TER.TCHR.FE.ZS | 0.53 |
| NY.GDP.PCAP.CD | 0.52 |
| NV.MNF.MTRN.ZS.UN | 0.5 |
| EG.USE.PCAP.KG.OE | 0.5 |

# The End
Thank you for reading!