In [None]:
import os
import pandas as pd
import numpy as np
import IPython
import seaborn as sns

import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
%matplotlib inline

In [None]:
# List the working directory, then the dataset folder, to confirm the input files are where the loaders expect them.
print(os.listdir())
os.listdir('life-expectancy-who')

In [None]:
# Country -> continent lookup table (decoded as ISO-8859-1, spaces after delimiters skipped).
continent = pd.read_csv('life-expectancy-who/countryContinent.csv', skipinitialspace=True, encoding = "ISO-8859-1")

# Main life-expectancy table.
# NOTE(review): parse_dates=True without an index_col presumably only targets the default index,
# so it likely has no effect here — confirm and drop if so.
df = pd.read_csv('life-expectancy-who/Life Expectancy Data.csv', parse_dates=True)

In [None]:
continent['continent'].value_counts()

In [None]:
# Build the country -> continent lookup in a single vectorised pass instead of
# re-filtering the whole frame once per row. keep='first' reproduces the
# original behaviour of taking the first matching row for a repeated country.
continent_dict = (
    continent.drop_duplicates(subset='country', keep='first')
    .set_index('country')['continent']
    .to_dict()
)

In [None]:
# Hand-written continent assignments, merged into the lookup (an existing
# entry with the same country name is overwritten).
update = {
    'Republic of Korea': 'Asia',
    'Republic of Moldova': 'Europe',
    'Democratic Republic of the Congo': 'Africa',
    'Czechia': 'Europe',
    'United Republic of Tanzania': 'Africa',
    'The former Yugoslav republic of Macedonia': 'Europe',
    'Democratic Peoples Republic of Korea': 'Asia',
}

continent_dict.update(update)

In [None]:
# Derive the continent column straight from the country names; replace()
# already returns a new Series, so no intermediate copy is needed.
df['continent'] = df['Country'].replace(continent_dict)
print(df['continent'].value_counts())
df.head()

In [None]:
# Normalise the column labels: str.split() discards leading/trailing and
# repeated whitespace, and the remaining words are joined with underscores.
new_columns = ['_'.join(label.split()) for label in df.columns]
df.columns = new_columns


df.columns

In [None]:
# Checking the information of the data
df.info()

In [None]:
#Checking the statistical information of the numerical dataset

df.describe()

In [None]:
# Checking the distribution of missing values in the data. df.isnull() is a boolean mask, so with the
# 'viridis' colormap missing cells (True) take the colour at the top of the scale and present cells the bottom.

plt.figure(figsize=(8,4))
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap = 'viridis')

### OOPS!!!

Even our target column (Life_expectancy) has missing values; we need to look into this.

------------------
------------------

In [None]:
df[df['Life_expectancy'].isnull()]
#df.head()

These countries don't look very familiar, so let's check how many times each of them appears in the dataset.

In [None]:
df.Country.value_counts().tail(15)

##### They appeared only once, pheeeew!

Well it is safe to drop them. We will be dropping them after dealing with the other missing data using interpolation.

In [None]:
# Fill gaps in the numeric features (the target and Adult_Mortality are left alone).
col = list(df.select_dtypes(include=['int64', 'float64']).columns)
col.remove('Life_expectancy')
col.remove('Adult_Mortality')
# Interpolate each feature along its own country's series. The previous
# interpolate(axis=1) worked across the columns of a row, i.e. it filled a
# missing value from whatever unrelated features happened to sit next to it.
# limit_direction='both' also fills gaps at either end of a country's series;
# a feature that is missing for every row of a country stays NaN.
df[col] = df.groupby('Country')[col].transform(
    lambda series: series.interpolate(limit_direction='both')
)

In [None]:
df.shape

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
# NOTE(review): with ascending=True the tail holds the *most* frequent countries; if the goal is to
# confirm that the rarely-seen countries are gone, head(15) is presumably what was meant — confirm.
df.Country.value_counts(ascending=True).tail(15)

In [None]:
df.tail()

In [None]:
df[['Country', 'Status']]=df[['Country', 'Status']].astype('category')

In [None]:
#report = ProfileReport(df)
#report.to_file('neppp.html')

# Graphical Exploratory Data Analysis

In [None]:
# Correlation heatmap (in percent) between the numeric features of one country

def heatmap(data, country):
    """Plot the pairwise feature correlations, scaled to percent, for one country.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a 'Country' column plus the features to correlate.
    country : str
        Value of the 'Country' column whose rows are used.

    Raises
    ------
    AssertionError
        If no row of `data` belongs to `country`.
    """
    # Use the frame that was passed in rather than the notebook-level `df`.
    rows = data[data['Country'] == country]
    assert len(rows) > 0, 'country not found in dataset'
    # Correlate numeric/bool columns only: calling .corr() on a frame that still
    # holds text or categorical columns raises on recent pandas versions.
    correlation = rows.select_dtypes(include=['number', 'bool']).corr() * 100
    plt.figure(figsize=(12,7))
    sns.heatmap(correlation, annot = True, cbar=False, cmap = 'viridis')

In [None]:
heatmap(df, 'Russian Federation')

In [None]:
#Visualizing the average of a feature by continent for a given year
def feature_cpy(year, feature, style=None):
    """Bar-plot the per-continent mean of `feature` for a single `year`.

    Reads the notebook-level frame `df`, which must contain 'Year',
    'continent' and the requested feature column.

    Parameters
    ----------
    year : int
        Value of the 'Year' column to plot.
    feature : str
        Name of the numeric column to average.
    style : str, optional
        Seaborn style name; 'darkgrid' is used when omitted.
    """
    plt.figure(figsize=(8,6))
    # Average only the requested column so that text/categorical columns never
    # reach mean(), then keep the slice for the requested year.
    yearly_mean = (
        df.groupby(['Year', 'continent'])[feature]
        .mean()
        .loc[year]
        .reset_index()
    )
    sns.set_style('darkgrid' if style is None else style)
    # Keyword arguments: newer seaborn releases reject positional x/y.
    sns.barplot(data=yearly_mean, x='continent', y=feature)
    plt.title(f'A plot of average {feature} in {year} per continent', fontsize=30)

feature_cpy(2010, 'GDP')

In [None]:
# 2015 rows whose GDP exceeds 100000, selected with a single combined mask.
df[(df['Year'] == 2015) & (df['GDP'] > 100000)]

In [None]:
#Plan to make this more interactive using Bokeh

# Life expectancy per country in 2000: marker area grows with GDP and the
# colour encodes the continent.
plt.figure(figsize=(20,10))
year_2000 = df[df['Year'] == 2000]  # filter once instead of once per column
x = year_2000['Country']
y = year_2000['Life_expectancy']
s = year_2000['GDP']
cmap = year_2000['continent']
# One integer code per continent (replaces the quadratic list.index lookup).
a = pd.factorize(cmap)[0]
sns.set_style('white')
plt.scatter(x,y, s=s**0.7, alpha=0.3, c=a, cmap='tab10')
plt.xticks(rotation=90)
plt.xlabel('Countries')
# The y values are life expectancy; GDP only drives the marker size.
plt.ylabel('Life expectancy');
#plt.legend('tab10', list(cmap.value_counts().index))

----------------

In [None]:
def plot(data, col, x_val, y_val, specific, save=False, save_name=None):
    '''Draw y_val against x_val for the rows of one category.

    data : dataframe containing the data
    col : categorical column in the dataset
    x_val : the column to put on the x axis of the plot
    y_val : the column to put on the y axis of the plot
    specific : the category in the categorical column to restrict the plot to
    save (optional) : Boolean, determines if the plot should be saved to file or not
    save_name (required when save is True) : file name passed to plt.savefig

    Raises ValueError when save is True but no save_name is given.'''
    # Fail before any drawing happens instead of inside plt.savefig(None).
    if save and save_name is None:
        raise ValueError('save_name must be provided when save=True')
    # Local name chosen so it does not shadow the notebook-level `df`.
    subset = data[data[col] == specific]
    x = subset[x_val]
    y = subset[y_val]
    plt.grid(True)
    plt.plot(x, y, linestyle = '--')
    plt.scatter(x, y, c='r', marker='x')
    plt.xlabel(x_val)
    plt.ylabel(y_val)
    plt.title(f'{x_val} against {y_val} ({specific})', fontsize=20)
    if save:
        plt.savefig(save_name)
    plt.show()

In [None]:
plot(df,'Country', 'Year', 'GDP','Nigeria')#, save=True, save_name='ife')

In [None]:
df['Life_expectancy'].max()

In [None]:
sns.countplot(x='Status', data=df)

The Status column is imbalanced: there are far more Developing countries than Developed countries.

In [None]:
#report = ProfileReport(df)
#report.to_widgets()

In [None]:
# Life expectancy per country in 2015; marker area grows with GDP.
# reset_index returns a fresh frame, avoiding an in-place change on a slice of df.
dd = df[df['Year']==2015].reset_index(drop=True)
plt.figure(figsize=(20,10))
plt.scatter(x=dd.Country, y=dd['Life_expectancy'], alpha=0.5, marker='.', c='g', s=dd['GDP']**0.7)
plt.xlabel('Country')
# The y values are life expectancy; GDP only drives the marker size.
plt.ylabel('Life expectancy')
plt.xticks(rotation = 90)
plt.title('Country v Life expectancy')

In [None]:
from sklearn.preprocessing import Normalizer

# Collapse each pair of closely related columns into its average.
df['avg_child_dr'] = df['under-five_deaths']/2 + df['infant_deaths']/2
df['avg_thinness'] = (df['thinness_1-19_years'] + df['thinness_5-9_years'])/2

# Only the two thinness columns are dropped; the child-death columns stay in df.
# The cell is therefore not re-runnable: a second run fails on the missing thinness columns.
df.drop(axis=1, columns=['thinness_1-19_years', 'thinness_5-9_years'], inplace=True)#, 'under-five_deaths',
                        #'infant_deaths'], inplace=True)

# Scale every int64/float64 column except the split key (Year) and the target.
num_col = list(df.select_dtypes(include=['float64', 'int64']).columns)
num_col.remove('Year')
num_col.remove('Life_expectancy')
# NOTE(review): sklearn's Normalizer rescales each *row* to unit norm rather than each column,
# so large-magnitude features dominate every row — confirm this is intended and not a per-feature scaler.
norm = Normalizer()
df[num_col] = norm.fit_transform(df[num_col])

In [None]:
df.head()

In [None]:
#report = ProfileReport(df)
#report.to_widgets()

In [None]:
heatmap(df, 'Nigeria')

In [None]:
# Correlation between all numeric features across every country. Restrict to
# numeric/bool columns explicitly: .corr() raises on recent pandas versions
# when text or categorical columns are still present.
plt.figure(figsize=(12,7))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot = True, cbar=False, cmap = 'viridis')

In [None]:
# One-hot encode Status and continent; drop_first=True omits the first level of each to avoid a redundant column.
df = pd.get_dummies(df, drop_first=True, columns=['Status', 'continent'])
df.columns

In [None]:
#Run this cell only once

dff = df.copy()
def linear(dff, country=None):
    """Fit a linear regression on the non-2015 rows and score it on the 2015 rows.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame containing 'Country', 'Year', the target 'Life_expectancy' and
        otherwise only numeric feature columns.
    country : str, optional
        When given, only the rows of that country are used; otherwise all rows.

    Returns
    -------
    None
        Shows a residual histogram and an original-vs-predicted scatter, then
        prints the coefficients, the prediction range and MAE / MSE / R2.

    Raises
    ------
    ValueError
        If the selection has no 2015 rows to test on, or no other rows to train on.
    """
    if country is not None:
        dff = dff[dff['Country']==country]
    dff = dff.drop(axis=1, columns='Country')

    # 2015 is held out for testing; every other year is training data.
    ttest = dff[dff['Year']==2015]
    train = dff.drop(ttest.index)
    if ttest.empty or train.empty:
        raise ValueError('need at least one 2015 row and one non-2015 row to fit and score the model')

    # Imported inside the function, as before, and only once the inputs are known to be usable.
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    X_test = ttest.drop(axis=1, columns='Life_expectancy')
    y_test = ttest['Life_expectancy']

    X_train = train.drop(axis=1, columns='Life_expectancy')
    y_train = train['Life_expectancy']

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    pred = lr.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    r_score = r2_score(y_test, pred)

    # Residual distribution, then original vs predicted values.
    sns.distplot((y_test-pred),bins=50)
    plt.show()
    plt.scatter(y_test,pred)
    plt.xlabel('orig')
    plt.ylabel('pred')
    plt.show()

    coeff_dff = pd.DataFrame(lr.coef_,X_train.columns,columns=['Coefficient'])
    print(f'{coeff_dff} \n \n \n')
    print(f'pred_min  : {pred.min()} \n pred_max : {pred.max()}')
    print(f'mean abs error : {mae} \n mean squared error : {mse} \n r2_score : {r_score}')

In [None]:
# NOTE(review): `country` is assigned but never passed to linear(), so this call fits on all
# countries (the parameter defaults to None) — confirm whether linear(dff, country) was intended.
country = 'Russian Federation'
linear(dff)

In [None]:
from keras import models
from keras import layers

def build_model(input_dim=None):
    """Build and compile a fresh fully connected regression network.

    A function is used because the same architecture has to be instantiated
    several times (once per fit / fold).

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, it is taken from the
        notebook-level `X_train` (its second dimension), which must therefore
        already be defined at call time.

    Returns
    -------
    A compiled Sequential model with a single linear output unit, trained with
    the 'rmsprop' optimizer on 'mse' loss and reporting 'mae'.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu',
                           input_shape=(input_dim,)))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model

In [None]:
# Drop the country label before modelling. Not re-runnable: a second run fails because 'Country' is already gone.
df = df.drop(axis=1, columns='Country')
df.head()

In [None]:
# Hold out the 2015 rows as the test set; all remaining rows form the training set.
ttest = df.loc[df['Year'] == 2015]
train = df.drop(index=ttest.index)

# Separate the target column from the features for each split.
X_train, y_train = train.drop(columns='Life_expectancy'), train['Life_expectancy']
X_test, y_test = ttest.drop(columns='Life_expectancy'), ttest['Life_expectancy']

In [None]:
# Same 2015 hold-out split as the previous cell, except that the test features/targets are
# converted to NumPy arrays while the training data stays as pandas objects.
ttest = df[df['Year']==2015]
train = df.drop(ttest.index)
X_test = np.array(ttest.drop(axis=1, columns='Life_expectancy'))
y_test = np.array(ttest['Life_expectancy'])

X_train = train.drop(axis=1, columns='Life_expectancy')
y_train = train['Life_expectancy']
# build_model() sizes its input layer from the notebook-level X_train, so X_train must be assigned first.
model = build_model()
# NOTE(review): batch_size=1 means one weight update per training row, which is slow — confirm it is intended.
model.fit(X_train, y_train,epochs=10, batch_size=1, verbose=1)
# evaluate() returns the compiled loss ('mse') followed by the compiled metric ('mae').
val_mse, val_mae = model.evaluate(X_test, y_test, verbose=1)
print(f'mse : {val_mse} \n mae : {val_mae}')

In [None]:
import numpy as np

# Manual k-fold cross-validation over the training rows, followed by an
# evaluation of the last fold's model on the 2015 test set.
k = 4
num_val_samples = len(X_train) // k
num_epochs = 50
all_scores = []

# Work on plain float arrays so that slicing is purely positional, whether
# X_train / y_train are pandas objects or already NumPy arrays.
fold_features = np.asarray(X_train, dtype='float32')
fold_targets = np.asarray(y_train, dtype='float32')

for i in range(k):
    print('processing fold #', i)
    start, stop = i * num_val_samples, (i + 1) * num_val_samples

    # Prepare the validation data: data from partition #i
    val_data = fold_features[start:stop]
    val_targets = fold_targets[start:stop]

    # Prepare the training data: data from all other partitions
    partial_X_train = np.concatenate(
        [fold_features[:start], fold_features[stop:]],
        axis=0)
    partial_y_train = np.concatenate(
        [fold_targets[:start], fold_targets[stop:]],
        axis=0)

    # Build the Keras model (already compiled)
    model = build_model()
    # Train the model (progress output enabled, verbose=1)
    model.fit(partial_X_train, partial_y_train,
              epochs=num_epochs, batch_size=1, verbose=1)
    # Evaluate the model on the validation data
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=1)
    all_scores.append(val_mae)

# The network ends in Dense(1), so predict() returns shape (n_samples, 1).
# Flatten it: subtracting an (n, 1) array from the (n,) targets would broadcast
# to an (n, n) matrix instead of giving one residual per sample.
pred = model.predict(X_test, verbose=1).ravel()
val_mse, val_mae = model.evaluate(X_test, y_test, verbose=1)
sns.distplot((y_test-pred),bins=50)
plt.show()
plt.scatter(y_test,pred)
plt.xlabel('orig')
plt.ylabel('pred')
plt.show()
print(f'pred_min  : {pred.min()} \n pred_max : {pred.max()}')
print(f'mse : {val_mse} \n mae : {val_mae} \n {all_scores}')

In [None]:
# Score the most recently trained model on the 2015 test set.
# The network ends in Dense(1), so predict() returns shape (n_samples, 1).
# Flatten it: subtracting an (n, 1) array from the (n,) targets would broadcast
# to an (n, n) matrix instead of giving one residual per sample.
pred = model.predict(X_test, verbose=1).ravel()
val_mse, val_mae = model.evaluate(X_test, y_test, verbose=1)
sns.distplot((y_test-pred),bins=50)
plt.show()
plt.scatter(y_test,pred)
plt.xlabel('orig')
plt.ylabel('pred')
plt.show()
print(f'mse : {val_mse} \n mae : {val_mae}')

In [None]:
all_scores

# Pheeeeeeeew

That is a lot, but we can make it better. I will keep updating the notebook, so watch out!