In [1]:
# Kaggle URL of the WHO life-expectancy dataset; handed to od.download() in the setup cell
dataset = 'http://kaggle.com/datasets/kumarajarshi/life-expectancy-who'

In [None]:
data_dir = './life-expectancy-who'

!sudo apt-get install texlive-xetex
!jupyter nbconvert --to pdf 'Classification Assignment.ipynb'

import os
os.listdir(data_dir)
!pip install skillsnetwork[regular]

!pip install opendatasets --upgrade
import opendatasets as od
od.download(dataset)

import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import accumulate

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn: accepts any arguments and discards them."""
    return None


# Silence warnings globally: calls to warnings.warn(...) hit the no-op above,
# and the 'ignore' filter covers anything that still goes through the filter list.
warnings.warn = warn
warnings.filterwarnings('ignore')

Password:

In [None]:
# Load the raw CSV. The setup cell points data_dir at the downloaded dataset folder,
# so look there first; fall back to the working directory (the original location)
# for a manually placed copy.
csv_name = 'Life Expectancy Data.csv'
csv_path = os.path.join(data_dir, csv_name)
if not os.path.exists(csv_path):
    csv_path = csv_name

data = pd.read_csv(csv_path)
data.sample(5)

In [None]:
# Give selected raw columns clean labels. Columns are addressed by position,
# so this assumes the column order of the raw CSV is unchanged.
data = data.rename(columns={data.columns[3]: 'Life expectancy',
                            data.columns[4]: 'Adult mortality',
                            data.columns[5]: 'Infant deaths',
                            data.columns[7]: 'Health expenditure',
                            data.columns[9]: 'Measles',
                            data.columns[10]: 'BMI',
                            data.columns[11]: 'Under-5 deaths',
                            data.columns[14]: 'Diphtheria',
                            data.columns[15]: 'HIV/AIDS',
                            data.columns[18]: 'Thinness (1-19 years)',
                            data.columns[19]: 'Thinness (5-9 years)'})

# Report missing values per column (only columns that have any) and the overall total
null_count = data.isnull().sum()
print(null_count[null_count>0].sort_values(ascending=False),
      'Number of null entries:', null_count.sum())

# Median-impute every column that has missing values.
# The result is assigned back to the column rather than calling
# data[column].fillna(..., inplace=True): that form is chained assignment on a
# temporary Series, which pandas deprecates and which leaves `data` unchanged
# once Copy-on-Write is active.
# NOTE(review): medians are computed on the full dataset before the train/test
# split, so test rows influence the imputed values.
for column in null_count[null_count > 0].index:
    data[column] = data[column].fillna(data[column].median())
null_count = data.isnull().sum()
print('Number of null entries after replacement:', null_count.sum())

print('Number of duplicated rows:', data.duplicated().sum())

# Country column is categorical but not worth encoding - far too many classes and not useful for classification
data = data.drop('Country', axis=1)

# Move the target column ('Status') to the last position so features are columns[:-1]
status = data.pop('Status')
data.insert(len(data.columns), 'Status', status)

data.sample(5)

In [None]:
# Train-test split: the target is 'Status'; every column before the last one is a feature
y_col = 'Status'
x_cols = data.columns[:-1]
x_data, y_data = data[x_cols], data[y_col]

# Hold out 25% of the rows for testing; fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.25, random_state=42)

print('Number of train samples:', len(x_train))
print('Number of test samples:', len(x_test))

In [None]:
# Polynomial features (degree 2, no bias column); the expansion is fitted on the training split only
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly, x_poly = poly.transform(x_test), poly.transform(x_data)

# Show the expanded shapes of the train and test matrices
for expanded in (x_train_poly, x_test_poly):
    print(expanded.shape)
# 20 original features + 210 polynomial features = 230 total features

In [None]:
# Scaling: standardise features using statistics learned from the training split only
ss = StandardScaler()

# x_train / x_test are rebound to the scaled NumPy arrays here; the cells that fit
# models on x_train / x_test therefore receive the scaled values.
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Same scaled values wrapped in DataFrames so the feature names are kept
x_train_s, x_test_s = (pd.DataFrame(scaled, columns=x_data.columns)
                       for scaled in (x_train, x_test))

x_train_s.sample(5)

In [None]:
# Logistic regression - high interpretability
from sklearn.metrics import confusion_matrix

# Row/column order for the confusion matrix. Without an explicit `labels`
# argument confusion_matrix sorts the class names alphabetically ('Developed'
# before 'Developing'), which would not match the tick labels drawn below.
class_labels = ['Developing', 'Developed']

lr = LogisticRegression()
lr.fit(x_train_s, y_train)
y_pred_lr = lr.predict(x_test_s)
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=class_labels)
plt.figure(figsize=(6, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Reds',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('Confusion Matrix')

plt.show()
# Better performance for 'Developing' class but good overall
# NOTE(review): re-check the per-class observation above now that the matrix
# order is pinned to the axis labels.

In [None]:
# L1 regularized logistic regression - high interpretability
# Regularisation strength is chosen by 4-fold cross-validation over 10 candidate C values.
lr_l1 = LogisticRegressionCV(Cs=10, cv=4, penalty='l1', solver='liblinear')
lr_l1.fit(x_train_s, y_train)
y_pred_lr_l1 = lr_l1.predict(x_test_s)

# Pin the matrix order to the tick labels: by default confusion_matrix sorts
# class names alphabetically ('Developed' before 'Developing').
class_labels = ['Developing', 'Developed']
cm_lr = confusion_matrix(y_test, y_pred_lr_l1, labels=class_labels)
plt.figure(figsize=(6, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Reds',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('Confusion Matrix (L1 Logistic Regression)')

plt.show()
# No notable improvement in performance over non-regularised logistic regression

In [None]:
# L2 regularized logistic regression - high interpretability
# Regularisation strength is chosen by 4-fold cross-validation over 10 candidate C values.
lr_l2 = LogisticRegressionCV(Cs=10, cv=4, penalty='l2', solver='liblinear')
lr_l2.fit(x_train_s, y_train)
y_pred_lr_l2 = lr_l2.predict(x_test_s)

# Pin the matrix order to the tick labels: by default confusion_matrix sorts
# class names alphabetically ('Developed' before 'Developing').
class_labels = ['Developing', 'Developed']
cm_lr = confusion_matrix(y_test, y_pred_lr_l2, labels=class_labels)
plt.figure(figsize=(6, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Reds',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('Confusion Matrix (L2 Logistic Regression)')

plt.show()
# Improvement over L1 regularisation for 'Developed' class
# NOTE(review): re-check the per-class observation above now that the matrix
# order is pinned to the axis labels.

In [None]:
# Coefficients of the plain logistic regression, largest first
coef = lr.coef_
coef_df = pd.DataFrame({'Feature': x_train_s.columns, 'Coefficient': coef[0]})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

display(coef_df)
# Observations:
# - Thinness (in young people up to 19 years) has the strongest positive association with the
#   positive class ('Developing') of the target variable ('Status').
#   Plausible, as developing countries are more likely to suffer from food scarcity.
# - Human Development Index in terms of income composition of resources has the strongest
#   negative association.
#   Plausible, as both variables track the same characteristic, with 'Status' being the
#   categorical representation.

In [None]:
# K nearest neighbors - low interpretability, moderate predictability
# (KNeighborsClassifier is imported in the setup cell.)
neighbors = [1, 2, 3, 4]
errors = []  # not populated in this cell

# Pin the matrix order to the tick labels: by default confusion_matrix sorts
# class names alphabetically ('Developed' before 'Developing').
class_labels = ['Developing', 'Developed']

# One panel per neighbour count, filled row by row in a 2x2 grid
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
for panel, neighbor in zip(ax.flat, neighbors):
    knc = KNeighborsClassifier(n_neighbors=neighbor)
    knc.fit(x_train, y_train)
    y_pred_knc = knc.predict(x_test)
    cm_knc = confusion_matrix(y_test, y_pred_knc, labels=class_labels)
    sns.heatmap(cm_knc, annot=True, fmt='d', cmap='Blues', ax=panel,
                xticklabels=class_labels,
                yticklabels=class_labels)
    panel.set_title(f'Confusion Matrix ({neighbor} Nearest Neighbours)')
    panel.set_xlabel('Predictions')
    panel.set_ylabel('Ground Truth')

plt.show()
# All very similar
# 'Developing' class predicted best when considering 2 nearest neighbours
# 'Developed' class predicted best when considering only single nearest neighbour
# NOTE(review): re-check the per-class observations above now that the matrix
# order is pinned to the axis labels.

In [None]:
# Random forest - high predictability
# (RandomForestClassifier is imported in the setup cell.)
# Fixed seed so the forest, its confusion matrix and its feature importances
# are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)

# Pin the matrix order to the tick labels: by default confusion_matrix sorts
# class names alphabetically ('Developed' before 'Developing').
class_labels = ['Developing', 'Developed']
cm_rfc = confusion_matrix(y_test, y_pred_rfc, labels=class_labels)
plt.figure(figsize=(6, 6))
sns.heatmap(cm_rfc, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('Confusion Matrix (Random Forest)')

plt.show()
# Notable improvement over logistic regression

In [None]:
# Extra trees - high predictability
# (ExtraTreesClassifier is imported in the setup cell.)
# Fixed seed so the ensemble and its confusion matrix are reproducible across runs.
etc = ExtraTreesClassifier(random_state=42)
etc.fit(x_train, y_train)
y_pred_etc = etc.predict(x_test)

# Pin the matrix order to the tick labels: by default confusion_matrix sorts
# class names alphabetically ('Developed' before 'Developing').
class_labels = ['Developing', 'Developed']
cm_etc = confusion_matrix(y_test, y_pred_etc, labels=class_labels)
plt.figure(figsize=(6, 6))
sns.heatmap(cm_etc, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('Confusion Matrix (Extra Trees)')

plt.show()
# Improvement over random forest - best result of all

In [None]:
# Feature importances of the random forest, most influential first
fi = rfc.feature_importances_
fi_df = pd.DataFrame({'Feature': x_train_s.columns, 'Importance': fi})
fi_df = fi_df.sort_values(by='Importance', ascending=False)

display(fi_df)
# Observations:
# - Alcohol (consumption per capita in litres) has the greatest effect on predictions.
#   Quite unexpected: alcohol consumption was not something I associated with the
#   development status of a country.
# - HIV/AIDS (deaths per 1,000 live births (0-4 years)) has the least effect on predictions.
#   Implies that deaths from HIV/AIDS are not, to any significant degree, more common in
#   developing countries than in developed countries or vice-versa.