<a href="https://colab.research.google.com/github/crsimmons1/lifeexpectancy/blob/master/Categorical/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Classification Tree**
This file uses tree models to classify whether the Total Expenditure (general government expenditure on health as a percentage of total government expenditure (%)) was low or high for various countries for the years 2000-2015. The threshold chosen to distinguish low(1)  and high(1) is 5.8%, since it is the median value and keeps the classes balanced.  

This data can be found [here](https://www.kaggle.com/kumarajarshi/life-expectancy-who) on Kaggle. Please see this git [repository](https://github.com/crsimmons1/lifeexpectancy) for more information on the data cleaning that was done. 


### Import Data


In [25]:
import pandas as pd
import numpy as np

data= pd.read_csv("cleaned_data.csv")

# Split in X and y 
y = pd.DataFrame(data.TExp)
X = data.drop(columns=['TExpenditure', 'Expenditure','TExp', 'Year'])

#Train-test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=33)

data.head(5)

Unnamed: 0,Status,Year,LifeExpectancy,AdultMortality,InfantDeaths,Alcohol,Expenditure,HepB,Measles,BMI,5deaths,Polio,TExpenditure,Diphtheria,HIV,GDP,Population,ThinJuvenile,ThinChild,IncomeComp,Schooling,TExp
0,0,2015.0,65.0,263.0,62.0,0.01,71.279624,65.0,1154.0,19.1,83.0,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,1
1,0,2014.0,59.9,271.0,64.0,0.01,73.523582,62.0,492.0,18.6,86.0,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,1
2,0,2013.0,59.9,268.0,66.0,0.01,73.219243,64.0,430.0,18.1,89.0,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,1
3,0,2012.0,59.5,272.0,69.0,0.01,78.184215,67.0,2787.0,17.6,93.0,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,1
4,0,2011.0,59.2,275.0,71.0,0.01,7.097109,68.0,3013.0,17.2,97.0,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,1


In [26]:
X_test.shape

(507, 18)

In [27]:
X_train.shape

(2025, 18)

In [28]:
y_train.shape

(2025, 1)

## Simple Tree

First, build a simple tree for the sake of comparison to random forest. No parameters are tuned, and it is not expected to perform well. 

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
tree = DecisionTreeClassifier(criterion='gini')

In [31]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [0]:
from sklearn.tree import export_graphviz

In [0]:
tree_vis = export_graphviz(tree, out_file=None, 
                           filled=True, rounded=True, special_characters=True)

In [0]:
from IPython.display import SVG
from graphviz import Source
#Source(export_graphviz(tree, out_file=None, filled = True, rounded = True,feature_names=X_train.columns))

In [0]:
# Make Predictions
training_preds = tree.predict(X_train)
testing_preds = tree.predict(X_test)

In [36]:
# Accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, testing_preds)

print("Train accuracy: "+ str(train_accuracy))
print("Test accuracy: "+ str(test_accuracy))

Train accuracy: 1.0
Test accuracy: 0.7712031558185405


In [0]:
# Precision and Accuracy 
from sklearn.metrics import classification_report
print(classification_report(y_test, testing_preds)

## Random Forest
First, build a random forest tree for the sake of comparison to the one with tuned parameters. No parameters are set, as it acts as a benchmark for the grid search model(although it is an improvement against the simple tree). 
### Initial Random Forest Model

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
rf = RandomForestClassifier(random_state=0)

In [39]:
rf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, rf.predict(X_test))

0.8461538461538461

### Tune RF parameters with GridSearch

In [0]:
from sklearn.model_selection import GridSearchCV 

max_depth = list(np.arange(1,10))
max_depth.append(None)

param_grid = { 
    'max_depth': max_depth,
    'n_estimators': np.arange(100, 1100, 200), 
    'min_samples_split': [2,4,8,16],
    'min_samples_leaf': [1, 2, 4]
}

In [42]:
np.arange(100, 1000, 100)

array([100, 200, 300, 400, 500, 600, 700, 800, 900])

In [0]:
rf = RandomForestClassifier()

In [0]:
grid = GridSearchCV(estimator = rf, param_grid=param_grid, 
                          cv = 3, verbose = 51, n_jobs=-1)

In [0]:
import time

In [46]:
t_start = time.time()
grid.fit(X_train, y_train)
t_end = time.time()

Fitting 3 folds for each of 600 candidates, totalling 1800 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 

  self.best_estimator_.fit(X, y, **fit_params)


In [47]:
best = grid.best_estimator_
best

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
#Check our accuracy on this estimator:
best.fit(X_train, y_train)
accuracy_score(y_test, best.predict(X_test))

  


0.8461538461538461

In [0]:
# Make Predictions
training_preds = best.predict(X_train)
testing_preds = best.predict(X_test)

In [50]:
# Accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, testing_preds)

print("Train accuracy: "+ str(train_accuracy))
print("Test accuracy: "+ str(test_accuracy))

Train accuracy: 1.0
Test accuracy: 0.8461538461538461


In [52]:
# Precision and Accuracy 
from sklearn.metrics import classification_report
print(classification_report(y_test, testing_preds))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       269
           1       0.83      0.85      0.84       238

    accuracy                           0.85       507
   macro avg       0.85      0.85      0.85       507
weighted avg       0.85      0.85      0.85       507

