# Decision Tree

In [25]:
# Import libraries

import pandas as pd
import numpy as np
import math
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Data Pre-Processing for Decision Tree

In [26]:
# Load the dataset
data = pd.read_csv("../cleaned_data/cleaned_asylum_seekers_added.csv")

# Remove records from HDI column where values = '..'
data = data[data['HDI'] != '..']
data['GDP_difference'] = data['GDP_difference'].abs() 
data['GDP_difference'] = data[(data['GDP_difference'] != 0)]

# Log to transform data
data['origin_to_target_dist'] = data[(data['origin_to_target_dist'] != 0)]
data['log_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(x))

data['log_GDP_difference'] = data['GDP_difference'].apply(lambda x: math.log(x))

# Convert to float type
data['HDI'] = data['HDI'].astype('float64')

# Remove null values from all records
data = data.dropna()

data.head(1)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Successful,...,origin_to_target_dist,HDI,Unemployment rate,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin,log_origin_to_target_dist,log_GDP_difference
0,2001,South Africa,Angola,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,2001,0.61,30.896,0.0,0,1,135,3,7.601402,7.601402


In [27]:
col_names = ['Unemployment rate', 'Encoded procedure type', 'log_origin_to_target_dist'] # Declare the columns names

# Features
x = data[col_names]  

# Target Variable
y = data['accepted/rejected']

# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) # 80% training and 20% test

### Create Decision Tree classifer object

In [28]:
# optional param-> max_depth, random_state
dt = DecisionTreeClassifier()

# Train Decision Tree Classifer
dt.fit(x_train,y_train)

# Predict the response for test dataset
y_pred = dt.predict(x_test)

### Evaluate the classification tree

In [29]:
# Predict test set labels
y_pred = dt.predict(x_test)

# Compute test set accuracy  
acc = accuracy_score(y_test, y_pred)
print("Test set accuracy: ", (acc))

Test set accuracy:  0.8220455298405036


### Using entropy as criterion

In [30]:
# Instantiate dt_entropy, set 'entropy' as the information criterion
dt_entropy = DecisionTreeClassifier(criterion = 'entropy')

# Fit dt_entropy to the training set
dt_entropy.fit(x_train, y_train)

# Use dt_entropy to predict test set labels
y_pred = dt_entropy.predict(x_test)

# Evaluate accuracy_entropy
accuracy_entropy = accuracy_score(y_test, y_pred)

# Print accuracy_entropy
print('Accuracy achieved by using entropy: ', accuracy_entropy)

Accuracy achieved by using entropy:  0.8220455298405036


### Using GINI as criterion

In [31]:
# Instantiate dt_gini, set 'gini' as the information criterion
dt_gini = DecisionTreeClassifier(criterion = 'gini')

# Fit dt_gini to the training set
dt_gini.fit(x_train, y_train)

# Use dt_entropy to predict test set labels
y_pred = dt_gini.predict(x_test)

# Evaluate accuracy_gini
accuracy_gini = accuracy_score(y_test, y_pred)

# Print accuracy_gini
print('Accuracy achieved by using the gini index: ', accuracy_gini)

Accuracy achieved by using the gini index:  0.8220455298405036


### Updated Decision Tree with Hyperparameters

In [32]:
# optional param-> max_depth, random_state
dt_hp = DecisionTreeClassifier(class_weight=None, criterion='gini',
                       max_depth=15, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, random_state=None)

# Train Decision Tree Classifer
dt_hp.fit(x_train,y_train)

# Predict the response for test dataset
y_pred = dt.predict(x_test)

# Predict test set labels
y_pred = dt.predict(x_test)

# Compute test set accuracy  
acc = accuracy_score(y_test, y_pred)
print("Accuracy achieved using hyperparameters: ", (acc))

Accuracy achieved using hyperparameters:  0.8220455298405036


## Model Tuning

### Define a grid of hyperparameters

In [42]:
# params_dt = {'max_depth': [2, 3, 4], 'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]}
# # 'HDI','Unemployment rate','GDP_difference','origin_to_target_dist','Encoded Origin', 'Encoded Target Country'

param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15, 20],
    'max_features': [2, 3],
    'min_samples_leaf': [2, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10, 12],
    'n_estimators': [50, 80, 100, 200]
}


In [43]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid = param_grid, 
                          cv = 5, n_jobs = -1)

# Fit grid search to data
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [44]:
# Reference from: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Extract the best params
best_params = grid_search.best_params_
print("Best Parameteres: ", best_params, "\n")

# Extract the best estimator
best_grid = grid_search.best_estimator_
print("Best Grid: ", best_grid)

Best Parameteres:  {'bootstrap': True, 'max_depth': 15, 'max_features': 2, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200} 

Best Grid:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [46]:
# Model Accuracy, how often is the classifier correct?
best_result = grid_search.best_score_
print("Accuracy:", best_result)

Accuracy: 0.8212337341648578
