# Adaboost Classifier

## Import Libraries

In [1]:
from sklearn.tree import DecisionTreeClassifier # Import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier # Import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd
import math

## Load dataset and clean data

In [2]:
# Load the dataset
data = pd.read_csv("../cleaned_data/cleaned_asylum_seekers_added.csv")

# Remove records from HDI column where values = '..'
data = data[data['HDI'] != '..']
data['GDP_difference'] = data['GDP_difference'].abs() 
data['GDP_difference'] = data[(data['GDP_difference'] != 0)]

# Log to transform data
data['origin_to_target_dist'] = data[(data['origin_to_target_dist'] != 0)]
data['log_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(x))

data['log_GDP_difference'] = data['GDP_difference'].apply(lambda x: math.log(x))

# Convert to float type
data['HDI'] = data['HDI'].astype('float64')

# Remove null values from all records
data = data.dropna()

data.head(1)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Successful,...,origin_to_target_dist,HDI,Unemployment rate,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin,log_origin_to_target_dist,log_GDP_difference
0,2001,South Africa,Angola,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,2001,0.61,30.896,0.0,0,1,135,3,7.601402,7.601402


## Training the AdaBoost Classifier

In [3]:
col_names = ['Unemployment rate', 'log_origin_to_target_dist'] # Declare the columns names

# Features
x = data[col_names]  

# Target Variable
y = data['accepted/rejected']

# Instantiate decision tree
dt = DecisionTreeClassifier(random_state=1)

# Instantiate ada
ada = AdaBoostClassifier(base_estimator = dt, n_estimators=100, random_state=1)

# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 1) # 80% training and 20% test

# Fit ada to training set
ada.fit(x_train, y_train)

# Compute the probabilities
y_pred_probability = ada.predict_proba(x_test)[:, 1]

## Evaluate AdaBoost Classifier

In [4]:
# Predict test set labels
y_pred = ada.predict(x_test)

# Compute test set accuracy  
acc = accuracy_score(y_test, y_pred)
print("Test set accuracy: ", (acc), '\n')

print("Confusion Matrix:")
confusion_matrix(y_test, y_pred)

Test set accuracy:  0.8156562071087131 

Confusion Matrix:


array([[17168,    74,    40,    32],
       [ 2144,    30,    23,     9],
       [  907,    14,    10,     8],
       [  620,    18,     6,    26]], dtype=int64)

In [5]:
print("Classification Report:", "\n")
print(classification_report(y_test, y_pred))

Classification Report: 

              precision    recall  f1-score   support

           0       0.82      0.99      0.90     17314
           1       0.22      0.01      0.03      2206
           2       0.13      0.01      0.02       939
           3       0.35      0.04      0.07       670

    accuracy                           0.82     21129
   macro avg       0.38      0.26      0.25     21129
weighted avg       0.71      0.82      0.74     21129

