# Adaboost Classifier

## Import Libraries

In [1]:
from sklearn.tree import DecisionTreeClassifier # Import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier # Import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd
import math

## Load dataset and clean data

In [2]:
# Load the dataset
data = pd.read_csv("../cleaned_data/cleaned_asylum_seekers_added.csv")
data = data[data['HDI'] != '..']
decimals = 2    

# Apply log to distance from origin to target country
data['origin_to_target_dist'] = data[(data['origin_to_target_dist'] != 0)]
data['log_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(x))

data['Unemployment rate'] = data['Unemployment rate'].apply(lambda x: round(x, decimals))
data['origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: round(x, decimals))
data = data.dropna()

data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Successful,...,Origin_longitude,origin_to_target_dist,HDI,Unemployment rate,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin,log_origin_to_target_dist
0,2001,South Africa,Angola,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,-170.132217,2001.0,0.61,30.9,0.0,0,1,135,3,7.601402
1,2001,South Africa,Angola,G / FI,860.0,0.0,36.0,68.0,964.0,860.0,...,-170.132217,2001.0,0.61,30.9,0.892116,3,6,135,3,7.601402
2,2001,South Africa,Albania,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,20.168331,2001.0,0.61,30.9,0.0,0,1,135,1,7.601402
3,2001,South Africa,Burundi,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.561593,2001.0,0.61,30.9,0.0,0,1,135,25,7.601402
4,2001,South Africa,Burundi,G / FI,808.0,0.0,234.0,7.0,1049.0,808.0,...,-1.561593,2001.0,0.61,30.9,0.770257,3,6,135,25,7.601402


## Training the AdaBoost Classifier

In [17]:
col_names = ['Unemployment rate', 'log_origin_to_target_dist'] # Declare the columns names

# Features
x = data[col_names]  

# Target Variable
y = data['accepted/rejected']

# Instantiate decision tree
dt = DecisionTreeClassifier(random_state=1)

# Instantiate ada
ada = AdaBoostClassifier(base_estimator = dt, n_estimators=100, random_state=1)

# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 1) # 80% training and 20% test

# Fit ada to training set
ada.fit(x_train, y_train)

# Compute the probabilities
y_pred_probability = ada.predict_proba(x_test)[:, 1]

## Evaluate AdaBoost Classifier

In [18]:
# Predict test set labels
y_pred = ada.predict(x_test)

# Compute test set accuracy  
acc = accuracy_score(y_test, y_pred)
print("Test set accuracy: ", (acc), '\n')

print("Confusion Matrix:")
confusion_matrix(y_test, y_pred)

Test set accuracy:  0.8168567224572374 

Confusion Matrix:


array([[17699,    84,    15,    33],
       [ 2233,    41,     4,    11],
       [  852,    39,     1,    10],
       [  672,    18,    12,    24]], dtype=int64)

In [19]:
print("Classification Report:", "\n")
print(classification_report(y_test, y_pred))

Classification Report: 

              precision    recall  f1-score   support

           0       0.82      0.99      0.90     17831
           1       0.23      0.02      0.03      2289
           2       0.03      0.00      0.00       902
           3       0.31      0.03      0.06       726

    accuracy                           0.82     21748
   macro avg       0.35      0.26      0.25     21748
weighted avg       0.71      0.82      0.74     21748

