# Random Forest Model

## Import Libraries

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier # Import Random Forest Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Import and preprocess file

In [2]:
# Load the dataset
data = pd.read_csv("../cleaned_data/cleaned_asylum_seekers_added.csv")

# Remove records from HDI column where values = '..'
data = data[data['HDI'] != '..']
data['GDP_difference'] = data['GDP_difference'].abs() 
data['GDP_difference'] = data[(data['GDP_difference'] != 0)]

# Log to transform data
data['origin_to_target_dist'] = data[(data['origin_to_target_dist'] != 0)]
data['log_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(x))

data['log_GDP_difference'] = data['GDP_difference'].apply(lambda x: math.log(x))

# Convert to float type
data['HDI'] = data['HDI'].astype('float64')

# Remove null values from all records
data = data.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#'Encoded Target Country', 'Encoded procedure type'
col_names = ['Unemployment rate', 'Encoded procedure type', 'log_origin_to_target_dist'] # Declare the columns names

# Features 
features = data[col_names]

 # Target Variable
class_label = data['accepted/rejected']

In [4]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(features, class_label, test_size = 0.2, random_state = 1) # 80% training and 20% test

In [5]:
# Create the model with 100 trees
#max_features=sqrt(n_features) for classification tasks

model = RandomForestClassifier(n_estimators = 100, 
                               bootstrap = True,
                               max_features = 'sqrt')

# Fit on training data
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
y_predict = model.predict(x_test)

print("Accuracy: ", accuracy_score(y_test, y_predict))

print("Confusion Matrix:")
confusion_matrix(y_test, y_predict)

Accuracy:  0.8218088882578447
Confusion Matrix:


array([[17124,   106,    34,    50],
       [ 2079,    75,    28,    24],
       [  857,    43,    17,    22],
       [  492,    21,     9,   148]], dtype=int64)

In [7]:
print("Classification Report:", "\n")
print(classification_report(y_test, y_predict))

Classification Report: 

              precision    recall  f1-score   support

           0       0.83      0.99      0.90     17314
           1       0.31      0.03      0.06      2206
           2       0.19      0.02      0.03       939
           3       0.61      0.22      0.32       670

    accuracy                           0.82     21129
   macro avg       0.48      0.32      0.33     21129
weighted avg       0.74      0.82      0.76     21129

