# Naive Bayes / SVM Classifier

## Import Libraries and Data

In [1]:
import pandas as pd 
import numpy as np
import math
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load Dataset

results = pd.read_csv('../cleaned_data/cleaned_asylum_seekers_added.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning and Transformation

In [3]:
# Remove records from HDI column where values = '..'
results = results[results['HDI'] != '..']
results['GDP_difference'] = results['GDP_difference'].abs() 
results['GDP_difference'] = results[(results['GDP_difference'] != 0)]

# Log to transform data
results['origin_to_target_dist'] = results[(results['origin_to_target_dist'] != 0)]
results['log_origin_to_target_dist'] = results['origin_to_target_dist'].apply(lambda x: math.log(x))

results['log_GDP_difference'] = results['GDP_difference'].apply(lambda x: math.log(x))

# Convert to float type
results['HDI'] = results['HDI'].astype('float64')

# Remove null values from all records
results = results.dropna()

results.head(1)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Successful,...,origin_to_target_dist,HDI,Unemployment rate,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin,log_origin_to_target_dist,log_GDP_difference
0,2001,South Africa,Angola,G / AR,0.0,0.0,0.0,1.0,0.0,0.0,...,2001,0.61,30.896,0.0,0,1,135,3,7.601402,7.601402


## Train Test Split

In [4]:
# Declaring label
label = results['accepted/rejected'] 

# col_names = ['HDI', 'log_GDP_difference', 'log_Unemployment_rate', 'log_origin_to_target_dist']
col_names = ['Unemployment rate', 'log_origin_to_target_dist']

features = results[col_names]

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.10, stratify=label)

## Multinomial Naive Bayes

In [5]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", metrics.accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [0 0 0 ... 0 0 0]
----------------------------------------
accuracy:  0.8163748225272125
----------------------------------------
[[8625    0    0    0]
 [1126    0    0    0]
 [ 478    0    0    0]
 [ 336    0    0    0]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      8625
           1       0.00      0.00      0.00      1126
           2       0.00      0.00      0.00       478
           3       0.00      0.00      0.00       336

    accuracy                           0.82     10565
   macro avg       0.20      0.25      0.22     10565
weighted avg       0.67      0.82      0.73     10565



  'precision', 'predicted', average, warn_for)


### Oversampling

In [6]:
smt = SMOTE('not majority')
X_train_over, y_train_over = smt.fit_sample(X_train, y_train)
nb = MultinomialNB()
nb.fit(X_train_over, y_train_over)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [2 0 0 ... 0 2 2]
----------------------------------------
accuracy:  0.36393752957879794
----------------------------------------
[[3548   18 4741  318]
 [ 393    3  667   63]
 [ 167    0  290   21]
 [ 152    2  178    4]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.41      0.55      8625
           1       0.13      0.00      0.01      1126
           2       0.05      0.61      0.09       478
           3       0.01      0.01      0.01       336

    accuracy                           0.36     10565
   macro avg       0.26      0.26      0.16     10565
weighted avg       0.70      0.36      0.45     10565



### Undersampling

In [7]:
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
nb = MultinomialNB()
nb.fit(X_train_under, y_train_under)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [2 0 0 ... 0 2 2]
----------------------------------------
accuracy:  0.36393752957879794
----------------------------------------
[[3532   43 4750  300]
 [ 388   11  668   59]
 [ 165    5  290   18]
 [ 143    2  179   12]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.41      0.55      8625
           1       0.18      0.01      0.02      1126
           2       0.05      0.61      0.09       478
           3       0.03      0.04      0.03       336

    accuracy                           0.36     10565
   macro avg       0.27      0.27      0.17     10565
weighted avg       0.70      0.36      0.46     10565



## SVM

### Finding Hyperparameters

In [8]:
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'kernel': ('linear', 'rbf','poly'), 'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_

# print(svc_param_selection(X_train, y_train, 10))

### SVM model

In [9]:
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [0 0 0 ... 0 0 0]
----------------------------------------
accuracy:  0.8163748225272125
----------------------------------------
[[8625    0    0    0]
 [1126    0    0    0]
 [ 478    0    0    0]
 [ 336    0    0    0]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      8625
           1       0.00      0.00      0.00      1126
           2       0.00      0.00      0.00       478
           3       0.00      0.00      0.00       336

    accuracy                           0.82     10565
   macro avg       0.20      0.25      0.22     10565
weighted avg       0.67      0.82      0.73     10565



  'precision', 'predicted', average, warn_for)


### Oversampling

In [10]:
smt = SMOTE('not majority')
X_train_over, y_train_over = smt.fit_sample(X_train, y_train)
clf = SVC(gamma='auto')
clf.fit(X_train_over, y_train_over)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [2 0 0 ... 0 0 1]
----------------------------------------
accuracy:  0.30004732607666823
----------------------------------------
[[2497 2395 2123 1610]
 [ 245  404  300  177]
 [ 109  129  149   91]
 [  68   64   84  120]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.86      0.29      0.43      8625
           1       0.14      0.36      0.20      1126
           2       0.06      0.31      0.10       478
           3       0.06      0.36      0.10       336

    accuracy                           0.30     10565
   macro avg       0.28      0.33      0.21     10565
weighted avg       0.72      0.30      0.38     10565



### Undersampling

In [None]:
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
clf = SVC(gamma='auto')
clf.fit(X_train_over, y_train_over)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

## Recursive Feature Elimination 
- Finding the optimal number of features

In [None]:
# col_names = ['Encoded procedure type', 'Encoded Target Country', 'Encoded Origin','HDI', 
#              'Unemployment rate', 'origin_to_target_dist']
features = results[col_names]
correlation_matrix = features.corr()

In [None]:
correlation_matrix

In [None]:
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [None]:
X = features
target = results['accepted/rejected']

rfc = MultinomialNB()
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(15), scoring='accuracy')
rfecv.fit(X, target)

In [None]:
print('Optimal number of features: {}'.format(rfecv.n_features_))