# Naive Bayes / SVM Classifier

## Import data

In [1]:
import pandas as pd 
import numpy as np
import math
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from imblearn.under_sampling import RandomUnderSampler

In [2]:
results = pd.read_csv('./cleaned_data/cleaned_asylum_seekers_added.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning and Transformation

In [3]:
# Log to transform data
results['origin_to_target_dist'] = results[(results['origin_to_target_dist'] != 0)]
results['log_origin_to_target_dist'] = results['origin_to_target_dist'].apply(lambda x: math.log(x))

results['Unemployment rate'] = results[(results['Unemployment rate'] != 0)]
results['log_Unemployment_rate'] = results['Unemployment rate'].apply(lambda x: math.log(x))

# Combining features into a single variable
results = results[results['HDI'] != '..']

# Convert to float type
results['HDI'] = results['HDI'].astype('float64')
results = results.dropna()
results['GDP_difference'] = results['GDP_difference'].abs() 
results['GDP_difference'] = results[(results['GDP_difference'] != 0)]
results['log_GDP_difference'] = results['GDP_difference'].apply(lambda x: math.log(x))

## Train Test Split

In [4]:
# Declaring label
label = results['accepted/rejected'] 

# col_names = ['HDI', 'log_GDP_difference', 'log_Unemployment_rate', 'log_origin_to_target_dist']
col_names = ['Unemployment rate', 'log_origin_to_target_dist']

features = results[col_names]

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.10, stratify=label)

## Multinomial Naive Bayes

In [5]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", metrics.accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [0 0 0 ... 0 0 0]
----------------------------------------
accuracy:  0.8166808189451835
----------------------------------------
[[8656    0    0    0]
 [1128    0    0    0]
 [ 478    0    0    0]
 [ 337    0    0    0]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      8656
           1       0.00      0.00      0.00      1128
           2       0.00      0.00      0.00       478
           3       0.00      0.00      0.00       337

    accuracy                           0.82     10599
   macro avg       0.20      0.25      0.22     10599
weighted avg       0.67      0.82      0.73     10599



  'precision', 'predicted', average, warn_for)


### Oversampling

In [6]:
smt = SMOTE('not majority')
X_train_over, y_train_over = smt.fit_sample(X_train, y_train)
nb = MultinomialNB()
nb.fit(X_train_over, y_train_over)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [1 3 1 ... 1 0 0]
----------------------------------------
accuracy:  0.45409944334371166
----------------------------------------
[[4577  456    0 3623]
 [ 515   75    0  538]
 [ 237   35    0  206]
 [ 158   18    0  161]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.53      0.65      8656
           1       0.13      0.07      0.09      1128
           2       0.00      0.00      0.00       478
           3       0.04      0.48      0.07       337

    accuracy                           0.45     10599
   macro avg       0.25      0.27      0.20     10599
weighted avg       0.70      0.45      0.54     10599



  'precision', 'predicted', average, warn_for)


### Undersampling

In [7]:
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
nb = MultinomialNB()
nb.fit(X_train_under, y_train_under)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [2 3 2 ... 2 0 0]
----------------------------------------
accuracy:  0.45032550240588737
----------------------------------------
[[4577    0  456 3623]
 [ 515    0   75  538]
 [ 237    0   35  206]
 [ 158    0   18  161]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.53      0.65      8656
           1       0.00      0.00      0.00      1128
           2       0.06      0.07      0.07       478
           3       0.04      0.48      0.07       337

    accuracy                           0.45     10599
   macro avg       0.23      0.27      0.19     10599
weighted avg       0.69      0.45      0.53     10599



  'precision', 'predicted', average, warn_for)


## SVM

### Finding Hyperparameters

In [8]:
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'kernel': ('linear', 'rbf','poly'), 'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_

# print(svc_param_selection(X_train, y_train, 10))

### SVM model

In [None]:
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

y_pred:  [0 0 0 ... 0 0 0]
----------------------------------------
accuracy:  0.8166808189451835
----------------------------------------
[[8656    0    0    0]
 [1128    0    0    0]
 [ 478    0    0    0]
 [ 337    0    0    0]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      8656
           1       0.00      0.00      0.00      1128
           2       0.00      0.00      0.00       478
           3       0.00      0.00      0.00       337

    accuracy                           0.82     10599
   macro avg       0.20      0.25      0.22     10599
weighted avg       0.67      0.82      0.73     10599



  'precision', 'predicted', average, warn_for)


### Oversampling

In [None]:
smt = SMOTE('not majority')
X_train_over, y_train_over = smt.fit_sample(X_train, y_train)
clf = SVC(gamma='auto')
clf.fit(X_train_over, y_train_over)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

### Undersampling

In [None]:
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
clf = SVC(gamma='auto')
clf.fit(X_train_over, y_train_over)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

## Recursive Feature Elimination 
- Finding the optimal number of features

In [None]:
# col_names = ['Encoded procedure type', 'Encoded Target Country', 'Encoded Origin','HDI', 
#              'Unemployment rate', 'origin_to_target_dist']
features = results[col_names]
correlation_matrix = features.corr()

In [None]:
correlation_matrix

In [None]:
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [None]:
X = features
target = results['accepted/rejected']

rfc = MultinomialNB()
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(15), scoring='accuracy')
rfecv.fit(X, target)

In [None]:
print('Optimal number of features: {}'.format(rfecv.n_features_))