# Naive Bayes / SVM Classifier

## Import data

In [1]:
import pandas as pd 
import numpy as np
import math
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the cleaned asylum-seeker dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids inconsistent mixed-type inference
# (and the DtypeWarning that goes with it).
results = pd.read_csv('./cleaned_data/cleaned_asylum_seekers_added.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning and Transformation

In [3]:
# Log-transform the numeric features.
# Rows holding a zero are removed first because log(0) is undefined. The filter
# must be applied to the whole frame (row selection); assigning the filtered
# frame to a single column does not drop anything and fails on assignment.
# .copy() gives an independent frame so the column writes below do not act on
# a view of the previous one.
results = results[results['origin_to_target_dist'] != 0].copy()
results['log_origin_to_target_dist'] = results['origin_to_target_dist'].apply(math.log)

results = results[results['Unemployment rate'] != 0].copy()
results['log_Unemployment_rate'] = results['Unemployment rate'].apply(math.log)

# Remove rows whose HDI is the '..' placeholder so the column can be numeric
results = results[results['HDI'] != '..'].copy()

# Convert to float type
results['HDI'] = results['HDI'].astype('float64')
results = results.dropna()

# Only the magnitude of the GDP difference is kept; zero differences are
# dropped before taking the log, for the same reason as above.
results['GDP_difference'] = results['GDP_difference'].abs()
results = results[results['GDP_difference'] != 0].copy()
results['log_GDP_difference'] = results['GDP_difference'].apply(math.log)

## Train Test Split

In [4]:
# Target column the classifiers are trained to predict
label = results['accepted/rejected']

# Feature columns fed to the models.
# Alternative set kept for reference:
# ['HDI', 'log_GDP_difference', 'log_Unemployment_rate', 'log_origin_to_target_dist']
col_names = ['HDI', 'Unemployment rate', 'log_origin_to_target_dist']

features = results[col_names]

# Hold out 10% of the rows for testing. stratify=label keeps the class
# proportions equal in both parts; the fixed random_state makes the split
# (and therefore every score computed from it) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.10, stratify=label, random_state=42
)

## Multinomial Naive Bayes

In [None]:
# Baseline: multinomial Naive Bayes fitted on the training split as-is,
# then scored on the held-out test split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_class = nb.predict(X_test)

divider = '-' * 40
print("y_pred: ", y_pred_class)
print(divider)
print("accuracy: ", metrics.accuracy_score(y_test, y_pred_class))
print(divider)
print(confusion_matrix(y_test, y_pred_class))
print(divider)
print(classification_report(y_test, y_pred_class))

### Oversampling

In [None]:
# Oversample every class except the majority one with SMOTE. Only the training
# split is resampled; the test split keeps its original class distribution.
# The strategy is passed by keyword and the resampling goes through
# fit_resample, the current imbalanced-learn API (fit_sample was removed).
smt = SMOTE(sampling_strategy='not majority')
X_train_over, y_train_over = smt.fit_resample(X_train, y_train)

nb = MultinomialNB()
nb.fit(X_train_over, y_train_over)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

### Undersampling

In [None]:
# Randomly undersample the training split, then fit and evaluate Naive Bayes.
# fit_resample is the current imbalanced-learn API (fit_sample was removed).
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

nb = MultinomialNB()
nb.fit(X_train_under, y_train_under)
y_pred_class = nb.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

## SVM

### Finding Hyperparameters

In [None]:
def svc_param_selection(X, y, nfolds):
    """Grid-search SVC hyper-parameters and return the best combination.

    Every combination of kernel ('linear', 'rbf', 'poly'), C and gamma listed
    below is evaluated with cross-validation.

    Args:
        X: Training features.
        y: Training labels.
        nfolds: Value passed to GridSearchCV as ``cv``.

    Returns:
        The ``best_params_`` dict of the fitted grid search.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'kernel': ('linear', 'rbf', 'poly'), 'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

# Run the search on the training split with cv=25 and show the winning parameters
best_svc_params = svc_param_selection(X_train, y_train, 25)
print(best_svc_params)

In [None]:
# Candidate hyper-parameters: every combination of C, gamma and kernel is tried
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear'],
}

# refit=True retrains the best configuration on the full training split once
# the search is done; verbose=3 prints progress while it runs.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

# Best parameter combination, followed by the estimator refitted with it
print(grid.best_params_, grid.best_estimator_, sep='\n')

In [None]:
# Baseline SVM (gamma='auto') fitted on the training split as-is,
# then scored on the held-out test split.
clf = SVC(gamma='auto').fit(X_train, y_train)
y_pred_class = clf.predict(X_test)

divider = '-' * 40
print("y_pred: ", y_pred_class)
print(divider)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print(divider)
print(confusion_matrix(y_test, y_pred_class))
print(divider)
print(classification_report(y_test, y_pred_class))

### Oversampling

In [None]:
# Oversample every class except the majority one with SMOTE, then fit an SVM.
# Only the training split is resampled; the test split is left untouched.
# The strategy is passed by keyword and the resampling goes through
# fit_resample, the current imbalanced-learn API (fit_sample was removed).
smt = SMOTE(sampling_strategy='not majority')
X_train_over, y_train_over = smt.fit_resample(X_train, y_train)

clf = SVC()
clf.fit(X_train_over, y_train_over)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

### Undersampling

In [None]:
# Randomly undersample the training split, then fit and evaluate an SVM.
# fit_resample is the current imbalanced-learn API (fit_sample was removed).
rus = RandomUnderSampler()
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

# Fit on the *undersampled* data produced just above (the oversampled arrays
# from the previous experiment must not be reused here).
clf = SVC()
clf.fit(X_train_under, y_train_under)
y_pred_class = clf.predict(X_test)
print("y_pred: ", y_pred_class)
print('-' * 40)
print("accuracy: ", accuracy_score(y_test, y_pred_class))
print('-' * 40)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 40)
print(classification_report(y_test, y_pred_class))

## Recursive Feature Elimination 
- Finding the optimal number of features

In [None]:
# Pairwise correlations between the currently selected feature columns.
# A wider candidate list is kept here for reference:
#   ['Encoded procedure type', 'Encoded Target Country', 'Encoded Origin',
#    'HDI', 'Unemployment rate', 'origin_to_target_dist']
features = results.loc[:, col_names]
correlation_matrix = features.corr()

In [None]:
# Bare expression: shows the correlation matrix as the notebook cell's output
correlation_matrix

In [None]:
# Collect every column whose absolute correlation with at least one *earlier*
# column exceeds 0.8. Only the part of the matrix left of the diagonal is
# inspected, so each pair is considered once and self-correlation is skipped.
# The set is created here; it was previously used without being defined.
correlated_features = set()
for i, colname in enumerate(correlation_matrix.columns):
    if (correlation_matrix.iloc[i, :i].abs() > 0.8).any():
        correlated_features.add(colname)

In [None]:
# Recursive feature elimination with cross-validation: one feature is removed
# per step, and each candidate subset is scored by accuracy over 15
# stratified folds, using multinomial Naive Bayes as the estimator.
X, target = features, results['accepted/rejected']

rfc = MultinomialNB()
rfecv = RFECV(
    estimator=rfc,
    step=1,
    cv=StratifiedKFold(n_splits=15),
    scoring='accuracy',
)
rfecv.fit(X, target)

In [None]:
# Number of features retained by the fitted RFECV selector
print(f'Optimal number of features: {rfecv.n_features_}')

## GaussianNB

In [None]:
# Remove rows whose HDI is the '..' placeholder, then make the column numeric.
# .copy() gives an independent frame so the column write below does not act
# on a view of the previous one.
results = results[results['HDI'] != '..'].copy()

# Convert to float type
results['HDI'] = results['HDI'].astype('float64')
results = results.dropna()

# Declaring label -- taken only after the rows have been filtered, so that it
# always has exactly the same rows as the feature matrix built below.
label = results['accepted/rejected']

# Feature columns used by the Gaussian model
col_names = ['Encoded procedure type', 'Encoded Target Country', 'Encoded Origin', 'HDI', 'Unemployment rate',
             'origin_to_target_dist']
features = results[col_names]

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.20, random_state=42)

# Creating a Gaussian Classifier, training it on the training split and
# scoring its predictions on the test split
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("y_pred", y_pred)
print("accuracy: ", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix and per-class report for the Gaussian model's predictions,
# separated by one blank line
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n\n',
)