In [43]:
import pandas as pd
import sklearn
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import math
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
# from mlxtend.classifier import SoftmaxRegression

In [44]:
# Load the dataset
data = pd.read_csv("cleaned_data/cleaned_asylum_seekers_added.csv")
addition = pd.read_csv('cleaned_data/cleaned_asylum_seekers.csv')
data = data[data['HDI'] != '..']
data['HDI'] = data['HDI'].astype('float64')
data = data.dropna()
data = data[data['origin_to_target_dist'] != 0]
decimals = 2    
data['Unemployment rate'] = data['Unemployment rate'].apply(lambda x: round(x, decimals))
data['origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: round(x, decimals))
data['Total Applications'] = addition['Applied during year']
data['sq_GDP_difference'] = data['GDP_difference'].apply(lambda x: x**2 )
data['lg_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(abs(x)))
data['lg_GDP_difference'] = data['GDP_difference'].apply(lambda x: math.log(abs(x)))
data['lg_tcgpc'] = data['Target_country_GDP_per_capita'].apply(lambda x: math.log(abs(x)))
data['lg_ocgpc'] = data['Origin_country_GDP_per_capita'].apply(lambda x: math.log(abs(x)))
# data['capacity'] = data['']
data['accepted/rejected'].value_counts() 

  interactivity=interactivity, compiler=compiler, result=result)


0    86242
1    11264
2     4776
3     3363
Name: accepted/rejected, dtype: int64

## Multinomial Logistic Regression

### Softmax Implementation

In [45]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [153]:
x = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
y = data['accepted/rejected']

smote = SMOTE('not majority')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,stratify=y)
# print(y0_test.value_counts())
x_sm, y_sm = smote.fit_resample(x_train, y_train)
# print(len(y0_sm))
# print (x0_train.shape, y0_train.shape)
# print(x0.shape, y0.shape)
clf = linear_model.LogisticRegression(C=100,solver='lbfgs',class_weight='balanced', multi_class='multinomial', max_iter=1000)
clf.fit(x_sm, y_sm)
y_pred=clf.predict(x_test)
print('Accuracy:', clf.score(x_test,y_test))
print(metrics.classification_report(y_pred, y_test))
print(metrics.confusion_matrix(y_pred,y_test, labels=[0,1,2,3]))
output = pd.DataFrame([y_pred, y_test])

Accuracy: 0.2601637559752
              precision    recall  f1-score   support

           0       0.22      0.86      0.35      4400
           1       0.65      0.12      0.21     11938
           2       0.01      0.02      0.01       396
           3       0.36      0.06      0.10      4395

    accuracy                           0.26     21129
   macro avg       0.31      0.26      0.17     21129
weighted avg       0.49      0.26      0.21     21129

[[3773  384  146   97]
 [9536 1470  604  328]
 [ 344   40    9    3]
 [3595  359  196  245]]


### One vs All Implementation

#### Really Low Chance (0)

In [117]:
print(data.columns.values)
#'Unemployment rate', 'origin_to_target_dist','Encoded procedure type'

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc'
 'predicted']


In [145]:
x0 = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
y0 = data['accepted/rejected'].apply(lambda x:1 if x == 0 else 0)

# smote = SMOTE('not majority')
x0_train, x0_test, y0_train, y0_test = train_test_split(x0, y0, test_size=0.2,stratify=y0)
# print(y0_test.value_counts())
# x0_sm, y0_sm = smote.fit_resample(x0_train, y0_train)
# print(len(y0_sm))
# print (x0_train.shape, y0_train.shape)
# print(x0.shape, y0.shape)
clf0 = linear_model.LogisticRegression(C=10, solver='lbfgs',class_weight='balanced')
clf0.fit(x0_train, y0_train)
y0_pred=clf0.predict(x0_test)
print('Accuracy:', clf0.score(x0_test,y0_test))
print(metrics.classification_report(y0_pred, y0_test))
print(metrics.confusion_matrix(y0_pred,y0_test, labels=[0,1]))
output0 = pd.DataFrame([y0_pred, y0_test])

Accuracy: 0.4694495716787354
              precision    recall  f1-score   support

           0       0.60      0.19      0.29     12009
           1       0.44      0.83      0.57      9120

    accuracy                           0.47     21129
   macro avg       0.52      0.51      0.43     21129
weighted avg       0.53      0.47      0.42     21129

[[2340 9669]
 [1541 7579]]


#### Low Chance (1)

In [49]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [146]:
x1 = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
y1 = data['accepted/rejected'].apply(lambda x:1 if x == 1 else 0)
print(y1.value_counts())
smote = SMOTE('minority')
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2,stratify=y1)
x1_sm, y1_sm = smote.fit_resample(x1_train, y1_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf1 = linear_model.LogisticRegression(C=10,solver='lbfgs', class_weight='balanced')
clf1.fit(x1_sm, y1_sm)
y1_pred=clf1.predict(x1_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf1.score(x1_test,y1_test))
print(metrics.classification_report(y1_pred, y1_test))
print(metrics.confusion_matrix(y1_pred,y1_test, labels=[0,1]))
output1 = pd.DataFrame([y1_pred, y1_test])

0    94381
1    11264
Name: accepted/rejected, dtype: int64
Accuracy: 0.4536892422736523
              precision    recall  f1-score   support

           0       0.43      0.91      0.58      8921
           1       0.65      0.12      0.20     12208

    accuracy                           0.45     21129
   macro avg       0.54      0.52      0.39     21129
weighted avg       0.56      0.45      0.36     21129

[[ 8127   794]
 [10749  1459]]


#### Medium chance (2)

In [51]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [147]:
x2 = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
y2 = data['accepted/rejected'].apply(lambda x:2 if x == 2 else 0)
print(y2.value_counts())
smote = SMOTE('minority')
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2,stratify=y2)
x2_sm, y2_sm = smote.fit_sample(x2_train, y2_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf2 = linear_model.LogisticRegression(C=10,solver='lbfgs',class_weight='balanced')
clf2.fit(x2_sm, y2_sm)
y2_pred=clf2.predict(x2_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf2.score(x2_test,y2_test))
print(metrics.classification_report(y2_pred, y2_test))
print(metrics.confusion_matrix(y2_pred,y2_test, labels=[0,1]))
output2 = pd.DataFrame([y2_pred, y2_test])

0    100869
2      4776
Name: accepted/rejected, dtype: int64
Accuracy: 0.4484831274551564
              precision    recall  f1-score   support

           0       0.44      0.96      0.60      9301
           2       0.59      0.05      0.09     11828

    accuracy                           0.45     21129
   macro avg       0.52      0.50      0.35     21129
weighted avg       0.53      0.45      0.32     21129

[[8911    0]
 [   0    0]]


#### High Chance (3)

In [148]:
x3 = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
y3 = data['accepted/rejected'].apply(lambda x:3 if x == 3 else 0)
print(y3.value_counts())
smote = SMOTE('minority')
x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.2,stratify=y3)
x3_sm, y3_sm = smote.fit_sample(x3_train, y3_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf3 = linear_model.LogisticRegression(C=10,solver='lbfgs',class_weight='balanced')
clf3.fit(x3_sm, y3_sm)
y3_pred=clf3.predict(x3_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf3.score(x3_test,y3_test))
print(metrics.classification_report(y3_pred, y3_test))
print(metrics.confusion_matrix(y3_pred,y3_test, labels=[0,1]))
output2 = pd.DataFrame([y3_pred, y3_test])

0    102282
3      3363
Name: accepted/rejected, dtype: int64
Accuracy: 0.7177812485209901
              precision    recall  f1-score   support

           0       0.73      0.97      0.83     15333
           3       0.38      0.04      0.08      5796

    accuracy                           0.72     21129
   macro avg       0.55      0.51      0.46     21129
weighted avg       0.63      0.72      0.63     21129

[[14913     0]
 [    0     0]]


#### Getting Argmax from the various classes

In [149]:
x_check = data[['Encoded procedure type','Unemployment rate', 'lg_origin_to_target_dist']]
def argmax(x_check):
    predicted = []
    zero = clf0.predict_proba(x_check)
    first = clf1.predict_proba(x_check)
    second = clf2.predict_proba(x_check)
    third = clf3.predict_proba(x_check)
    for i in range(len(zero)):
        output = [zero[i][1],first[i][1],second[i][1],third[i][1]]
        maxi = max(output)
        predicted.append(output.index(maxi))
        
    return predicted

a = argmax(x_check)

In [150]:
data['predicted'] = pd.DataFrame(a)

In [151]:
result = data[data['accepted/rejected'] == data['predicted']]['predicted']
success = result.shape[0]
total = data.shape[0]
accuracy = success/total

In [152]:
print(accuracy)

0.2832410431160964
