In [43]:
import pandas as pd
import sklearn
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import math
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
# from mlxtend.classifier import SoftmaxRegression

In [44]:
# Load the dataset
data = pd.read_csv("cleaned_data/cleaned_asylum_seekers_added.csv")
addition = pd.read_csv('cleaned_data/cleaned_asylum_seekers.csv')
data = data[data['HDI'] != '..']
data['HDI'] = data['HDI'].astype('float64')
data = data.dropna()
data = data[data['origin_to_target_dist'] != 0]
decimals = 2    
data['Unemployment rate'] = data['Unemployment rate'].apply(lambda x: round(x, decimals))
data['origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: round(x, decimals))
data['Total Applications'] = addition['Applied during year']
data['sq_GDP_difference'] = data['GDP_difference'].apply(lambda x: x**2 )
data['lg_origin_to_target_dist'] = data['origin_to_target_dist'].apply(lambda x: math.log(abs(x)))
data['lg_GDP_difference'] = data['GDP_difference'].apply(lambda x: math.log(abs(x)))
data['lg_tcgpc'] = data['Target_country_GDP_per_capita'].apply(lambda x: math.log(abs(x)))
data['lg_ocgpc'] = data['Origin_country_GDP_per_capita'].apply(lambda x: math.log(abs(x)))
# data['capacity'] = data['']
data['accepted/rejected'].value_counts() 

  interactivity=interactivity, compiler=compiler, result=result)


0    86242
1    11264
2     4776
3     3363
Name: accepted/rejected, dtype: int64

### Multinomial Logistic Regression

#### Softmax Implementation

In [45]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [114]:
x = data[['lg_ocgpc','Unemployment rate']]
y = data['accepted/rejected']

smote = SMOTE('not majority')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,stratify=y)
# print(y0_test.value_counts())
x_sm, y_sm = smote.fit_resample(x_train, y_train)
# print(len(y0_sm))
# print (x0_train.shape, y0_train.shape)
# print(x0.shape, y0.shape)
clf = linear_model.LogisticRegression(C=100,solver='lbfgs',class_weight='balanced', multi_class='multinomial', max_iter=1000)
clf.fit(x_sm, y_sm)
y_pred=clf.predict(x_test)
print('Accuracy:', clf.score(x_test,y_test))
print(metrics.classification_report(y_pred, y_test))
print(metrics.confusion_matrix(y_pred,y_test, labels=[0,1,2,3]))
output = pd.DataFrame([y_pred, y_test])

Accuracy: 0.4451228169814
              precision    recall  f1-score   support

           0       0.51      0.85      0.64     10326
           1       0.09      0.14      0.11      1473
           2       0.07      0.05      0.06      1432
           3       0.51      0.04      0.08      7898

    accuracy                           0.45     21129
   macro avg       0.29      0.27      0.22     21129
weighted avg       0.45      0.45      0.35     21129

[[8799  943  357  227]
 [1150  199   68   56]
 [1153  162   67   50]
 [6146  949  463  340]]


### One vs All

#### Really Low Chance (0)

In [47]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [48]:
x0 = data[['Unemployment rate', 'origin_to_target_dist','Encoded procedure type']]
y0 = data['accepted/rejected'].apply(lambda x:1 if x == 0 else 0)

# smote = SMOTE('not majority')
x0_train, x0_test, y0_train, y0_test = train_test_split(x0, y0, test_size=0.2,stratify=y0)
# print(y0_test.value_counts())
# x0_sm, y0_sm = smote.fit_resample(x0_train, y0_train)
# print(len(y0_sm))
# print (x0_train.shape, y0_train.shape)
# print(x0.shape, y0.shape)
clf0 = linear_model.LogisticRegression(C=10, solver='lbfgs',class_weight='balanced')
clf0.fit(x0_train, y0_train)
y0_pred=clf0.predict(x0_test)
print('Accuracy:', clf0.score(x0_test,y0_test))
print(metrics.classification_report(y0_pred, y0_test))
print(metrics.confusion_matrix(y0_pred,y0_test, labels=[0,1]))
output0 = pd.DataFrame([y0_pred, y0_test])

Accuracy: 0.5795825642481897
              precision    recall  f1-score   support

           0       0.41      0.19      0.26      8172
           1       0.62      0.82      0.71     12957

    accuracy                           0.58     21129
   macro avg       0.51      0.51      0.48     21129
weighted avg       0.54      0.58      0.53     21129

[[ 1585  6587]
 [ 2296 10661]]


#### Low Chance (1)

In [49]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [50]:
x1 = data[['Unemployment rate', 'origin_to_target_dist', 'Encoded procedure type']]
y1 = data['accepted/rejected'].apply(lambda x:1 if x == 1 else 0)
print(y1.value_counts())
smote = SMOTE('minority')
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2,stratify=y1)
x1_sm, y1_sm = smote.fit_resample(x1_train, y1_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf1 = linear_model.LogisticRegression(C=10,solver='lbfgs', class_weight='balanced')
clf1.fit(x1_sm, y1_sm)
y1_pred=clf1.predict(x1_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf1.score(x1_test,y1_test))
print(metrics.classification_report(y1_pred, y1_test))
print(metrics.confusion_matrix(y1_pred,y1_test, labels=[0,1]))
output1 = pd.DataFrame([y1_pred, y1_test])

0    94381
1    11264
Name: accepted/rejected, dtype: int64
Accuracy: 0.6052345118084149
              precision    recall  f1-score   support

           0       0.63      0.90      0.74     13103
           1       0.43      0.12      0.19      8026

    accuracy                           0.61     21129
   macro avg       0.53      0.51      0.46     21129
weighted avg       0.55      0.61      0.53     21129

[[11819  1284]
 [ 7057   969]]


#### Medium chance (2)

In [51]:
print(data.columns.values)

['Year' 'Country / territory of asylum/residence' 'Origin'
 'RSD procedure type / level' 'decisions_recognized' 'decisions_other'
 'Rejected' 'Otherwise_closed' 'Total decisions' 'Successful'
 'Unsuccessful' 'Target_country_GDP_per_capita'
 'Origin_country_GDP_per_capita' 'GDP_difference' 'Target_latitude'
 'Target_longitude' 'Origin_latitude' 'Origin_longitude'
 'origin_to_target_dist' 'HDI' 'Unemployment rate' 'acceptance_rate'
 'accepted/rejected' 'Encoded procedure type' 'Encoded Target Country'
 'Encoded Origin' 'Total Applications' 'sq_GDP_difference'
 'lg_origin_to_target_dist' 'lg_GDP_difference' 'lg_tcgpc' 'lg_ocgpc']


In [52]:
x2 = data[['Unemployment rate', 'origin_to_target_dist', 'Encoded procedure type']]
y2 = data['accepted/rejected'].apply(lambda x:2 if x == 2 else 0)
print(y2.value_counts())
smote = SMOTE('minority')
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2,stratify=y2)
x2_sm, y2_sm = smote.fit_sample(x2_train, y2_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf2 = linear_model.LogisticRegression(C=10,solver='lbfgs',class_weight='balanced')
clf2.fit(x2_sm, y2_sm)
y2_pred=clf2.predict(x2_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf2.score(x2_test,y2_test))
print(metrics.classification_report(y2_pred, y2_test))
print(metrics.confusion_matrix(y2_pred,y2_test, labels=[0,1]))
output2 = pd.DataFrame([y2_pred, y2_test])

0    100869
2      4776
Name: accepted/rejected, dtype: int64
Accuracy: 0.6075062710019404
              precision    recall  f1-score   support

           0       0.62      0.96      0.75     12953
           2       0.44      0.05      0.09      8176

    accuracy                           0.61     21129
   macro avg       0.53      0.50      0.42     21129
weighted avg       0.55      0.61      0.50     21129

[[12417     0]
 [    0     0]]


In [53]:
x3 = data[['Unemployment rate', 'origin_to_target_dist', 'Encoded procedure type']]
y3 = data['accepted/rejected'].apply(lambda x:3 if x == 3 else 0)
print(y3.value_counts())
smote = SMOTE('minority')
x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.2,stratify=y3)
x3_sm, y3_sm = smote.fit_sample(x3_train, y3_train)
# print (x1_train.shape, y1_train.shape)
# print(x1_sm.shape, y1_sm.shape)
clf3 = linear_model.LogisticRegression(C=10,solver='lbfgs',class_weight='balanced')
clf3.fit(x3_sm, y3_sm)
y3_pred=clf3.predict(x3_test)
# print(np.count_nonzero(y_pred))
print('Accuracy:', clf3.score(x3_test,y3_test))
print(metrics.classification_report(y3_pred, y3_test))
print(metrics.confusion_matrix(y3_pred,y3_test, labels=[0,1]))
output2 = pd.DataFrame([y3_pred, y3_test])

0    102282
3      3363
Name: accepted/rejected, dtype: int64
Accuracy: 0.6319750106488712
              precision    recall  f1-score   support

           0       0.64      0.97      0.77     13384
           3       0.48      0.04      0.08      7745

    accuracy                           0.63     21129
   macro avg       0.56      0.51      0.42     21129
weighted avg       0.58      0.63      0.52     21129

[[13032     0]
 [    0     0]]


In [54]:
x_check = data[['Unemployment rate', 'origin_to_target_dist', 'Encoded procedure type']]
def argmax(x_check):
    predicted = []
    zero = clf0.predict_proba(x_check)
    first = clf1.predict_proba(x_check)
    second = clf2.predict_proba(x_check)
    third = clf3.predict_proba(x_check)
    for i in range(len(zero)):
        output = [zero[i][1],first[i][1],second[i][1],third[i][1]]
        maxi = max(output)
        predicted.append(output.index(maxi))
        
    return predicted

a = argmax(x_check)

In [55]:
data['predicted'] = pd.DataFrame(a)

In [56]:
result = data[data['accepted/rejected'] == data['predicted']]['predicted']
success = result.shape[0]
total = data.shape[0]
accuracy = success/total

In [57]:
print(accuracy)

0.37159354441762504
