# UNHCR Forcibly Displaced Populations Capstone Project

## Logistic Regression

---


In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from pactools.grid_search import GridSearchCVProgressBar
from sklearn.metrics import  plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve, roc_auc_score, average_precision_score
from sklearn.datasets import make_classification
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from sklearn import preprocessing
import warnings

In [4]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any solver convergence warnings
# raised by the models fitted further down.
warnings.filterwarnings('ignore')

  and should_run_async(code)


### Load the data

In [5]:
# Location of the cleaned UNHCR dataset.
# Set the UNHCR_DATA_PATH environment variable to load the CSV from another
# location; when it is unset, the original machine-specific path is used, so
# existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'UNHCR_DATA_PATH',
    '/Users/dayosangowawa/Desktop/GA/DSI20-lessons/projects/project-capstone/My Capstone/Forcibly displaced persons - Capstone Dayo Sangowawa/cleaned_unhcrdf_final.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,country_of_origin,country_of_asylum,population_type,urban_or_rural_location,accommodation_type,female_aged_0-4 years,female_aged_5-11 years,female_aged_12-17 years,female_aged_18-59 years,female_aged_over_60_years,male_aged_0-4_years,male_aged_5-11_years,male_aged_12-17 years,male_aged_18-59_years,male_aged_over_60_years
0,0,Colombia,Aruba,ASY,Urban,I,0,0,0,0,0,0,0,0,5,0
1,1,Cuba,Aruba,ASY,Urban,I,0,0,0,0,0,0,0,0,0,0
2,2,Afghanistan,Afghanistan,IDP,Rural,I,0,14,16,37,0,0,15,17,38,5
3,3,Afghanistan,Afghanistan,IDP,Urban,Q,74,810,853,2004,140,75,840,921,2278,206
4,4,Afghanistan,Afghanistan,IDP,Urban,Q,12,130,138,321,21,12,135,147,365,33


In [6]:
# The preview of df shows two columns, 'Unnamed: 0.1' and 'Unnamed: 0', that
# simply count 0, 1, 2, ... — they look like row indices saved into the CSV.
# Drop both: leaving 'Unnamed: 0.1' behind would feed a row counter into X as
# a numeric feature.
# Re-assigning with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without raising a KeyError.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [7]:
# Split off the target. pop() also removes the column from df, so re-running
# this cell without reloading df raises a KeyError.
y = df.pop('accommodation_type')
# One-hot encode the remaining columns; drop_first=True drops the first level
# of each encoded categorical column.
X = pd.get_dummies(df, drop_first=True)

In [8]:
# Baseline: share of the most frequent accommodation type, i.e. the accuracy
# obtained by always predicting the majority class.
print('Baseline prediction: ', y.value_counts(normalize=True).max())

Baseline prediction:  0.8497186815375424


In [9]:
# Encode the string labels in y as integers; y is overwritten with the encoded array.
lb = preprocessing.LabelBinarizer()
y = lb.fit_transform(y)

In [10]:
y
# 0 if the accommodation type is 'I', 1 if it is 'other'.

array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [11]:
# Flatten the single-column label array shown above into a 1-D vector.
y = y.ravel()

In [12]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [13]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so nothing from the test set leaks in.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Lasso Regularization - GridSearch

In [15]:
# Base L1-penalised estimator for the grid search.
# No solver is set here; the search grid supplies 'liblinear' / 'saga'.
# NOTE(review): max_iter=30 is a low iteration cap — confirm the solvers
# converge (warnings are silenced in this notebook).
lr_lasso = LogisticRegression(penalty = 'l1', max_iter=30)

In [16]:
# Lasso search grid: ten evenly spaced C values between 1 and 2, crossed with
# the two solvers tried for the L1 penalty.
params = dict(
    C=np.linspace(start=1, stop=2, num=10),
    solver=['liblinear', 'saga'],
)

In [17]:
# 5-fold cross-validated grid search over `params` for the lasso estimator.
# No `scoring` argument is passed.
lr_lasso_gs = GridSearchCV(estimator=lr_lasso, param_grid=params, cv=5)

In [None]:
# Run the lasso grid search on the scaled training split.
lr_lasso_gs.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the lasso grid search.
lr_lasso_gs.best_params_

### Using the best parameters from the Grid Search - Lasso

In [None]:
# Final L1 (lasso) model with hard-coded hyperparameters.
# NOTE(review): presumably C and solver were copied from
# lr_lasso_gs.best_params_ — confirm after re-running the search.
# Unlike the estimator used in the search (max_iter=30), max_iter is not set here.
lr_lasso_best = LogisticRegression(penalty = 'l1', C = 1.0, solver='liblinear')

In [None]:
# Fit the final lasso model on the scaled training split.
lr_lasso_best.fit(X_train, y_train)

In [None]:
# Score of the fitted lasso model on the training and test splits, followed by
# the mean of a 5-fold cross-validation on the training split (cross_val_score
# is given the same estimator and X_train / y_train).
print('Accuracy score - train: ', lr_lasso_best.score(X_train, y_train))
print('Accuracy score - test: ', lr_lasso_best.score(X_test, y_test))
print('Mean CV score - train: ', cross_val_score(lr_lasso_best, X_train, y_train, cv=5).mean())

### Feature Importances

In [None]:
# Pair each encoded feature name with its fitted lasso coefficient and order
# the table from the largest coefficient to the smallest.
# Column names come from X because X_train was replaced by the scaler's output.
feat_import = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_lasso_best.coef_[0]})
    .sort_values('Coefficient', ascending=False)
)
feat_import

In [None]:
# Ten features with the largest coefficients.
feat_import.head(10)

In [None]:
# Ten features with the smallest coefficients.
feat_import.tail(10)

In [None]:
# Horizontal bar chart of the ten largest lasso coefficients.
ax = feat_import.head(10).plot(
    kind='barh', x='Feature', y='Coefficient', figsize=(8, 12), color='#FC5A50'
)
ax.set_xlabel('Coefficient', fontsize=13)
ax.set_ylabel('Feature', fontsize=13)
ax.set_title('The Importance of Each Feature in Predicting Accommodation Type Allocation', fontsize=18, pad=20)
ax.grid()
plt.show()

In [None]:
# Horizontal bar chart of the ten smallest lasso coefficients.
ax = feat_import.tail(10).plot(
    kind='barh', x='Feature', y='Coefficient', figsize=(8, 12), color='#FC5A50'
)
ax.set_xlabel('Coefficient', fontsize=13)
ax.set_ylabel('Feature', fontsize=13)
ax.set_title('The Importance of Each Feature in Predicting Accommodation Type Allocation', fontsize=18, pad=20)
ax.grid()
plt.show()

### Evaluation

In [None]:
import scikitplot as skplt

In [None]:
# Confusion matrix of the lasso model on the test split, with class 1 listed
# before class 0 and counts formatted without decimals.
plot_confusion_matrix(lr_lasso_best, X_test, y_test, cmap='Blues', labels=[1, 0], values_format='.0f')
plt.show()

In [None]:
# Class predictions of the lasso model for the test split.
y_test_pred_lasso = lr_lasso_best.predict(X_test)

In [None]:
# Per-class precision / recall / F1 summary for the lasso test predictions.
print(classification_report(y_test, y_test_pred_lasso))

In [None]:
# F1 score of the lasso test predictions (f1_score called with its default arguments).
print(f1_score(y_test, y_test_pred_lasso))

In [None]:
# Predicted class probabilities of the lasso model for the TRAINING split.
# NOTE(review): the precision-recall / ROC plots and scores that use this array
# therefore describe training fit, not held-out performance — confirm this is intended.
probabilities_train_lasso = lr_lasso_best.predict_proba(X_train)

In [None]:
# Colour map for the curve plots: one "husl" palette colour per class of the fitted lasso model.
from matplotlib.colors import ListedColormap
cmap = ListedColormap(sns.color_palette("husl", len(lr_lasso_best.classes_)))

In [None]:
# Precision-recall curves for the lasso model, drawn from the training-split probabilities.
skplt.metrics.plot_precision_recall(y_train, probabilities_train_lasso, cmap=cmap)
plt.show()

In [None]:
# Average precision on the training split, using the second probability column (class 1).
print('Average precision score: ', average_precision_score(y_train, probabilities_train_lasso[:, 1]))

In [None]:
# ROC curves for the lasso model, drawn from the training-split probabilities.
skplt.metrics.plot_roc(y_train, probabilities_train_lasso, cmap=cmap)
plt.show()

In [None]:
# ROC AUC on the training split, using the second probability column (class 1).
print('Area under the curve : ', roc_auc_score(y_train, probabilities_train_lasso[:, 1]))

### Ridge Regularization - GridSearch

In [None]:
# Base L2-penalised estimator for the ridge grid search; the solver is supplied by the grid.
lr_ridge = LogisticRegression(penalty='l2')

In [None]:
# Ridge search grid: ten evenly spaced C values between 1 and 2, crossed with
# three solvers tried for the L2 penalty.
params = dict(
    C=np.linspace(start=1, stop=2, num=10),
    solver=['lbfgs', 'newton-cg', 'sag'],
)

In [None]:
# 5-fold cross-validated grid search over `params` for the ridge estimator.
# No `scoring` argument is passed.
lr_ridge_gs = GridSearchCV(estimator=lr_ridge, param_grid=params, cv=5)

In [None]:
# Run the ridge grid search on the scaled training split.
lr_ridge_gs.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the ridge grid search.
lr_ridge_gs.best_params_

### Using the best parameters from the Grid Search - Ridge

In [None]:
# Final L2 (ridge) model with hard-coded hyperparameters.
# NOTE(review): presumably C and solver were copied from
# lr_ridge_gs.best_params_ — confirm after re-running the search.
# This rebinds the name lr_ridge, which previously held the search's base estimator.
lr_ridge = LogisticRegression(penalty='l2', C = 1.0, solver = 'newton-cg')

In [None]:
# Fit the final ridge model on the scaled training split.
lr_ridge.fit(X_train, y_train)

In [None]:
# Score of the fitted ridge model on the training and test splits, followed by
# the mean of a 5-fold cross-validation on the training split.
print('Accuracy score - train: ', lr_ridge.score(X_train, y_train))
print('Accuracy score - test: ', lr_ridge.score(X_test, y_test))
print('Mean CV score - train: ', cross_val_score(lr_ridge, X_train, y_train, cv=5).mean())

### Evaluation

In [None]:
# Confusion matrix of the ridge model on the test split, with class 1 listed
# before class 0 and counts formatted without decimals.
plot_confusion_matrix(lr_ridge, X_test, y_test, cmap='Blues', labels=[1, 0], values_format='.0f')
plt.show()

In [None]:
# Class predictions of the ridge model for the test split.
y_test_pred_ridge = lr_ridge.predict(X_test)

In [None]:
# Per-class precision / recall / F1 summary for the ridge test predictions.
print(classification_report(y_test, y_test_pred_ridge))

In [None]:
# F1 score of the ridge test predictions (f1_score called with its default arguments).
print(f1_score(y_test, y_test_pred_ridge))

In [None]:
# Predicted class probabilities of the ridge model for the TRAINING split.
# NOTE(review): the precision-recall / ROC plots and scores that use this array
# therefore describe training fit, not held-out performance — confirm this is intended.
probabilities_train_ridge = lr_ridge.predict_proba(X_train)

In [None]:
# Rebuild the colour map with one "husl" palette colour per class of the fitted ridge model.
cmap = ListedColormap(sns.color_palette("husl", len(lr_ridge.classes_)))

In [None]:
# Precision-recall curves for the ridge model, drawn from the training-split probabilities.
skplt.metrics.plot_precision_recall(y_train, probabilities_train_ridge, cmap=cmap)
plt.show()

In [None]:
# Average precision on the training split, using the second probability column (class 1).
print('Average precision score: ', average_precision_score(y_train, probabilities_train_ridge[:, 1]))

In [None]:
# ROC curves for the ridge model, drawn from the training-split probabilities.
skplt.metrics.plot_roc(y_train, probabilities_train_ridge, cmap=cmap)
plt.show()

In [None]:
# ROC AUC on the training split, using the second probability column (class 1).
print('Area under the curve: ', roc_auc_score(y_train, probabilities_train_ridge[:, 1]))

In [None]:
# Elastic Net Regularization - GridSearch

In [None]:
# Base elastic-net estimator for the grid search, using the 'saga' solver.
# l1_ratio is not set here; the search grid supplies it.
lr_en = LogisticRegression(penalty = 'elasticnet', solver='saga')


In [None]:
# Search grid for the elastic-net model.
# l1_ratio mixes the two penalties: 0 is pure L2 (ridge), 1 is pure L1 (lasso).
# The previous range(0, 1) produced the single value 0, so only the pure-L2
# case was ever searched; this grid covers the whole [0, 1] interval.
params = {
    'C': np.linspace(1, 2, 10),
    'l1_ratio': np.linspace(0, 1, 5),
}

In [None]:
# 5-fold cross-validated grid search over `params` for the elastic-net estimator.
# No `scoring` argument is passed.
lr_en_gs = GridSearchCV(estimator=lr_en, param_grid=params, cv=5)

In [None]:
# Run the elastic-net grid search on the scaled training split.
lr_en_gs.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the elastic-net grid search.
lr_en_gs.best_params_

In [None]:
# Show the best mean cross-validated score found by the elastic-net grid search.
lr_en_gs.best_score_

In [None]:
# Final elastic-net model with hard-coded hyperparameters.
# NOTE(review): presumably copied from lr_en_gs.best_params_ — confirm after
# re-running the search. With l1_ratio=0 the penalty has no L1 component, so
# this model is effectively L2-regularised.
lr_en_best = LogisticRegression(penalty='elasticnet', C = 1.7777777777777777, solver = 'saga', l1_ratio=0)

In [None]:
# Fit the final elastic-net model on the scaled training split.
lr_en_best.fit(X_train, y_train)

In [None]:
# Score of the fitted elastic-net model on the training and test splits,
# followed by the mean of a 5-fold cross-validation on the training split.
print('Accuracy score - train: ', lr_en_best.score(X_train, y_train))
print('Accuracy score - test: ', lr_en_best.score(X_test, y_test))
print('Mean CV score - train: ', cross_val_score(lr_en_best, X_train, y_train, cv=5).mean())