# Libraries
Pandas, Matplotlib, Numpy, Sklearn

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from pprint import pprint
import numpy as np
np.set_printoptions(precision=2)

Sklearn

In [None]:
from sklearn import svm, datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import scale
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import scale
from sklearn import linear_model

# Data
Cleaning & Descriptive Statistics

## Data Import

Importing

In [None]:
# Turn the Google Drive sharing link into a direct-download link and load the CSV
url = 'https://drive.google.com/file/d/1wk5pql6ML6jIi1h7aIS3bIUVZ9M3E5xs/view?usp=sharing'
file_id = url.split('/')[-2]  # second-to-last path segment of the sharing URL
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_csv(path)
df.sample(5)

Checking the features

In [None]:
# Show the rows in which the outcome lnpercapitaconsumption is missing
df[df["lnpercapitaconsumption"].isnull()]

Checking the role of dummy variables for the category "fuel"

In [None]:
# First three rows, column positions 1 to 7
df.iloc[0:3, 1:8]

Checking the role of dummy variables for the category "water"

In [None]:
# First three rows, column positions 8 to 14
df.iloc[0:3, 8:15]

## Data Processing

In [None]:
# Set aside the non-feature columns of the first 23152 rows; they are joined
# back onto the final predictions at the end of the notebook.
later_columns = ['training', 'percapitaconsumption', 'poor', 'h_hhsize',
                 'id_for_matlab', 'hhid', 'lncaphat_OLS', 'percapitahat_OLS']
df_later = df[later_columns][:23152]

Taking dummy-variable trap into consideration for linear regression

---

The dummy-variable trap does not need to be taken into consideration for RF --> this step can be skipped for RF


In [None]:
# One dummy per categorical group is removed (see the dummy-variable trap note above)
reference_dummies = [
    'd_crowd_lessthan1', 'd_lux_0', 'd_fuel_other', 'd_water_other', 'd_wall_other',
    'd_roof_other', 'd_floor_other', 'd_h_educ_none', 'd_insurance_0',
]
df = df.drop(columns=reference_dummies)
print("After dropping: The dimensions of the dataframe are: ", df.shape)

In [None]:
# Drop the columns that were copied into df_later above.
# (The former second statement, df.drop([], axis=1), dropped nothing and was removed.)
df = df.drop(columns=['training', 'percapitaconsumption', 'poor', 'h_hhsize',
                      'id_for_matlab', 'hhid', 'lncaphat_OLS', 'percapitahat_OLS'])

Identifying NaN

In [None]:
# Number of missing values per column, largest first
df.isnull().sum().sort_values(ascending = False)

Since most of the NaNs occur in the outcome variable, it does not make sense to keep these observations. Hence, we drop all observations that contain any NaN.

In [None]:
# Report the shape before any row is removed
print("Before dropping NAs:")
print(df.shape)
# The prediction set is the first 23152 rows, taken BEFORE dropna so that rows
# with missing values are still part of it
dfPred = df[:23152]
print("\nPrediction Set:")
print(dfPred.shape)
# The training set keeps only rows without any missing value
df = df.dropna() 
print("\nTraining Set:")
print(df.shape)

# Machine Learning Techniques
Data Splitting & Algorithms

## Data Splitting

Training Data Partitioning 
- 80% Train
- 20% Validation

In [None]:
# Separate the predictors from the outcome
X = df.drop(columns="lnpercapitaconsumption")
y = df["lnpercapitaconsumption"]

# 80/20 split; the fixed random_state keeps the partition reproducible.
# train_test_split is already imported in the import cell at the top.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2 , random_state=4)

print("Dimensions of the train set: ")
print(X_train.shape)
print(y_train.shape)
print("")
print("Dimensions of the test set: ")
print(X_test.shape)
print(y_test.shape)

## Machine Learning Algorithm
Predictions Methods

### Random Forest

In [None]:
# Number of predictor columns. It is taken from X rather than df so that the
# outcome column is not counted; p sizes `max_features` in the search grids below.
p = X.shape[1]
print(p)
# Untuned forest that serves as the base estimator for the searches below
rf_default = RandomForestRegressor(random_state = 4)

In [None]:
# function for model evaluation
def evaluate(model, test_features, test_labels, y_pred=None):
    """Print regression error metrics for `model` and return its accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_features : features of the held-out observations.
    test_labels : true outcome values of the held-out observations.
    y_pred : optional predictions used for MAE / MSE / RMSE. When omitted,
        the predictions computed from `model` are used, so all printed
        metrics refer to the same predictions.

    Returns
    -------
    float
        ``100 - MAPE`` (in percent), the value printed as "Accuracy".
    """
    predictions = model.predict(test_features)
    if y_pred is None:
        y_pred = predictions

    errors = abs(predictions - test_labels)
    # Absolute value in the denominator keeps the percentage error non-negative
    # even if a label is negative; unchanged for positive labels.
    mape = 100 * np.mean(errors / np.abs(test_labels))
    accuracy = 100 - mape

    mse = mean_squared_error(test_labels, y_pred)
    print("Model Performance:\n")
    print("Accuracy: {:0.2f}%".format(accuracy))
    print("Mean Absolute Error:", mean_absolute_error(test_labels, y_pred))
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", np.sqrt(mse))

    # Callers store the result (e.g. rf_random_accuracy), so hand the value back
    return accuracy

##### maxDepth model Grid Search

maxDepth RF Hyperparameter Tuning

In [None]:
# Random Hyperparameter Grid MaxDEPTH

# Number of trees in random forest; 6 values: 1000 to 2000 trees in steps of 200
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 6)]
# Tree depth is not tuned for this model: the random grid below has no
# "max_depth" entry, so the unused max_depth list was removed from this cell.
# Number of features to consider at every split; 3 values
max_features = [int(p/ 2.0), 'sqrt', "log2"]
# Minimum number of samples required at each leaf node; 3 values
min_samples_leaf = [1, 2, 4]
# Minimum number of samples required to split a node; 3 values
min_samples_split = [2, 5, 10]
# Method of selecting samples for training each tree; 1 value
bootstrap = [True]

In [None]:
# Candidate values for the randomized search (no depth restriction in this grid)
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

In [None]:
# Randomized hyperparameter search:
# 100 sampled combinations, 3-fold cross validation, all available cores
# (not more combinations due to computation time)
search_options = dict(n_iter=100, cv=3, verbose=7, random_state=4, n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rf_default,
                               param_distributions=random_grid,
                               **search_options)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validation score
print("Below you can see the best parameters chosen by the model:")
pprint(rf_random.best_params_)

In [None]:
# evaluate Random Search Model
# Score the best estimator of the randomized search on the held-out split

best_random = rf_random.best_estimator_
pred_rf_random = rf_random.predict(X_test)
rf_random_accuracy = evaluate(best_random, X_test, y_test, pred_rf_random)

maxDepth Grid Search with Cross Validation


In [None]:
# Create the parameter grid based on the results of Random Search 

# Number of trees in random forest
n_estimators2 = [1500,1600,1700]
# Tree depth is not tuned for this model: param_grid has no "max_depth" entry,
# so the unused max_depth list was removed from this cell.
# Number of features to consider at every split
max_features2 = ["sqrt"]
# Minimum number of samples required at each leaf node
min_samples_leaf2 = [1]
# Minimum number of samples required to split a node
min_samples_split2 = [9,10,11]
# Method of selecting samples for training each tree
bootstrap2 = [True]

# 3*1*1*3*1 = 9 possible settings which will be tested

param_grid = {"n_estimators": n_estimators2,
              "max_features": max_features2,
              "min_samples_leaf": min_samples_leaf2,
              "min_samples_split": min_samples_split2,
              "bootstrap": bootstrap2}


In [None]:
# Exhaustive search over param_grid with 3-fold cross validation on all cores
rf_grid = GridSearchCV(
    estimator=rf_default,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=9,
)
rf_grid.fit(X_train, y_train)

In [None]:
# Report the grid point with the best cross-validation score
print("Below you can see the best parameters chosen by the model:\n")
pprint(rf_grid.best_params_)

In [None]:
# evaluate Grid Search Model
# Score the best estimator of the grid search on the held-out split
best_grid = rf_grid.best_estimator_
pred_rf_grid = best_grid.predict(X_test)
rf_grid_accuracy = evaluate(best_grid, X_test, y_test, pred_rf_grid)

In [None]:
# Display the selected estimator together with its settings
best_grid 

# save this setting as maxDepth

In [None]:
# "maxDepth" forest with the saved settings (trees are not depth-limited).
# Only the non-default settings are spelled out: the former explicit
# criterion='mse' and min_impurity_split=None arguments are rejected by recent
# scikit-learn releases, and every other removed argument was at its default.
rf_maxDepth = RandomForestRegressor(n_estimators=1700,
                                    max_depth=None,
                                    max_features='sqrt',
                                    min_samples_leaf=1,
                                    min_samples_split=11,
                                    bootstrap=True,
                                    random_state=4)
rf_maxDepth.fit(X_train,y_train)

# Held-out predictions; the binarization cells in the Results section read y_pred_maxDepth
y_pred_maxDepth = rf_maxDepth.predict(X_test)

rf_maxDepth

##### lessDepth model Grid Search

lessDepth RF Hyperparameter Tuning

In [None]:
# Random Hyperparameter Grid lessDEPTH

# Number of trees in random forest; 6 values: 1000 to 2000 trees in steps of 200
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 6)]
# Maximum number of levels in tree; 3 values: 50, 60 and None (no limit)
max_depth = [int(x) for x in np.linspace(50, 60, num = 2)]
max_depth.append(None)
# Number of features to consider at every split; 3 values
max_features = [int(p/ 2.0), 'sqrt', "log2"]
# Minimum number of samples required at each leaf node; 3 values
min_samples_leaf = [1, 2, 4]
# Minimum number of samples required to split a node; 3 values
min_samples_split = [2, 5, 10]
# Method of selecting samples for training each tree; 1 value
bootstrap = [True]



In [None]:
# Random Grid less Depth: same candidates as before plus a depth restriction
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

In [None]:
# Show the candidate values that the randomized search samples from
pprint(random_grid)

In [None]:
# Randomized hyperparameter search for the lessDepth model:
# 100 sampled combinations, 3-fold cross validation, all available cores
# (not more combinations due to computation time)
search_options = dict(n_iter=100, cv=3, verbose=7, random_state=4, n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rf_default,
                               param_distributions=random_grid,
                               **search_options)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validation score
print("Below you can see the best parameters chosen by the model:")
pprint(rf_random.best_params_)

In [None]:
# evaluate Random Search Model
# Score the best estimator of the randomized search on the held-out split.
# NOTE(review): rf_random, best_random, pred_rf_random and rf_random_accuracy
# reuse the names of the maxDepth section and overwrite its results.

best_random = rf_random.best_estimator_
pred_rf_random = rf_random.predict(X_test)
rf_random_accuracy = evaluate(best_random, X_test, y_test, pred_rf_random)

lessDepth Grid Search with Cross Validation


In [None]:
# Create the parameter grid based on the results of Random Search 

# Number of trees in random forest
n_estimators2 = [1500,1600,1700]
# Maximum number of levels in tree
max_depth = [48,50,52]
# Number of features to consider at every split
max_features2 = ["sqrt"]
# Minimum number of samples required at each leaf node
min_samples_leaf2 = [1]
# Minimum number of samples required to split a node
min_samples_split2 = [9,10,11]
# Method of selecting samples for training each tree
bootstrap2 = [True]

# 3*3*1*1*3*1 = 27 possible settings which will be tested

# "max_depth" has to be part of the grid: without it this search is identical
# to the maxDepth grid search and the depth values above are never tried.
param_grid2 = {"n_estimators": n_estimators2,
               "max_depth": max_depth,
               "max_features": max_features2,
               "min_samples_leaf": min_samples_leaf2,
               "min_samples_split": min_samples_split2,
               "bootstrap": bootstrap2}


In [None]:
# Exhaustive search over param_grid2 with 3-fold cross validation on all cores
rf_grid2 = GridSearchCV(
    estimator=rf_default,
    param_grid=param_grid2,
    cv=3,
    n_jobs=-1,
    verbose=9,
)
rf_grid2.fit(X_train, y_train)

In [None]:
# Report the grid point with the best cross-validation score
print("Below you can see the best parameters chosen by the model:\n")
pprint(rf_grid2.best_params_)

In [None]:
# evaluate Grid Search Model
# Score the best estimator of the lessDepth grid search on the held-out split
best_grid2 = rf_grid2.best_estimator_
pred_rf_grid2 = rf_grid2.predict(X_test)
evaluate(best_grid2, X_test, y_test, pred_rf_grid2)

In [None]:
# Display the selected estimator together with its settings
best_grid2

# save this setting as lessDepth

In [None]:
# "lessDepth" forest with the saved settings (trees limited to 68 levels).
# Only the non-default settings are spelled out: the former explicit
# criterion='mse' and min_impurity_split=None arguments are rejected by recent
# scikit-learn releases, and every other removed argument was at its default.
rf_lessDepth = RandomForestRegressor(n_estimators=1900,
                                     max_depth=68,
                                     max_features='sqrt',
                                     min_samples_leaf=1,
                                     min_samples_split=11,
                                     bootstrap=True,
                                     random_state=4)

rf_lessDepth.fit(X_train,y_train)

# Held-out predictions; the binarization cells in the Results section read y_pred_lessDepth
y_pred_lessDepth = rf_lessDepth.predict(X_test)

rf_lessDepth

### OLS

Baseline Model

In [None]:
# Baseline: ordinary least squares fitted on the training split
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split
y_ols = lm.predict(X_test)

Calculating the MSE

In [None]:
# Held-out MSE of the OLS baseline
mean_squared_error(y_test, y_ols)

### Ridge Regression

Setting up empty lists and alphas (= penalty parameter)

In [None]:
# Number of training observations
n = len(X_train)

# Init alpha grid: 100 penalties, log-spaced between 10**-2 and 10**5.
# The reshape makes alphas a (100, 1) column, so iterating over it yields
# one-element arrays rather than plain floats.
n_grid = 100
alphas = 10**np.linspace(-2,5,n_grid).reshape(-1,1)
ridge = Ridge()

# Create empty lists to save the results
ridge_coefs = []
ridge_MSE = []

# Resetting the index (the old index is kept as an "index" column and dropped
# again after the predictions have been appended)
y_ridge = y_test.reset_index()

For every penalty parameter, make predictions

In [None]:
# Fit one ridge model per penalty value and keep its coefficients and its
# held-out predictions. Both containers are rebuilt from scratch so that
# re-running this cell does not append duplicate entries.
ridge_coefs = []
ridge_preds = []

# Loop over values of alpha
for a in alphas:
    ridge.set_params(alpha = a)
    ridge.fit(X_train, y_train)

    # Saving the coefficients
    ridge_coefs.append(ridge.coef_)

    # Predicting
    ridge_preds.append(pd.Series(ridge.predict(X_test), name="pred"))

# Column 0 holds the actual outcome, columns 1..len(alphas) hold the predictions
# (one "pred" column per alpha, in the order of `alphas`). A single concat
# replaces growing the frame inside the loop.
y_ridge = pd.concat([y_test.reset_index(drop=True)] + ridge_preds, axis=1)

Calculate the MSE

In [None]:
# MSE of every prediction column (1..end) against the actual outcome (column 0).
# The list is assigned rather than appended to, so re-running the cell does not
# duplicate its entries.
ridge_MSE = [mean_squared_error(y_ridge.iloc[:, 0], y_ridge.iloc[:, i])
             for i in range(1, len(y_ridge.columns))]

In [None]:
# Position of the penalty with the lowest held-out MSE
best_ridge_idx = ridge_MSE.index(min(ridge_MSE))

print("Coefficients enabling the lowest MSE: ")
print(ridge_coefs[best_ridge_idx])
print("")
# Message corrected: it used to read "highest lowest MSE"
print("Alpha enabling the lowest MSE: ")
print(alphas[best_ridge_idx])

In [None]:
# Coefficient paths: one grey line per predictor, penalty on a log-scaled x-axis
fig, (ax1) = plt.subplots(1,1)
fig.suptitle('Ridge Regression Coefficients')

# Plot coefficients - absolute
# NOTE(review): the y-label says "Standardized", but the ridge models above are
# fitted on X_train as is - confirm whether the features are standardized.
ax1.plot(alphas, ridge_coefs, c='grey', alpha=0.3)
ax1.set_xscale('log')
ax1.set_xlabel('lambda'); ax1.set_ylabel('Standardized coefficients');

Visualizing the best (left, green) and worst (right, red) ridge predictions. Each graph plots the predicted against the actual lnpercapitaconsumption.

In [None]:
# Left: predictions of the alpha with the lowest MSE; right: alpha with the highest MSE.
# The "+1" skips column 0 of y_ridge, which holds the actual outcome.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Predicted (x-axis) and Actual (y-axis) lnpercapitaconsumption')
ax1.scatter(y_ridge.iloc[:, ridge_MSE.index(min(ridge_MSE))+1], y_ridge["lnpercapitaconsumption"], color = "green", alpha = 0.1)
ax2.scatter(y_ridge.iloc[:, ridge_MSE.index(max(ridge_MSE))+1], y_ridge["lnpercapitaconsumption"], color = "red", alpha = 0.1)

Calculating the MSE

In [None]:
# Lowest held-out MSE over all ridge penalties, kept as ridgeMSE
print(min(ridge_MSE))
ridgeMSE = min(ridge_MSE)

Saving best predictions for Ridge Regression

In [None]:
# Prediction column of the best penalty ("+1" skips the actual outcome in column 0)
y_pred_ridge = y_ridge.iloc[:, ridge_MSE.index(min(ridge_MSE))+1]
y_pred_ridge

### Lasso

In [None]:
# Get MSE
def cv_lasso(X,y,a):
    """Return the mean 10-fold cross-validated MSE of a Lasso with penalty `a`.

    Parameters
    ----------
    X : pandas DataFrame of predictors (rows are selected with ``.iloc``).
    y : pandas Series with the outcome, aligned with `X` by position.
    a : penalty strength; a plain number or a one-element array.

    Returns
    -------
    float
        Average of the ten per-fold mean squared errors.
    """
    # The alpha grids in this notebook have shape (n, 1), so `a` can arrive as a
    # one-element array. Recent scikit-learn releases only accept a real number
    # for Lasso's alpha, hence the explicit conversion.
    alpha = float(np.ravel(a)[0])

    # Consecutive, unshuffled folds - the split is deterministic
    kf10 = KFold(n_splits=10, random_state=None, shuffle=False)

    # Init mse
    mse = []

    # Loop over splits
    for train_index, test_index in kf10.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lasso = Lasso(alpha=alpha).fit(X_train, y_train)
        y_hat = lasso.predict(X_test)
        mse.append(mean_squared_error(y_test, y_hat))
    return np.mean(mse)

In [None]:
# Compute MSE over grid of alphas
# Cross validation runs on the training split only, so the held-out test set
# does not influence the choice of alpha. The grid is kept one-dimensional so
# every alpha is a plain scalar.
n_grid = 50
alphas = np.logspace(-4, 0, n_grid)
MSE = [cv_lasso(X_train, y_train, a) for a in alphas]

In [None]:
# Find minimum alpha: the penalty with the lowest cross-validated MSE
alpha_min = alphas[np.argmin(MSE)]
print('Best alpha by 10fold CV:',alpha_min)
print('MSE for best alpha:', min(MSE))
# Keep the best cross-validated MSE as lassoMSE
lassoMSE = min(MSE)

In [None]:
# Get coefficients
coefs = []

# Loop over values of alpha. np.ravel + float hand Lasso a plain number whether
# `alphas` is one-dimensional or an (n, 1) column; recent scikit-learn releases
# reject an array-valued alpha.
for a in np.ravel(alphas):
    lasso = Lasso(alpha=float(a)).fit(scale(X), y)
    coefs.append(lasso.coef_)
# One row per alpha, one column per predictor
coefs = np.reshape(coefs,(n_grid,-1))

In [None]:
# MSE by 10-Fold CV as a function of the penalty; the dashed line marks alpha_min
plt.plot(alphas, MSE, alpha=1)
plt.xlabel('Lambda')
plt.ylabel('MSE')
plt.axvline(alpha_min, c='k', ls='--')

In [None]:
# Refit the Lasso with the selected penalty and predict the held-out split.
# alpha_min may be a one-element array (it is picked from the alpha grid);
# Lasso needs a plain number, so it is converted explicitly.
lasso = Lasso(alpha=float(np.ravel(alpha_min)[0]))
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

In [None]:
# Held-out MSE of the Lasso (arguments in scikit-learn's (y_true, y_pred) order)
mean_squared_error(y_test, y_pred_lasso)

In [None]:
# Wrap the Lasso predictions in a one-column frame and give y_test a 0..n-1
# index so that both can be aligned by position.
# NOTE(review): y_pred_lasso is a DataFrame from here on, and y_test is
# re-indexed in place - later cells see the modified objects.
y_pred_lasso = pd.DataFrame(y_pred_lasso, columns=["y_pred_lasso"])
y_test.index = np.arange(0, len(y_test))

In [None]:
# `results` is not created in any earlier cell of this notebook, so merging into
# it raised a NameError on a fresh kernel. It is built here instead: the Lasso
# predictions next to the actual held-out outcome, matched on the 0..n-1 index.
results = pd.merge(y_pred_lasso, y_test.rename("y_test"), left_index=True, right_index=True)
results

# Results

## Binarization
Converting the regression results into binary classes (poor/not_poor) based on quantile threshold

In [None]:
def convert_to_bin(y,quantile):
  """Binarize `y` at its own `quantile`.

  Parameters
  ----------
  y : one-dimensional sequence, Series, array or single-column DataFrame of numbers.
  quantile : float in [0, 1]; the quantile of `y` used as the threshold.

  Returns
  -------
  list of int
      1 for every value strictly above the threshold, 0 otherwise
      (so 0 marks the observations at or below the quantile).
      An empty input gives an empty list.
  """
  # Flatten to a 1-D float array. Iterating a DataFrame directly would yield its
  # column names instead of its values.
  values = np.asarray(y, dtype=float).ravel()
  if values.size == 0:
    return []

  # The threshold is computed once instead of once per element
  threshold = np.quantile(values, quantile)
  y_bin = (values > threshold).astype(int).tolist()

  return y_bin

### Quantile = 0.25

In [None]:
# Preview of the ridge predictions binarized at the 0.20 quantile.
# NOTE(review): this cell sits under the "Quantile = 0.25" heading, and
# y_pred_ridge_bin20 is computed again in the "Quantile = 0.20" section.
quant = 0.20
y_pred_ridge_bin20 = convert_to_bin(y_pred_ridge,quant)
y_pred_ridge_bin20

In [None]:
# binarize using quantile = 0.25
# Each series is cut at its OWN 0.25 quantile (convert_to_bin takes the
# threshold from the values it is given).
quant = 0.25
y_test_bin25 = convert_to_bin(y_test,quant)
y_pred_lasso_bin25 = convert_to_bin(y_pred_lasso,quant)
y_pred_ridge_bin25 = convert_to_bin(y_pred_ridge,quant)
y_pred_lessDepth_bin25 = convert_to_bin(y_pred_lessDepth,quant)
y_pred_maxDepth_bin25 = convert_to_bin(y_pred_maxDepth,quant)

### Quantile = 0.30

In [None]:
# binarize using quantile = 0.30
# Each series is cut at its OWN 0.30 quantile (convert_to_bin takes the
# threshold from the values it is given).
quant = 0.30
y_test_bin30 = convert_to_bin(y_test,quant)
y_pred_lasso_bin30 = convert_to_bin(y_pred_lasso,quant)
y_pred_ridge_bin30 = convert_to_bin(y_pred_ridge,quant)
y_pred_lessDepth_bin30 = convert_to_bin(y_pred_lessDepth,quant)
y_pred_maxDepth_bin30 = convert_to_bin(y_pred_maxDepth,quant)

### Quantile = 0.20

In [None]:
# binarize using quantile = 0.20
# Each series is cut at its OWN 0.20 quantile (convert_to_bin takes the
# threshold from the values it is given).
quant = 0.20
y_test_bin20 = convert_to_bin(y_test,quant)
y_pred_lasso_bin20 = convert_to_bin(y_pred_lasso,quant)
y_pred_ridge_bin20 = convert_to_bin(y_pred_ridge,quant)
y_pred_lessDepth_bin20 = convert_to_bin(y_pred_lessDepth,quant)
y_pred_maxDepth_bin20 = convert_to_bin(y_pred_maxDepth,quant)

## Measuring Performance of Classification:
- **recall** = ratio of true positives to actual positives
- **fallout rate** = ratio of false positives to actual negatives
- **specificity** = the complement of the fallout rate (1 − fallout rate), i.e. the ratio of true negatives to actual negatives
- **precision** = ratio of true positives to predicted positives
- **accuracy** = correctly guessed points divided by the total number of points

In [None]:
# function used for plotting the confusion matrix and calculating the metrics presented above

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues,
                         name_of_fig = "figure.png"):
    """
    Plot and save the confusion matrix and print the targeting metrics.

    Label 0 (row 0 / column 0 of the matrix) is treated as the positive class:
    TP counts the observations labelled 0 in both `y_true` and `y_pred`.

    Parameters
    ----------
    y_true, y_pred : sequences of 0/1 labels (actual and predicted).
    classes : numpy array of display names, indexed by label value.
    normalize : if True, the plotted matrix shows row shares instead of counts.
        The printed metrics and the returned BPAC are always computed from the
        raw counts.
    title : plot title; a default is chosen when empty.
    cmap : matplotlib colormap used for the heat map.
    name_of_fig : file name handed to ``plt.savefig``.

    Returns
    -------
    float
        BPAC = poverty accuracy - |undercoverage - leakage|.

    Raises
    ------
    ValueError
        If fewer than two distinct labels occur, because the metrics need a
        2x2 matrix.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix (rows: actual label, columns: predicted label)
    counts = confusion_matrix(y_true, y_pred)
    if counts.shape[0] < 2:
        raise ValueError("plot_confusion_matrix needs both classes to be present "
                         "in y_true / y_pred to compute the metrics")

    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]

    # The metrics are taken from the raw counts BEFORE any normalization;
    # row-normalized shares would distort every ratio below.
    TP = counts[0,0]
    TN = counts[1,1]
    FP = counts[1,0]
    FN = counts[0,1]

    # `cm` is the matrix that is drawn: counts, or row shares when normalize=True
    if normalize:
        cm = counts.astype('float') / counts.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        cm = counts
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='Actual label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells darker than half of the maximum get white text for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.savefig(name_of_fig)

    # Targeting metrics; all rates except total accuracy are relative to the
    # number of actual positives (TP + FN).
    totalAccuracy = (TP+TN)/(FP+FN+TP+TN)
    povertyAccuracy = TP/(TP+FN)
    leakage = FP/(TP+FN)
    undercoverage = FN/(TP+FN)
    BPAC = TP/(TP+FN)-abs(FN/(TP+FN)-FP/(TP+FN))

    print("Total Accuracy = ", totalAccuracy)
    print("Poverty Accuracy = ", povertyAccuracy)
    print("Leakage = ", leakage)
    print("Undercoverage = ", undercoverage)
    print("BPAC = ", BPAC)

    return BPAC

### Confusion matrices for quantile = 25

In [None]:
# Lasso at the 0.25 quantile; the returned BPAC is kept for the summary below
BPAC_lasso25 = plot_confusion_matrix(y_test_bin25, y_pred_lasso_bin25, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Lasso')

In [None]:
# Ridge at the 0.25 quantile; the returned BPAC is kept for the summary below
BPAC_ridge25 = plot_confusion_matrix(y_test_bin25, y_pred_ridge_bin25, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Ridge')

In [None]:
# RF lessDepth at the 0.25 quantile. The function returns the BPAC, and the
# summary cell below prints BPAC_lessdepth25, so the result is stored under that name.
BPAC_lessdepth25 = plot_confusion_matrix(y_test_bin25, y_pred_lessDepth_bin25, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF less-depth model', name_of_fig="lessDepth25")

In [None]:
# RF maxDepth at the 0.25 quantile. The function returns the BPAC, and the
# summary cell below prints BPAC_maxdepth25, so the result is stored under that name.
BPAC_maxdepth25 = plot_confusion_matrix(y_test_bin25, y_pred_maxDepth_bin25, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF max-depth model')

In [None]:
# BPAC of all four models at the 0.25 quantile
print("BPAC_lasso with 0.25 quantile =", BPAC_lasso25)
print("BPAC_ridge with 0.25 quantile =", BPAC_ridge25)
print("BPAC_lessdepth with 0.25 quantile =", BPAC_lessdepth25)
print("BPAC_maxdepth with 0.25 quantile =", BPAC_maxdepth25)

### Confusion matrices for quantile = 20

In [None]:
# Lasso at the 0.20 quantile; the returned BPAC is kept for the summary below
BPAC_lasso20 = plot_confusion_matrix(y_test_bin20, y_pred_lasso_bin20, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Lasso')

In [None]:
# Ridge at the 0.20 quantile (title corrected: this plot shows the Ridge model)
BPAC_ridge20 = plot_confusion_matrix(y_test_bin20, y_pred_ridge_bin20, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Ridge')

In [None]:
# RF lessDepth at the 0.20 quantile (title corrected: this plot shows the less-depth model)
BPAC_lessdepth20 = plot_confusion_matrix(y_test_bin20, y_pred_lessDepth_bin20, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF less-depth model')

In [None]:
# RF maxDepth at the 0.20 quantile; the returned BPAC is kept for the summary below
BPAC_maxdepth20 = plot_confusion_matrix(y_test_bin20, y_pred_maxDepth_bin20, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF max-depth model')

In [None]:
# BPAC of all four models at the 0.20 quantile
print("BPAC_lasso with 0.2 quantile =", BPAC_lasso20)
print("BPAC_ridge with 0.2 quantile =", BPAC_ridge20)
print("BPAC_lessdepth with 0.2 quantile =", BPAC_lessdepth20)
print("BPAC_maxdepth with 0.2 quantile =", BPAC_maxdepth20)

### Confusion matrices for quantile = 30

In [None]:
# Lasso at the 0.30 quantile; the returned BPAC is kept for the summary below
BPAC_lasso30 = plot_confusion_matrix(y_test_bin30, y_pred_lasso_bin30, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Lasso')

In [None]:
# Ridge at the 0.30 quantile; the returned BPAC is kept for the summary below
BPAC_ridge30 = plot_confusion_matrix(y_test_bin30, y_pred_ridge_bin30, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for Ridge')

In [None]:
# RF lessDepth at the 0.30 quantile. The function returns the BPAC, and the
# summary cell below prints BPAC_lessdepth30, so the result is stored under that name.
BPAC_lessdepth30 = plot_confusion_matrix(y_test_bin30, y_pred_lessDepth_bin30, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF less-depth model')

In [None]:
# RF maxDepth at the 0.30 quantile. The function returns the BPAC, and the
# summary cell below prints BPAC_maxdepth30, so the result is stored under that name.
BPAC_maxdepth30 = plot_confusion_matrix(y_test_bin30, y_pred_maxDepth_bin30, classes=np.array(["poor", 'not poor']),
                      title='Confusion matrix for RF max-depth model',name_of_fig="maxDepth30")

In [None]:
# BPAC of all four models at the 0.30 quantile
print("BPAC_lasso with 0.3 quantile =", BPAC_lasso30)
print("BPAC_ridge with 0.3 quantile =", BPAC_ridge30)
print("BPAC_lessdepth with 0.3 quantile =", BPAC_lessdepth30)
print("BPAC_maxdepth with 0.3 quantile =", BPAC_maxdepth30)

### Training on complete Training Set & Predict Values

In [None]:
# Replace every missing value in the prediction set by 0 so the model can predict on it.
# NOTE(review): this also zero-fills missing predictor values - confirm that 0
# is a sensible stand-in for them.
dfPred = pd.DataFrame(dfPred).fillna(0)

In [None]:
# Predictors and outcome column of the prediction set. The column mask is built
# from df.columns, so dfPred must have the same column layout as df.
XPred = dfPred.iloc[:, df.columns!="lnpercapitaconsumption"]
yPred = dfPred.loc[:, "lnpercapitaconsumption"]

In [None]:
# Train the final model on the complete labelled data (X, y).
# A fresh estimator is built from rf_maxDepth's settings; plain assignment would
# only alias the object, and fitting it would silently refit rf_maxDepth too.
rf_Final = RandomForestRegressor(**rf_maxDepth.get_params())
rf_Final.fit(X,y)

In [None]:
# Predict the log per-capita consumption for the prediction set
y_pred_Final = rf_Final.predict(XPred)

In [None]:
# One-column frame of the predictions with a default 0..n-1 index
Predictions = pd.DataFrame(y_pred_Final, columns=["lnpercapitaconsumption"])

In [None]:
# Attach the predictions to the columns set aside in df_later (joined on the index;
# every row of df_later is kept).
# NOTE(review): the join matches Predictions' 0..n-1 index against df_later's
# index - confirm that df_later still carries a default integer index.
df_Figure5 = Predictions.join(df_later, how= "right")
lncpc = df_Figure5["lnpercapitaconsumption"]
# Back-transform from logs and overwrite percapitaconsumption with the predicted level
new_values =  np.exp(lncpc)
df_Figure5["percapitaconsumption"] = new_values

In [None]:
# Write the prediction table to figure5_pred.csv in the working directory
df_Figure5.to_csv('figure5_pred.csv')