# 1. Data preprocessing and modeling


In [1]:
import pandas as pd
import numpy as np
import os
from joblib import dump
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from pandas.api.types import CategoricalDtype

# Seed NumPy's global random number generator so that steps drawing from it
# give the same result on a fresh top-to-bottom run.
np.random.seed(1111)
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

## Configuration

In [2]:
# Country whose loans the models are trained on. Some input features are
# country-specific: 'EE' and 'FI' each add their own credit score column below.
country = 'EE'

## Obtaining the data

The dataset is quite large so it should be only downloaded once. However if you plan to update your models once in a while then consider re-downloading it once per month as the dataset is usually updated daily. After downloading, the file is extracted and loaded into a DataFrame object and is also sorted in chronological order.

In [3]:
def download(url="https://bondora.com/marketing/media/LoanData.zip", data_dir="data"):
    """Download the loan data archive and extract it into `data_dir`.

    The archive is stored as `<data_dir>/Data.zip` and extracted into the same
    directory, so the CSV ends up where the loading cell reads it from
    (`data/LoanData.csv` with the default arguments).

    Args:
        url: Location of the zipped dataset.
        data_dir: Directory (relative to the working directory by default)
            that receives both the archive and the extracted files. It is
            created if it does not exist.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import zipfile
    from requests import get

    os.makedirs(data_dir, exist_ok=True)
    archive_path = os.path.join(data_dir, "Data.zip")

    # Fetch first and fail loudly on an HTTP error, so that a failed download
    # never leaves an empty or truncated archive on disk.
    response = get(url, timeout=120)
    response.raise_for_status()

    with open(archive_path, "wb") as file:
        file.write(response.content)

    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

#download()

In [None]:
# Read the extracted CSV and put the loans in chronological order of listing.
data = pd.read_csv("data/LoanData.csv").sort_values(by='ListedOnUTC')

## Inspecting the data

We can see that the dataset contains ~250000 loans and has 112 different columns. The earliest loans are from 2009.

In [5]:
print(f"DataFrame shape: {data.shape}")

DataFrame shape: (249251, 112)


In [6]:
data.head(5)

Unnamed: 0,ReportAsOfEOD,LoanId,LoanNumber,ListedOnUTC,BiddingStartedOn,BidsPortfolioManager,BidsApi,BidsManual,PartyId,NewCreditCustomer,...,PreviousEarlyRepaymentsCountBeforeLoan,GracePeriodStart,GracePeriodEnd,NextPaymentDate,NextPaymentNr,NrOfScheduledPayments,ReScheduledOn,PrincipalDebtServicingCost,InterestAndPenaltyDebtServicingCost,ActiveLateLastPaymentCategory
1364,2022-08-10,FA160D69-2682-4A60-8D8E-9BB700EA30CE,37,2009-02-21 14:12:39,2009-02-21 14:12:39,0,0.0,63.9118,{544DFBAC-374F-4039-AE45-9BB700E44853},True,...,0.0,,,,,,,0.0,0.0,
1365,2022-08-10,8E929B92-7C99-421D-8499-9BB70125F390,42,2009-02-21 17:50:14,2009-02-21 17:50:14,0,0.0,83.0851,{B7FDCB11-4CE9-4CDE-993F-9BB70103B180},True,...,0.0,,,,,,,0.0,0.0,
1366,2022-08-10,33B3F669-D0E3-4474-8045-9BB70128D064,43,2009-02-21 18:00:40,2009-02-21 18:00:40,0,0.0,322.7539,{E58803E6-77B6-40EB-83C4-9BB70118C245},True,...,0.0,,,,,,,0.0,0.0,
1367,2022-08-10,7074D9E8-E8F5-403B-8614-9BB701338AD4,44,2009-02-21 18:39:43,2009-02-21 18:39:43,0,0.0,252.451,{DE67C0EB-7534-47F2-87BD-9BB7011E112A},True,...,0.0,,,,,,,0.0,0.0,
1368,2022-08-10,39F2A312-CD6C-4F60-A7F6-9BB7014CE0A9,45,2009-02-21 20:11:58,2009-02-21 20:11:58,0,0.0,63.9116,{C41C37A5-B2D7-4B5A-B2D2-9BB70149B419},True,...,0.0,,,,,,,0.0,0.0,


In [7]:
data.tail(5)

Unnamed: 0,ReportAsOfEOD,LoanId,LoanNumber,ListedOnUTC,BiddingStartedOn,BidsPortfolioManager,BidsApi,BidsManual,PartyId,NewCreditCustomer,...,PreviousEarlyRepaymentsCountBeforeLoan,GracePeriodStart,GracePeriodEnd,NextPaymentDate,NextPaymentNr,NrOfScheduledPayments,ReScheduledOn,PrincipalDebtServicingCost,InterestAndPenaltyDebtServicingCost,ActiveLateLastPaymentCategory
248495,2022-08-10,0424EDB9-E0EF-4336-BA28-AEEC0131EEFA,2962067,2022-08-09 19:36:57,2022-08-09 22:36:57,5,0.0,61.0,{3A8B5D8E-B2D9-4A51-838E-AEEC0131EEFA},True,...,0.0,,,2022-08-26,1.0,60.0,,,,
248516,2022-08-10,48E98EA5-B4A4-41FA-903B-AEEC016F77B2,2962499,2022-08-09 19:40:01,2022-08-09 22:40:01,72,0.0,139.0,{1FDCB322-1FF6-4437-BEB7-AD6B00AB4563},True,...,0.0,,,2022-09-08,1.0,60.0,,,,
248487,2022-08-10,87717F19-9375-4793-8D46-AEEC01239B6D,2961934,2022-08-09 19:40:19,2022-08-09 22:40:19,0,0.0,16.0,{774AED01-A919-4EE6-96C6-A82E00E77A49},False,...,0.0,,,2022-09-12,1.0,24.0,,,,
248517,2022-08-10,FEE0B41E-8B20-472B-9116-AEEC017B9CBD,2962534,2022-08-09 20:12:12,2022-08-09 23:12:12,33,0.0,7.0,{EDE9E436-815F-4A38-A852-AA3E00AA5BBE},True,...,0.0,,,2022-09-12,1.0,60.0,,,,
248483,2022-08-10,6F9BB382-3E23-47B6-A6E7-AEEC011F41B3,2961885,2022-08-09 20:57:14,2022-08-09 23:57:14,94,0.0,5.0,{F3FD2C55-B407-4013-BD4E-AE5A0144B405},True,...,0.0,,,2022-08-26,1.0,60.0,,,,


The data contains various numerical and categorical columns. To fully understand the data and proceed with cleaning the dataset, one must work through the columns one by one. Visual inspection is very helpful when working with this dataset. It is also necessary to statistically describe different columns as it helps to get a better understanding of the data. Below are some examples on how some of the numerical and categorical columns were analyzed.

Another helpful resource is the overview of the codes and terms that are used in the dataset, it can be found on the [public-reports](https://www.bondora.com/en/public-reports) section on Bondora's website.

In [8]:
# Describing a numerical column.
data.Amount.describe()

count    249251.000000
mean       2590.089617
std        2145.767989
min           6.390000
25%         744.000000
50%        2125.000000
75%        4150.000000
max       10632.000000
Name: Amount, dtype: float64

In [9]:
# Numerical column value counts.
data.Amount.value_counts(dropna=False)

4150.0    24300
530.0     20337
531.0     16757
2126.0     9971
2125.0     9046
          ...  
2773.0        1
3596.0        1
2986.0        1
5231.0        1
953.7         1
Name: Amount, Length: 7177, dtype: int64

In [10]:
# Describing a categorical column.
data.Education.describe()

count    249201.000000
mean          3.499220
std           1.300371
min          -1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: Education, dtype: float64

In [11]:
# Categorical column value counts.
data.Education.value_counts(dropna=False)

 4.0    84354
 3.0    67685
 5.0    58226
 1.0    30002
 2.0     6444
-1.0     2482
 NaN       50
 0.0        8
Name: Education, dtype: int64

## Cleaning and filtering the data

The first step is to drop the unnecessary columns. Many of these columns are obsolete (see Bondora API docs) or serve no purpose for us. However some of the columns are not marked as obsolete in the API docs and half-way through the dataset they are filled only with null values. The reason behind the null values is most likely EU's data protection law (GDPR). Most of these columns were detected by visually going through the dataset and since half of the data is missing in these columns, there is really no point in keeping them.

We keep the columns that can be used as inputs for the models and we also keep the columns that will help us with validating the models in the later stages.

In [12]:
# Remove the columns that are neither model inputs nor needed for validation.
data = data.drop(columns=[
    'BiddingStartedOn', 'LoanApplicationStartedDate',
    'ApplicationSignedHour', 'ApplicationSignedWeekday',
    'DateOfBirth', 'County', 'City', 'UseOfLoan',
    'MaritalStatus', 'NrOfDependants',
    'EmploymentStatus', 'EmploymentPosition', 'WorkExperience', 'OccupationArea',
    'IncomeFromPrincipalEmployer', 'IncomeFromPension', 'IncomeFromFamilyAllowance',
    'IncomeFromSocialWelfare', 'IncomeFromLeavePay', 'IncomeFromChildSupport', 'IncomeOther',
    'RefinanceLiabilities', 'DebtToIncome', 'FreeCash', 'MonthlyPaymentDay',
    'EL_V0', 'Rating_V0', 'EL_V1', 'Rating_V1', 'Rating_V2',
    'PrincipalWriteOffs', 'InterestAndPenaltyWriteOffs', 'PlannedPrincipalTillDate',
    'CreditScoreEsEquifaxRisk', 'CreditScoreEsMicroL',
    'BidsPortfolioManager', 'BidsApi', 'BidsManual',
    'PrincipalDebtServicingCost', 'InterestAndPenaltyDebtServicingCost',
    'ContractEndDate', 'LoanNumber', 'FirstPaymentDate', 'PlannedInterestTillDate',
    'CurrentDebtDaysPrimary', 'DebtOccuredOn',
    'CurrentDebtDaysSecondary', 'DebtOccuredOnForSecondary',
    'ExpectedLoss', 'LossGivenDefault', 'ExpectedReturn', 'ProbabilityOfDefault',
    'ActiveScheduleFirstPaymentReached',
    'PlannedPrincipalPostDefault', 'PlannedInterestPostDefault',
    'EAD1', 'EAD2',
    'PrincipalRecovery', 'InterestRecovery', 'RecoveryStage', 'StageActiveSince',
    'ModelVersion', 'NextPaymentNr', 'ReScheduledOn', 'PartyId',
])


Below is a class that processes the data so that it can be used for training the models.

First, all the allowed variables are defined with the *assign_variable_types()* method. These variable types are used to keep the data within our defined scope. For example, if the "Education" field contains a value of "-1", which is not one of our defined categories (1, 2, 3, 4, 5), it is set to NaN and the row is later filtered out. Additionally, some data types are parsed to eliminate some of the data errors.

In order to reduce the size of the dataset some filters are applied in the *assign_filters()* method. Some of the columns are parsed into date format, so it's easier to filter the data. Also all the NaN values that exist in the categorical or numerical variables are dropped as the models usually can't handle NaN values.

Third step is to construct the target variable. We construct a target variable called "PreferLoan" which is a boolean value. Loans with previous issues are not to be preferred and are assigned the value "0". Loans without issues are assigned the value "1". 
Since we don't know the outcomes of active loans we don't include them. However there is an exception: if the loan is active but has been restructured or has had some other issues, then we keep it and define the loan as a "bad" loan, since we know for sure that the loan has had some sort of issues.
Another exception is that if the worst late category is 16-30 days or less and the loan has not had any other issues, then the loan is still preferred. 

In [13]:
class DataProcessor:
    """Turn the raw loan frame into a frame that can be used for training.

    The work is split into three steps that `process` runs in order:
    typing/cleaning of the columns, row filters, and construction of the
    binary `PreferLoan` target. All methods are classmethods and none of them
    modifies the frame passed in or the class-level configuration.
    """

    # Allowed values per categorical column. Values outside the list become
    # NaN when the category dtype is assigned, and those rows are dropped.
    CATEGORICAL_VARIABLES = {"Education": [1, 2, 3, 4, 5],
                             "EmploymentDurationCurrentEmployer": ['MoreThan5Years', 'UpTo5Years', 'UpTo4Years', 'UpTo3Years', 'UpTo2Years', 'UpTo1Year', 'TrialPeriod', 'Retiree', 'Other'],
                             "Gender": [0, 1],
                             "HomeOwnershipType": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                             "NewCreditCustomer": [1, 0],
                             "VerificationType": [1, 2, 3, 4],
                             "LoanDuration": [60, 48, 36, 30, 24, 18, 12, 9, 6],
                             "Rating": ['AA', 'A', 'B', 'C', 'D', 'E', 'F', 'HR'],
                             "LanguageCode": [1, 2, 3, 4, 6]}

    # Extra categorical columns that are only used for one country.
    EE_VARIABLES = {"CreditScoreEeMini": [600, 700, 800, 900, 1000]}
    FI_VARIABLES = {"CreditScoreFiAsiakasTietoRiskGrade": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    NUMERIC_VARIABLES = ["Age", "Amount", "LiabilitiesTotal",
                         "IncomeTotal", "MonthlyPayment", "ExistingLiabilities", "NoOfPreviousLoansBeforeLoan", "AmountOfPreviousLoansBeforeLoan", "PreviousRepaymentsBeforeLoan"]

    DATE_VARIABLES = ['LoanDate', 'MaturityDate_Original', 'MaturityDate_Last', 'ReportAsOfEOD', 'LastPaymentOn']

    @classmethod
    def _categories_for(cls, country):
        """Return the column -> allowed values mapping for `country`.

        A fresh dict is built on every call so the class-level
        `CATEGORICAL_VARIABLES` is never altered; processing one country
        therefore cannot leak its columns into a later run for another one.
        """
        categories = dict(cls.CATEGORICAL_VARIABLES)
        if country == 'EE':
            categories.update(cls.EE_VARIABLES)
        elif country == 'FI':
            categories.update(cls.FI_VARIABLES)
        return categories

    @classmethod
    def assign_variable_types(cls, df, country):
        """Set column dtypes and drop rows with missing model inputs.

        Args:
            df: Raw loan frame. It is copied, not modified.
            country: 'EE' or 'FI' adds that country's credit score column to
                the categorical columns; any other value adds nothing.

        Returns:
            A new frame in which the categorical columns have a category
            dtype restricted to the allowed values, the numeric columns are
            float64, the date columns are datetimes (unparseable values
            become NaT), and every row with a NaN in a categorical or numeric
            column has been removed.

        Raises:
            KeyError: If one of the expected columns is missing from `df`.
        """
        categories = cls._categories_for(country)
        df = df.copy()

        # Assign categories for categorical type columns. Casting to float
        # first is best-effort (errors are ignored), so text columns such as
        # "Rating" are left as they are.
        ordered_categorical_keys = sorted(categories)
        df[ordered_categorical_keys] = df[ordered_categorical_keys].astype("float64", errors='ignore')
        df[ordered_categorical_keys] = df[ordered_categorical_keys].apply(
            lambda column: cls.assign_categories(column, categories[column.name])
        )

        # Set types for columns.
        df[cls.NUMERIC_VARIABLES] = df[cls.NUMERIC_VARIABLES].astype("float64")
        df[ordered_categorical_keys] = df[ordered_categorical_keys].astype("category")
        df[cls.DATE_VARIABLES] = df[cls.DATE_VARIABLES].apply(pd.to_datetime, format='%Y-%m-%d', errors='coerce')

        # Drop all rows that have a NaN in any categorical or numerical variable.
        return df.dropna(subset=ordered_categorical_keys + cls.NUMERIC_VARIABLES)

    @classmethod
    def assign_categories(cls, column, categories=None):
        """Cast `column` to a categorical dtype limited to the allowed values.

        Args:
            column: Series whose `name` identifies the variable.
            categories: Allowed values. When omitted they are looked up by
                column name among the common and country-specific variables.

        Returns:
            The column as a categorical Series; values that are not among the
            allowed categories become NaN.
        """
        if categories is None:
            known = {**cls.CATEGORICAL_VARIABLES, **cls.EE_VARIABLES, **cls.FI_VARIABLES}
            categories = known[column.name]
        return column.astype(CategoricalDtype(categories=categories))

    @classmethod
    def assign_filters(cls, df):
        """Remove loans that should not be used for training.

        Expects the date columns to be datetimes already (see
        `assign_variable_types`). Returns a new, filtered frame.
        """
        # Do not include loans that are currently active, and have not been restructured (the original maturity date of the loan has not increased by at least 60 days).
        df = df.loc[~((df.Status == 'Current') & (df.Restructured == 0))]

        # Some loans have been repaid just days after signing a contract so they should be filtered out.
        df = df.loc[~(((df.MaturityDate_Last - df.LoanDate) < pd.to_timedelta("180days")) & (df.Status == 'Repaid'))]

        # Remove the most recent loans because most of them seem to be with the status "Late" or have not been updated.
        df = df.loc[(df.LoanDate <= (df.ReportAsOfEOD - pd.to_timedelta("120days")))]

        # Age must be between 18 and 70.
        df = df.loc[(df.Age >= 18) & (df.Age <= 70)]

        # Monthly payment must be over 0.
        df = df.loc[(df.MonthlyPayment > 0)]

        # Fill empty values with 0.
        # NOTE(review): when called through process(), rows with a missing
        # PreviousRepaymentsBeforeLoan were already dropped by
        # assign_variable_types (it is one of NUMERIC_VARIABLES), so the first
        # fill has nothing left to fill. Confirm whether those rows were meant
        # to be kept with a value of 0 instead.
        # "PreviousEarlyRepaymentsBefoleLoan" is spelled exactly as the code
        # has always referenced the column; do not "correct" it here.
        return df.assign(
            PreviousRepaymentsBeforeLoan=df["PreviousRepaymentsBeforeLoan"].fillna(0),
            PreviousEarlyRepaymentsBefoleLoan=df["PreviousEarlyRepaymentsBefoleLoan"].fillna(0),
        )

    @classmethod
    def construct_target_value(cls, df):
        """Add the binary `PreferLoan` target column.

        A loan gets the value 1 only if all of the following hold, otherwise 0:

        * Loan status is 'Repaid'.
        * WorseLateCategory is missing, '1-7', '8-15' or '16-30'.
        * The last maturity date is not later than the original maturity date.
        * The loan has not been restructured.
        * The loan has no default date.

        Returns:
            A new frame with the extra int64 column; `df` itself is not modified.
        """
        preferred = (
            (df.Status == 'Repaid') &
            (df.WorseLateCategory.isin([np.nan, '1-7', '8-15', '16-30'])) &
            (df.MaturityDate_Last <= df.MaturityDate_Original) &
            (df.Restructured != 1) &
            (df.DefaultDate.isnull())
        )
        return df.assign(PreferLoan=preferred.astype("int64"))

    @classmethod
    def process(cls, df, country):
        """Run typing, filtering and target construction in that order."""
        df = cls.assign_variable_types(df, country)
        df = cls.assign_filters(df)
        df = cls.construct_target_value(df)
        return df
        

In [14]:
# `process` is a classmethod, so it is called on the class itself.
df = DataProcessor.process(data, country)

In [15]:
print(f"DataFrame shape after processing: {df.shape}")

DataFrame shape after processing: (72943, 48)


## Preparing for modeling
These are the columns that are used as inputs for the models.

In [17]:
# Columns handed to transform(): the model features plus the 'PreferLoan'
# target, which transform() splits off again.
input_cols = [
    'NewCreditCustomer', 'VerificationType', 'LanguageCode', 'Age', 'Gender',
    'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment',
    'Education', 'EmploymentDurationCurrentEmployer', 'HomeOwnershipType', 'IncomeTotal',
    'ExistingLiabilities', 'LiabilitiesTotal', 'Rating',
    'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan',
    'PreviousRepaymentsBeforeLoan', 'PreferLoan',
]

# Append the credit score column that belongs to the selected country, if any.
input_cols.extend(
    {
        'EE': ['CreditScoreEeMini'],
        'FI': ['CreditScoreFiAsiakasTietoRiskGrade'],
    }.get(country, [])
)

The dataset is split into a training set and a validation set. The data is also shuffled and stratified by the target value.

In [18]:
# Split and shuffle the datasets.
# A fixed random_state makes this cell return the same split every time it is
# run. That matters because the fitted models are stored on disk: if the split
# changed between the training run and a later validation run, the validation
# set would contain rows the stored models were trained on.
train_df, val_df = train_test_split(
    df,
    test_size=0.4,
    shuffle=True,
    stratify=df.PreferLoan,
    random_state=1111,
)

In [19]:
train_df.shape

(43765, 48)

In [20]:
train_df.PreferLoan.value_counts(dropna=False)

0    33751
1    10014
Name: PreferLoan, dtype: int64

In [21]:
val_df.shape

(29178, 48)

In [22]:
val_df.PreferLoan.value_counts(dropna=False)

0    22502
1     6676
Name: PreferLoan, dtype: int64

Below is a function for transforming data into input and target values. One-hot encoding is also done for categorical variables.

In [23]:
# A transform function for transforming data into valid input.
def transform(df, input_cols):
    """Turn a processed loan frame into model inputs and the target.

    Args:
        df: Frame that contains at least the columns in `input_cols`.
        input_cols: Names of the feature columns plus 'PreferLoan'.

    Returns:
        (X, y): `X` holds the feature columns with the non-numeric ones
        one-hot encoded, `y` is the 'PreferLoan' column.

    Raises:
        ValueError: If any selected column contains a null value. The message
            lists the affected columns.
        KeyError: If a column from `input_cols` is missing from `df`.
    """
    # Filter so that only input columns are used.
    selected = df[input_cols]

    null_counts = selected.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0].index.tolist()
    if columns_with_nulls:
        raise ValueError(f"Null value in input: {columns_with_nulls}")

    # Separate the target before encoding so it can never be expanded into
    # dummy columns itself, whatever its dtype.
    y = selected['PreferLoan']

    # One-hot encoding.
    X = pd.get_dummies(selected.drop(columns='PreferLoan'))

    return X, y

## Training the models

We create a dictionary that consists of different models and hyperparameters for those models. After that we use RandomizedSearchCV for finding the optimal hyperparameters. In this instance we prefer it to GridSearchCV because it is a lot faster. However, with GridSearchCV it is possible to do a more exhaustive search and therefore potentially get more accurate hyperparameters. There are also various other algorithms for hyperparameter optimization that could work better than RandomizedSearchCV.

We create a dictionary of 3 different models that are commonly used for such classification tasks. Parameters with different possible values are also included.

In [24]:
# Candidate models and the hyperparameter values RandomizedSearchCV samples from.
model_params = {
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'max_depth': [50, 75, 100, 125, 150, 175, 200, None],
            # 'auto' is left out: for classifiers it meant the same as 'sqrt',
            # and newer scikit-learn releases no longer accept it.
            'max_features': ['sqrt', 'log2', 1, 2, 3, 4, 5, 6],
            'min_samples_leaf': [1, 2, 3, 4, 8, 10, 12, 15],
            # An integer min_samples_split must be at least 2, so 1 is not a valid candidate.
            'min_samples_split' : [2, 3, 4, 8, 10, 12, 15],
            'n_estimators': [10, 50, 75, 100, 125, 150, 175, 200],
            'criterion': ['gini', 'entropy']
        }
    },
    'logistic_regression' : {
        'model' : LogisticRegression(),
        'params' : {
            # Not every penalty/solver pair is supported; unsupported
            # combinations fail to fit and are scored as NaN by the search.
            # NOTE(review): newer scikit-learn releases expect None instead of
            # the string 'none' - adjust to the installed version.
            'penalty': ['none', 'l1', 'l2', 'elasticnet'],
            'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
            # C must be a positive float. The previous list contained "0,75",
            # which Python reads as the two values 0 (invalid) and 75.
            'C' : [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 2, 5, 10],
            'max_iter' : [10, 50, 75, 100, 150, 200, 250, 300]
        }
    },
    'xgboost' : {
        'model' : xgb.XGBClassifier(),
        'params' : {
            "max_depth": [1, 2, 3, 4, 5, 7],
            "gamma": [0, 0.1, 0.25, 0.5, 1],
            "reg_lambda": [0, 1, 5, 10, 15, 25, 50],
            "subsample": [0.1, 0.2, 0.5, 0.7],
            "colsample_bytree": [0.1, 0.25, 0.5, 0.75, 1],
            'scale_pos_weight' : [1, 3, 5, 10, 15, 20]
        }
    }
}

We create an empty list "scores" in which we append the training results and best parameters. We also use three different scoring functions to train the models as it is interesting to see how models with different scoring functions perform in the validation step. After that models with the best parameters are saved so they can be used in the validation step.

In [25]:
# Collects one entry per fitted search (model name, best score, best parameters).
scores = list()

# Scoring functions the hyperparameter search is run with.
score_func = list(('precision', 'f1', 'roc_auc'))

The data is transformed and then used for training the models.

In [None]:
X_train, y_train = transform(train_df, input_cols)

# Start from an empty result list so that re-running this cell does not append
# a second copy of every row.
scores = []

# joblib.dump does not create missing directories.
os.makedirs('models', exist_ok=True)

# One randomized search per (scoring function, model) pair; each fitted search
# is stored on disk under "<model>_<scoring>.joblib" for the validation step.
for s in score_func:
    for m_name, m_params in model_params.items():
        model_name = m_name + "_" + s

        clf = RandomizedSearchCV(m_params['model'], m_params['params'], cv = 3, return_train_score = False, scoring = s, n_iter=25, n_jobs=-1)
        clf.fit(X_train, y_train)
        scores.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
        dump(clf, f'models/{model_name}.joblib')

The results can be seen below. The mean cross-validated score of the best model can be seen for each model and scoring function. It is hard to draw any conclusions from these results because each scoring function is calculated differently. We also have yet to see how these models perform on the validation set.

In [27]:
pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

Unnamed: 0,model,best_score,best_params
0,random_forest_precision,0.869048,"{'n_estimators': 150, 'min_samples_split': 10,..."
1,logistic_regression_precision,0.643364,"{'solver': 'newton-cg', 'penalty': 'none', 'ma..."
2,xgboost_precision,0.626602,"{'subsample': 0.7, 'scale_pos_weight': 1, 'reg..."
3,random_forest_f1,0.2389,"{'n_estimators': 10, 'min_samples_split': 15, ..."
4,logistic_regression_f1,0.259578,"{'solver': 'newton-cg', 'penalty': 'none', 'ma..."
5,xgboost_f1,0.494391,"{'subsample': 0.7, 'scale_pos_weight': 3, 'reg..."
6,random_forest_roc_auc,0.755397,"{'n_estimators': 200, 'min_samples_split': 2, ..."
7,logistic_regression_roc_auc,0.707747,"{'solver': 'newton-cg', 'penalty': 'none', 'ma..."
8,xgboost_roc_auc,0.738615,"{'subsample': 0.7, 'scale_pos_weight': 1, 'reg..."


## Validation

We begin with the function *calculate_loan_outcomes()*, where the outcome for each loan is calculated. These columns will help us later with evaluating different classification thresholds.

The investment_amount variable is not needed, but I decided to add it because it can be useful for creating more advanced strategies. For example, invest more into loans with higher probability thresholds and less into loans with lower thresholds.

In [28]:
def calculate_loan_outcomes(df, investment_amount = 10):
    """Add the outcome of a fixed-size investment into every loan.

    Args:
        df: Loans with the columns LoanDate, LastPaymentOn (both datetimes),
            PrincipalPaymentsMade, InterestAndPenaltyPaymentsMade and Amount.
            The frame is copied, not modified.
        investment_amount: Amount invested into each loan.

    Returns:
        A copy of `df` with the added columns LoanLength (years),
        TotalRepayments, InvestmentPortionSize, Return, Profit, ROI (percent)
        and ROI_Annual (percent).

    Note:
        A loan without any payment date has LoanLength 0. Its annualised ROI
        is then computed with an infinite exponent: -100 when nothing was
        repaid, and infinite if the return exceeded the investment.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # does not silently gain columns.
    df = df.copy()

    # The duration between start of the loan and the last payment. If LastPaymentOn is null then duration is 0.
    df['LoanLength'] = np.where(df.LastPaymentOn.notnull(), (df.LastPaymentOn.sub(df.LoanDate).dt.days.div(365.25)), 0)

    # The total amount repaid by the borrower.
    df['TotalRepayments'] = df.PrincipalPaymentsMade + df.InterestAndPenaltyPaymentsMade

    # Calculate the portion size of the investment based on the amount invested.
    df['InvestmentPortionSize'] = investment_amount / df.Amount

    # Calculate returns.
    df['Return'] = df.InvestmentPortionSize * df.TotalRepayments

    # Calculate the profit.
    df['Profit'] = df.Return - investment_amount

    # Calculate the return on investment.
    df['ROI'] = ((df.Return - investment_amount) / investment_amount) * 100

    # Calculate the annual return on investment.
    df['ROI_Annual'] = ((1 + df.ROI / 100) ** (1/df.LoanLength) - 1) * 100

    return df

The *calculate_threshold_stats()* function calculates the statistics for a given threshold. These stats will be used for choosing the best model.

In [29]:
def calculate_threshold_stats(df, total_loans, threshold, filename):
    """Summarise the loans selected at one probability threshold.

    Args:
        df: The loans whose predicted probability reached `threshold`; needs
            the columns PreferLoan and ROI_Annual. May be empty.
        total_loans: Number of loans the percentage is calculated against.
        threshold: The probability threshold that produced `df`.
        filename: File name of the stored model; a trailing '.joblib' is
            stripped to get the model name.

    Returns:
        A dict with the model name, the threshold, the counts, the precision
        (share of selected loans that are preferred) and the mean annual ROI.
        When no loan was selected, the precision is NaN.
    """
    investments_made = df.shape[0]
    preferred_loans = (df.PreferLoan.values == 1).sum()

    # With no selected loans the precision is undefined. Returning NaN
    # explicitly avoids a 0/0 division, whose warning text differs between
    # NumPy versions.
    precision = preferred_loans / investments_made if investments_made else float('nan')

    result = {
        'Model': filename.removesuffix('.joblib'),
        'Threshold': threshold,
        'Total_loans' : total_loans,
        'Investments_made' : investments_made,
        'Investments_made_percentage' : investments_made / total_loans * 100,
        'No_of_preferred_loans' : preferred_loans,
        'Precision' : precision,
        'ROI_annual_mean': df.ROI_Annual.mean()
    }
    return result

The *validate_models()* function is the main function that starts the validation process. It iterates through all of the models and gets the predicted probabilities for each model. 
Different threshold levels are iterated and the DataFrame is filtered with the respective threshold levels, so only the loans where the predicted probability is higher or equal to the threshold are included. The stats are calculated for each threshold level and appended into a list that we use for choosing the most suitable model.

In [30]:
def validate_models(df, total_loans):
    """Score every stored model on `df` at a range of probability thresholds.

    Args:
        df: Validation loans including the outcome columns added by
            calculate_loan_outcomes(). The frame is not modified.
        total_loans: Number of loans the investment percentage refers to.

    Returns:
        A list with one stats dict (see calculate_threshold_stats) per stored
        model and threshold.
    """
    stats = []
    # Transform validation df into valid input for models.
    X_validation, _ = transform(df, input_cols)

    # Only pick up the stored models, in a fixed order; other entries in the
    # directory (hidden files, checkpoint folders) cannot be loaded as models.
    model_files = sorted(name for name in os.listdir('./models') if name.endswith('.joblib'))

    for filename in model_files:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code - only keep files in ./models that this notebook wrote itself.
        clf = load(f'./models/{filename}')
        predictions = clf.predict_proba(X_validation)

        # Attach the probability of the positive class to a copy of the frame.
        scored = df.assign(Prediction=predictions[:, 1])

        for threshold in np.arange(0.5, 1.0, 0.025):
            df_t = scored.loc[(scored.Prediction >= threshold)]
            t_stats = calculate_threshold_stats(df_t, total_loans, threshold, filename)
            stats.append(t_stats)

    return stats

## Interpreting the results

In [31]:
import warnings

# A threshold at which no loan is selected makes the precision 0/0. NumPy
# reports that as "... in long_scalars" in older releases and as
# "... in scalar divide" in newer ones, so both spellings are silenced.
warnings.filterwarnings(
    "ignore",
    message=r"invalid value encountered in (long_scalars|scalar divide)",
    category=RuntimeWarning,
)

def calculate_results(df, total_loans):
    """Compute loan outcomes, validate all stored models and rank the results.

    Args:
        df: Validation loans.
        total_loans: Number of loans the investment percentage refers to.

    Returns:
        A DataFrame indexed by model name with one row per model and
        threshold, rounded to 3 decimals and sorted by mean annual ROI,
        best first.
    """
    outcomes = calculate_loan_outcomes(df)
    threshold_stats = validate_models(outcomes, total_loans)
    return (
        pd.DataFrame(threshold_stats)
        .set_index("Model")
        .round(3)
        .sort_values(by='ROI_annual_mean', ascending=False)
    )

As can be seen below, the highest annual ROI is over 20%, but we must consider that only 1 investment was made. Overall, the number of investments made is quite low. To get a more accurate overview, we should increase the minimum number of investments made.

In [32]:
# Rank every model/threshold pair on the full validation set and show the ten best.
result = calculate_results(val_df, len(val_df))
result.head(10)

Unnamed: 0_level_0,Threshold,Total_loans,Investments_made,Investments_made_percentage,No_of_preferred_loans,Precision,ROI_annual_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
xgboost_roc_auc,0.925,29178,1,0.003,1,1.0,20.797
xgboost_f1,0.975,29178,2,0.007,2,1.0,15.443
logistic_regression_roc_auc,0.825,29178,32,0.11,26,0.812,12.38
logistic_regression_f1,0.875,29178,7,0.024,4,0.571,11.745
xgboost_f1,0.95,29178,33,0.113,27,0.818,11.621
logistic_regression_roc_auc,0.85,29178,15,0.051,12,0.8,11.517
xgboost_precision,0.75,29178,121,0.415,103,0.851,10.911
logistic_regression_roc_auc,0.9,29178,6,0.021,4,0.667,10.846
logistic_regression_roc_auc,0.875,29178,9,0.031,6,0.667,10.838
logistic_regression_f1,0.775,29178,62,0.212,50,0.806,10.776


If we set the investments made percentage to at least 1% of total loans, then we get an annual ROI in the range of 8.2 to 9.2%. So in this case if we invest in every ~100th loan from EE, then we should expect an annual ROI in this range. 

In [33]:
result.loc[(result.Investments_made_percentage >= 1)][:10]

Unnamed: 0_level_0,Threshold,Total_loans,Investments_made,Investments_made_percentage,No_of_preferred_loans,Precision,ROI_annual_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
xgboost_precision,0.675,29178,378,1.295,295,0.78,9.249
xgboost_f1,0.9,29178,323,1.107,259,0.802,8.952
xgboost_precision,0.65,29178,489,1.676,375,0.767,8.653
xgboost_roc_auc,0.7,29178,401,1.374,314,0.783,8.538
xgboost_roc_auc,0.725,29178,304,1.042,244,0.803,8.45
xgboost_roc_auc,0.675,29178,531,1.82,405,0.763,8.449
logistic_regression_roc_auc,0.7,29178,389,1.333,294,0.756,8.371
xgboost_roc_auc,0.65,29178,669,2.293,496,0.741,8.304
xgboost_f1,0.875,29178,566,1.94,443,0.783,8.294
random_forest_roc_auc,0.625,29178,354,1.213,291,0.822,8.218


We classified all loans that have had problems as "not preferable". But there is a possibility that some of these loans recover and payments resume. This means that these loans can still be active and we have no way of knowing their final outcome. The loans which have recovered and are still active also have an effect on the annual mean ROI. If we eliminate the loans that still have the status "Current", we should get more accurate results.

With the exclusion of currently active loans the annual mean ROI also increases by a percent or two. It should also be noted that we did not exclude loans with the status "Late", which might also have a chance to recover and therefore further increase the annual ROI.

It is good to see that we can consistently reach at least an 11% mean annual ROI. When choosing which model to use, there are multiple things to consider. There should be a good balance between precision and the number of investments made. If the precision is really high but the number of investments is really low, then in practice the model might not be viable, as the preferred loans are very rare.

In [34]:
# Leave out loans that are still active; the percentage still refers to the
# full validation set.
df_no_current = val_df[val_df["Status"] != 'Current']

result_no_current = calculate_results(df_no_current, len(val_df))
result_no_current[result_no_current["Investments_made_percentage"] >= 1].head(10)

Unnamed: 0_level_0,Threshold,Total_loans,Investments_made,Investments_made_percentage,No_of_preferred_loans,Precision,ROI_annual_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
xgboost_f1,0.9,29178,309,1.059,259,0.838,11.255
xgboost_precision,0.675,29178,360,1.234,295,0.819,11.25
logistic_regression_roc_auc,0.7,29178,362,1.241,294,0.812,11.098
random_forest_roc_auc,0.575,29178,574,1.967,481,0.838,11.027
xgboost_precision,0.65,29178,464,1.59,375,0.808,10.942
random_forest_roc_auc,0.625,29178,336,1.152,291,0.866,10.939
random_forest_roc_auc,0.6,29178,445,1.525,377,0.847,10.876
xgboost_roc_auc,0.65,29178,632,2.166,496,0.785,10.85
xgboost_roc_auc,0.625,29178,758,2.598,586,0.773,10.85
xgboost_roc_auc,0.675,29178,508,1.741,405,0.797,10.822


Another way to increase the annual ROI is to only invest into loans which have a higher interest and therefore a better return. If we set the minimum interest to 20% we can see that the ROI also increases by quite a lot. However the trade-off is that the precision is lower and we should expect more defaults. But it seems like the extra defaults don't affect the ROI that much and we're still able to achieve a maximum ROI of 15% while investing into ~1% of loans from EE.

In [40]:
# Leave out active loans and keep only loans with an interest of at least 20;
# the percentage still refers to the full validation set.
df_interest = val_df[(val_df["Status"] != 'Current') & (val_df["Interest"] >= 20)]

result_interest = calculate_results(df_interest, len(val_df))
result_interest[result_interest["Investments_made_percentage"] >= 1].head(10)

Unnamed: 0_level_0,Threshold,Total_loans,Investments_made,Investments_made_percentage,No_of_preferred_loans,Precision,ROI_annual_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
xgboost_f1,0.85,29178,306,1.049,239,0.781,15.002
xgboost_roc_auc,0.625,29178,312,1.069,232,0.744,14.783
xgboost_f1,0.825,29178,436,1.494,317,0.727,14.177
xgboost_f1,0.8,29178,593,2.032,421,0.71,14.039
random_forest_roc_auc,0.525,29178,369,1.265,281,0.762,13.945
random_forest_roc_auc,0.5,29178,466,1.597,347,0.745,13.852
xgboost_roc_auc,0.55,29178,541,1.854,387,0.715,13.596
xgboost_f1,0.775,29178,725,2.485,494,0.681,13.561
xgboost_roc_auc,0.5,29178,742,2.543,514,0.693,13.485
random_forest_f1,0.55,29178,348,1.193,255,0.733,13.445
