# IN-STK5000 Project 1  "Credit"

In [1]:
import numpy as np
import pandas

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt


### Define the NameBanker class with functions :

- fit model. First version will use the LogisticRegression
- set interest rate
- predict_probability
- expected_utilty
- get_best_action
 

In [2]:
class NameBanker:
    # 20180913 Harald Grannes
    # Based on skeleton RandomBanker
    # Used in IN-STK500 Project 1
    
    # Fit the model to the data. 
    # First version will use the LogisticRegression in the SciKitLearn package
    def fit(self, X, y):
        
        self.data = [X, y]
        self.model = LogisticRegression().fit(X, y)
        
        return

    # set the interest rate
    def set_interest_rate(self, rate):
        self.rate = rate
        return

    # Predict the probability of creditworthiness for a specific person with data x
    # The function return the probability of a person being creditworthy.
    def predict_proba(self, x):

        predict_pb = self.model.predict_proba(x)[0][0]
        self.prediction = self.model.predict(x)[0]
    
    # Prediction 1 = good
    # Prediction 2 = bad

        if (self.prediction == 1):
            pred_good = predict_pb
            
        elif (self.prediction == 2) :
            pred_good = 1 - predict_pb
        
        else:
            pred_good = 0

        return pred_good
        

    # THe expected utility of granting the loan or not. Here there are two actions:
    # action = 0 do not grant the loan
    # action = 1 grant the loan
    #
    # Make sure that you extract the length_of_loan from the
    # 2nd attribute of x. Then the return if the loan is paid off to you is amount_of_loan*(1 + rate)^length_of_loan
    # The return if the loan is not paid off is -amount_of_loan.
    
    def expected_utility(self, x, action):

        loan_amount = x["amount"]
        length_of_loan = x["duration"]
        utility = 0

    # When action is 0 (declined) there is no loss nor profit. 
    # set utility to zero 
    
        if (action == 0):
            utility = 0

    # Action is 1 (approved) the expected utility is 
    # exptected loss and expected profit
    
        elif (action == 1) :
            
            probability_loss = (1 - self.predict_proba(x))
            probability_profit = 1 - probability_loss

            utility = probability_profit*loan_amount*(pow(1+self.rate, length_of_loan)-1) - probability_loss*loan_amount 
            
        else:
            
            utility = 0

        return utility
                     
    # Return the best action
    # In this case, it will be the maximum of the expected utility from approval (1) and rejection (0) 
    # actions.
    
    def get_best_action(self, x):

        self.utility_deny = self.expected_utility(x, 0)
        self.utility_approve = self.expected_utility(x, 1)
        

        if (self.utility_approve.item() > self.utility_deny) :
            best_action = 1
            
        else:
            best_action = 0

        return best_action


## Create the RandomBanker class

In [3]:
class RandomBanker:
    
    # Fit the model to the data.  You can use any model you like to do
    # the fit, however you should be able to predict all class
    # probabilities
    def fit(self, X, y):
        self.data = [X, y]

    # set the interest rate
    def set_interest_rate(self, rate):
        self.rate = rate
        return

    # Predict the probability of failure for a specific person with data x
    def predict_proba(self, x):
        return 0

    # THe expected utility of granting the loan or not. Here there are two actions:
    # action = 0 do not grant the loan
    # action = 1 grant the loan
    #
    # Make sure that you extract the length_of_loan from the
    # 2nd attribute of x. Then the return if the loan is paid off to you is amount_of_loan*(1 + rate)^length_of_loan
    # The return if the loan is not paid off is -amount_of_loan.
    def expected_utility(self, x, action):
        print("Expected utility: Not implemented")
    # Return the best action. This is normally the one that maximises expected utility.
    # However, you are allowed to deviate from this if you can justify the reason.
    def get_best_action(self, x):
        return np.random.choice(2,1)[0]


## Read datafile and test methods in NameBanker class

In [4]:
## Set up for dataset
features = ['checking account balance', 'duration', 'credit history',
            'purpose', 'amount', 'savings', 'employment', 'installment',
            'marital status', 'other debtors', 'residence time',
            'property', 'age', 'other installments', 'housing', 'credits',
            'job', 'persons', 'phone', 'foreign']
target = 'repaid'


In [5]:
df = pandas.read_csv('c:/git/ml-society-science/data/credit/german.data', sep=' ',
                     names=features+[target])


In [6]:
df.head()

Unnamed: 0,checking account balance,duration,credit history,purpose,amount,savings,employment,installment,marital status,other debtors,...,property,age,other installments,housing,credits,job,persons,phone,foreign,repaid
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [7]:
numerical_features = ['duration', 'age', 'residence time', 'installment', 'amount', 'duration', 'persons', 'credits']

quantitative_features = list(filter(lambda x: x not in numerical_features, features))

X = pandas.get_dummies(df, columns=quantitative_features, drop_first=True)

encoded_features = list(filter(lambda x: x != target, X.columns))

y = df["repaid"]

In [8]:
X.head()

Unnamed: 0,duration,amount,installment,residence time,age,credits,persons,repaid,checking account balance_A12,checking account balance_A13,...,property_A124,other installments_A142,other installments_A143,housing_A152,housing_A153,job_A172,job_A173,job_A174,phone_A192,foreign_A202
0,6,1169,4,4,67,2,1,1,0,0,...,0,0,1,1,0,0,1,0,1,0
1,48,5951,2,2,22,1,1,2,1,0,...,0,0,1,1,0,0,1,0,0,0
2,12,2096,2,3,49,1,2,1,0,0,...,0,0,1,1,0,1,0,0,0,0
3,42,7882,2,4,45,1,2,1,0,0,...,0,0,1,0,1,0,1,0,0,0
4,24,4870,3,4,53,2,2,2,0,0,...,1,0,1,0,1,0,1,0,0,0


In [9]:
y.head()

0    1
1    2
2    1
3    1
4    2
Name: repaid, dtype: int64

#### Split data into training set and test set in 70%/30%

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [11]:
# Create the NameBanker object and fit the model on the training data

nb = NameBanker()
nb.fit(X_train, y_train)



In [12]:
# also set the montly interest rate to 0.005 (0.5%) 

nb.set_interest_rate(0.005)

print(nb.rate)

0.005


In [13]:
print(nb.model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


Predict the classes for the test data set from the hold-out data using the fitted model


In [14]:
y_predict = pandas.DataFrame(nb.model.predict(X_test))
y_probs = nb.model.predict_proba(X_test)


y_predict-y_test = 0 for all.

This must be to good to be true.

In [15]:
sum(y_predict-y_test)

148331

In [16]:
confusion_matrix(y_test, y_predict)

array([[207,   0],
       [  1,  92]], dtype=int64)

Create a test function


In [17]:
# Create the random banker object

rb = RandomBanker()

In [18]:
## Test function

def test_decision_maker(X_test, y_test, interest_rate, decision_maker):
    
    n_test_examples = len(X_test)
    utility = 0
    
    decision_maker.set_interest_rate(interest_rate)

    ## Example test function - this is only an unbiased test if the data has not been seen in training
    for t in range(n_test_examples):
        
        # Cast to DataFrame and transpose to make it compatible with get_best_action
        
        testDf = pandas.DataFrame(X_test.iloc[t]).T
        
        action = decision_maker.get_best_action(testDf)
        
        good_loan = y_test.iloc[t] # assume the labels are correct
        duration = X_test['duration'].iloc[t]
        amount = X_test['amount'].iloc[t]
        
        # If we don't grant the loan then nothing happens
        if (action==0):
            if (good_loan == 1):
                utility -= amount
        else:    
                utility += amount*(pow(1 + interest_rate, duration) - 1)
    return utility


Test the RandomBanker


In [28]:
test_decision_maker(X_test, y_test, 0.005, rb)

-289481.76722888515

Test the NameBanker nb

In [27]:
test_decision_maker(X_test, y_test, 0.005, nb)

69584.06651442831

In [29]:
decision_maker = RandomBanker()

interest_rate = 0.005

n_tests = 100
utility = 0

for iter in range(n_tests):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
    
    decision_maker.set_interest_rate(interest_rate)
    #decision_maker.use_calibration(True)
    decision_maker.fit(X_tr, y_tr)
    
    utility += test_decision_maker(X_te, y_te, interest_rate, decision_maker)

print("Random banker utility :", utility)

Random banker utility : -24419269.165331285


In [30]:
decision_maker = NameBanker()

interest_rate = 0.005

n_tests = 100
utility = 0

for iter in range(n_tests):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
    
    decision_maker.set_interest_rate(interest_rate)
    #decision_maker.use_calibration(True)
    decision_maker.fit(X_tr, y_tr)
    
    utility += test_decision_maker(X_te, y_te, interest_rate, decision_maker)

print("NameBanker utility: ", utility)

NameBanker utility:  10301750.963411968
