In [1]:
import os

os.chdir('../.')

%pwd

'd:\\work\\loan-approval-prediction'

In [2]:
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping


# ML MODELS
def dt_model(X_train, y_train, X_test, y_test, outdir=None):
    dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 123)
    dt.fit(X_train, y_train)
    y_pred_dt = dt.predict(X_test)
    confusion_matrix(y_test, y_pred_dt)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_dt, y_test), 2)))
    print(classification_report(y_test, y_pred_dt))
    filename = os.path.join(outdir, 'dt_model.sav')
    pickle.dump(dt, open(filename, 'wb'))
    return


def rf_model(X_train, y_train, X_test, y_test, outdir=None):
    rfc = RandomForestClassifier(random_state = 445, max_depth = 8, n_estimators = 10, min_samples_split = 2, min_samples_leaf = 6)          
    rfc.fit(X_train, y_train)
    y_pred_rfc = rfc.predict(X_test)
    confusion_matrix(y_test, y_pred_rfc)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_rfc, y_test), 2)))
    print(classification_report(y_test, y_pred_rfc))
    filename = os.path.join(outdir, 'rf_model.sav')
    pickle.dump(rfc, open(filename, 'wb'))
    return


def logistic_model(X_train, y_train, X_test, y_test, outdir=None):
    lr = LogisticRegression(C = 4.49)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    confusion_matrix(y_test, y_pred)
    print('accuracy: {}'.format(round(accuracy_score(y_pred, y_test), 2)))
    print(classification_report(y_test, y_pred))
    filename = os.path.join(outdir, 'lr_model.sav')
    pickle.dump(lr, open(filename, 'wb'))
    return


def svm_model(X_train, y_train, X_test, y_test, outdir=None):
    svm = SVC(C=1, gamma=0.1, kernel='rbf')
    svm.fit(X_train, y_train)
    y_pred_svm = svm.predict(X_test)
    confusion_matrix(y_test, y_pred_svm)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_svm, y_test), 2)))
    print(classification_report(y_test, y_pred_svm))
    filename = os.path.join(outdir, 'svm_model.sav')
    pickle.dump(svm, open(filename, 'wb'))
    return


def knn_model(X_train, y_train, X_test, y_test, outdir=None):
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 1)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    confusion_matrix(y_test, y_pred_knn)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_knn, y_test), 2)))
    print(classification_report(y_test, y_pred_knn))
    filename = os.path.join(outdir, 'knn_model.sav')
    pickle.dump(knn, open(filename, 'wb'))
    return


def gnb_model(X_train, y_train, X_test, y_test, outdir=None):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred_gnb = gnb.predict(X_test)
    confusion_matrix(y_test, y_pred_gnb)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_gnb, y_test), 2)))
    print(classification_report(y_test, y_pred_gnb))
    filename = os.path.join(outdir, 'gnb_model.sav')
    pickle.dump(gnb, open(filename, 'wb'))
    return


def ann_model(X_train, y_train, X_valid, y_valid, X_test, y_test, outdir=None):
    """binary classification with ANN model"""
    # ann model
    classifier = Sequential()
    # Add the input layer and the first hidden layer
    classifier.add(Dense(units=7, activation = 'relu'))
    classifier.add(Dense(units=7, activation = 'relu'))
    classifier.add(Dense(units=1, activation = 'sigmoid'))
    # optimizer set
    opt = Adam(learning_rate=0.0001)
    classifier.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])
    classifier.build(X_train.shape)
    classifier.summary()
    # set callback parameters
    mcp = ModelCheckpoint(filepath=os.path.join(outdir, 'ann_model.h5'), monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    es = EarlyStopping(monitor='val_loss', patience=5)
    callbacks = [es, mcp]
    
    classifier.fit(X_train, y_train, batch_size = 10, validation_data=(X_valid, y_valid), epochs = 500, verbose=1, callbacks=callbacks)
    
    # predict
    y_pred_ann = classifier.predict(X_test)
    y_pred_ann = (y_pred_ann > 0.5)
    confusion_matrix(y_test, y_pred_ann)
    print('accuracy: {}'.format(round(accuracy_score(y_pred_ann, y_test), 2)))
    print(classification_report(y_test, y_pred_ann))
    return


In [3]:
from src.utils import input_dataframe

# read data
in_file = "notebooks/data/input_training_data.csv"

df = input_dataframe(in_file)


In [4]:
from src.utils import feature_engineering

# MODEL BUILDING - 1
#=========================
# TRAIN-TEST SPLIT: CUSTOM-SCALE
#================================
out_model_dir = 'artifacts'
final_columns = ['Married', 'Dependents', 'Education', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

df_final = df[final_columns]

xysplit = feature_engineering(df_final, colslist=final_columns, return_type='CustomScaler')

X_train = xysplit['X_train']
X_test = xysplit['X_test']
y_train = xysplit['y_train']
y_test = xysplit['y_test']

0 1
1 0.0
2 0
3 2500
4 0.0
5 120.0
6 360.0
7 1.0
8 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='backward', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='forward', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols]/div1


In [6]:
# run model
dt_model(X_train, y_train, X_test, y_test, outdir=out_model_dir)

accuracy: 0.71
              precision    recall  f1-score   support

           0       0.73      0.72      0.73        46
           1       0.68      0.69      0.68        39

    accuracy                           0.71        85
   macro avg       0.70      0.70      0.70        85
weighted avg       0.71      0.71      0.71        85



In [7]:
rf_model(X_train, y_train, X_test, y_test, outdir=out_model_dir)

accuracy: 0.65
              precision    recall  f1-score   support

           0       0.83      0.43      0.57        46
           1       0.57      0.90      0.70        39

    accuracy                           0.65        85
   macro avg       0.70      0.67      0.64        85
weighted avg       0.71      0.65      0.63        85



In [5]:
# MODEL BUILDING - 2
#=========================
# TRAIN-TEST SPLIT: STANDARD-SCALE
#==========================================

xysplit2 = feature_engineering(df_final, colslist=final_columns, return_type='StandardScaler')

X_train2 = xysplit2['X_train']
X_test2 = xysplit2['X_test']
y_train2 = xysplit2['y_train']
y_test2 = xysplit2['y_test']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='backward', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.interpolate(method='linear', limit_direction='forward', inplace=True)


In [9]:
# run model
logistic_model(X_train2, y_train2, X_test2, y_test2, outdir=out_model_dir)


accuracy: 0.69
              precision    recall  f1-score   support

           0       0.95      0.46      0.62        46
           1       0.60      0.97      0.75        39

    accuracy                           0.69        85
   macro avg       0.78      0.72      0.68        85
weighted avg       0.79      0.69      0.68        85



In [10]:
svm_model(X_train2, y_train2, X_test2, y_test2, outdir=out_model_dir)


accuracy: 0.69
              precision    recall  f1-score   support

           0       0.92      0.48      0.63        46
           1       0.61      0.95      0.74        39

    accuracy                           0.69        85
   macro avg       0.76      0.71      0.68        85
weighted avg       0.77      0.69      0.68        85



In [11]:
knn_model(X_train2, y_train2, X_test2, y_test2, outdir=out_model_dir)


accuracy: 0.65
              precision    recall  f1-score   support

           0       0.77      0.50      0.61        46
           1       0.58      0.82      0.68        39

    accuracy                           0.65        85
   macro avg       0.67      0.66      0.64        85
weighted avg       0.68      0.65      0.64        85



In [12]:
gnb_model(X_train2, y_train2, X_test2, y_test2, outdir=out_model_dir)

accuracy: 0.71
              precision    recall  f1-score   support

           0       0.89      0.52      0.66        46
           1       0.62      0.92      0.74        39

    accuracy                           0.71        85
   macro avg       0.75      0.72      0.70        85
weighted avg       0.77      0.71      0.70        85



In [15]:
from sklearn.model_selection import train_test_split

# ANN Model Training
X_train1, X_valid, y_train1, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=1, shuffle=True)

In [16]:
# run ann model
ann_model(X_train1, y_train1, X_valid, y_valid, X_test, y_test, outdir=out_model_dir)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (235, 7)                  70        
                                                                 
 dense_1 (Dense)             (235, 7)                  56        
                                                                 
 dense_2 (Dense)             (235, 1)                  8         
                                                                 
Total params: 134
Trainable params: 134
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
Epoch 1: val_loss improved from inf to 235.81593, saving model to artifacts\ann_model.h5
Epoch 2/500
 1/24 [>.............................] - ETA: 0s - loss: 7.1029 - accuracy: 0.4000
Epoch 2: val_loss improved from 235.81593 to 226.09846, saving model to artifacts\ann_model.h5
Epoch 3/500
 1/24 [>...........