In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD

In [4]:
class Preprocessing():
    def __init__(self, filepath, columns_to_drop, target_column, test_size):
        self.data = None
        self.filepath = filepath
        self.col_to_drop = columns_to_drop
        self.trgt_col = str(target_column)
        self.test_size = test_size
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        
    def __read_file(self):
        df = pd.read_csv(self.filepath)
        return df
    
    def __fill_missing_values(self):
        null_cols = self.data.columns[self.data.isna().any()].tolist()
        for item in null_cols:
            if ((self.data[item].nunique() == 2) or (self.data[item].dtypes == object)):
                # taking only binary value columns and replacing the missing values with mode
                self.data[item].fillna(self.data[item].mode()[0],inplace=True)
            elif self.data[item].dtype in (['int64', 'float64']):
                self.data[item].fillna(self.data[item].median(),inplace=True)
            else:
                pass
            
    def __dependent_independent_split(self):
        self.data = self.data.drop(self.col_to_drop, axis=1)
        X = self.data.drop(self.trgt_col, axis=1)
        y = self.data[self.trgt_col]
        return X, y
    
    def __label_encode(self, y):
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)
        return y
        
    def __one_hot_encode(self, x):
        x = pd.get_dummies(x, drop_first=True)
        return x
        
    def __train_test_split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=0, stratify=y)
        return X_train, X_test, y_train, y_test
    
    def __over_sampling(self):
        over_sample = SMOTE(sampling_strategy='auto')
        self.X_train, self.y_train = over_sample.fit_resample(self.X_train, self.y_train)
        
    def __under_sampling(self):
        under_sample = RandomUnderSampler()
        self.X_train, self.y_train = under_sample.fit_resample(self.X_train, self.y_train)
        
    def scaling(self):
        scale = StandardScaler()
        self.X_train = scale.fit_transform(self.X_train)
        self.X_test = scale.transform(self.X_test)
    
    def data_processing_call(self):
        self.data = self.__read_file()
        self.__fill_missing_values()
        X, y = self.__dependent_independent_split()
        y = self.__label_encode(y)
        X = self.__one_hot_encode(X)
        X.sort_index(axis=1, inplace=True)
        self.X_train, self.X_test, self.y_train, self.y_test = self.__train_test_split(X, y)
        self.scaling()
        self.__over_sampling()
        
    

In [5]:
class Models(Preprocessing):
    def __init__(self, filepath, columns_to_drop, target_column, test_size, model_list, hyperparameters=False):
        super().__init__(filepath, columns_to_drop, target_column, test_size)
        self.params = {}
        self.classifiers = model_list
        self.hyperparameters = hyperparameters
        
    def data_processing(self):
        self.data_processing_call()
        
    def lr_model(self):
        self.params["LR"] = {}
        model = LogisticRegression()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
    
    def rf_model(self):
        self.params["RF"] = {}
        model = RandomForestClassifier()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
        
    def xgb_model(self):
        self.params["XGB"] = {}
        model = XGBClassifier()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
        
    def knn_model(self):
        self.params["KNN"] = {}
        model = KNeighborsClassifier()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
        
    def svc_model(self):
        self.params["SVC"] = {}
        model = SVC()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
        
    def nb_model(self):
        self.params["NB"] = {}
        model = GaussianNB()
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search        
        
    def ann_model(self):
        self.params["ANN"] = {}

        model = Sequential()
        model.add(Dense(32, input_shape=self.input_shape, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
        
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train, epochs=self.epochs, 
                      batch_size=self.batch_size, callbacks=self.callback, 
                      validation_data=(self.X_test, self.y_test))
            return model
        else:
            model_search = self.hyperparameter_tuning(model)
            return model_search
    
    def set_dl_parameters(self):
        self.input_shape = (14,)
        self.loss = 'binary_crossentropy'
        opt = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
        self.optimizer = opt
        self.epochs = 40
        self.batch_size = 10
        self.checkpoint_filepath = "my_best_model.hdf5"
        model_checkpoint_callback = ModelCheckpoint(filepath=self.checkpoint_filepath,
                                                    save_weights_only=True,
                                                    monitor='val_loss',
                                                    mode='min',
                                                    verbose=1,
                                                    save_best_only=True)
        
        self.callback = [model_checkpoint_callback]
    
    def dl_modelling(self):
        ml_model_scores_df = pd.DataFrame(columns=["Model_Name", "Accuracy", "Recall", "Precision", "F1"])
        scores = {}
        model_obj = {}
        self.set_dl_parameters()
        for idx, model_name in enumerate(self.classifiers):
            if model_name == "ANN":
                model_trained = self.ann_model()
            else:
                continue
                
            model_obj[model_name] = model_trained
            
            model_trained.load_weights(self.checkpoint_filepath)
            # load_model(checkpoint_filepath)
            
            y_pred = (model_trained.predict(self.X_test) > 0.5).astype(int)
            
            accuracy, recall, precision, f1 = self.performance_metrics(self.y_test, y_pred)
            ml_model_scores_df.loc[idx] = [model_name, accuracy, recall, precision, f1]
            
        return ml_model_scores_df, model_obj
    
    def ml_modelling(self):
        ml_model_scores_df = pd.DataFrame(columns=["Model_Name", "Accuracy", "Recall", "Precision", "F1"])
        scores = {}
        model_obj = {}
        for idx, model_name in enumerate(self.classifiers):
            if model_name == "LR":
                model_trained = self.lr_model()
            elif model_name == "RF":
                model_trained = self.rf_model()
            elif model_name == "XGB":
                model_trained = self.xgb_model()
            elif model_name == "KNN":
                model_trained = self.knn_model()
            elif model_name == "SVC":
                model_trained = self.svc_model()
            elif model_name == "NB":
                model_trained = self.nb_model()
            else:
                print(f"Not a valid ml model: {model_name}")
                continue
            
            model_obj[model_name] = model_trained
            
            y_pred = model_trained.predict(self.X_test)
            accuracy, recall, precision, f1 = self.performance_metrics(self.y_test, y_pred)
            ml_model_scores_df.loc[idx] = [model_name, accuracy, recall, precision, f1]
        
        return ml_model_scores_df, model_obj

    def hyperparameter_tuning(self, model):
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        search = RandomizedSearchCV(model, self.params, cv=cv, scoring='accuracy')
        result = search.fit(self.X_train, self.y_train)
        accuracy = result.best_score_
        best_params = result.best_params_
        return result
    
    def performance_metrics(self, y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        return accuracy, recall, precision, f1
    
    def ml_model_save(self, model):
        with (open("best_model.pickle", "wb")) as file:
            pickle.dump(model, file)
            
    def dl_model_save(self, model):
        model.save("best_model.h5")
            
    def save_model_scores(self, model_scores_df):
        model_scores_df.to_excel("All_Model_Performances.xlsx", index=False)
    
    def model_calls(self):
        ml_model_scores_df, model_obj = self.ml_modelling()
        dl_model_scores_df, dl_model_obj = self.dl_modelling()
        
        ml_model_scores_df = ml_model_scores_df.append(dl_model_scores_df, ignore_index=True)
        model_obj.update(dl_model_obj)
        
        ml_model_scores_df = ml_model_scores_df.sort_values(by = ['F1', 'Recall', 'Accuracy', 'Precision'], ascending=False, ignore_index=True)
        best_model_name = ml_model_scores_df["Model_Name"][0]
        best_model = model_obj[best_model_name]
        print(ml_model_scores_df)
        if (best_model_name in list(dl_model_scores_df["Model_Name"])):
            self.dl_model_save(best_model)
        else:
            self.ml_model_save(best_model)
            
        self.save_model_scores(ml_model_scores_df)
        
    

In [6]:
filepath = "dataset/train.csv"
columns_to_drop = ["Loan_ID"]
target_column = "Loan_Status"
test_size = 0.3
model_list = ["LR", "RF", "XGB", "KNN", "SVC", "NB", "ANN"]

In [7]:
# scaling and sampling both done
processing_obj = Models(filepath, columns_to_drop, target_column, test_size, model_list)
processing_obj.data_processing()
processing_obj.model_calls()




Not a valid ml model: ANN
Epoch 1/40

Epoch 00001: val_loss improved from inf to 0.55010, saving model to my_best_model.hdf5
Epoch 2/40

Epoch 00002: val_loss improved from 0.55010 to 0.49340, saving model to my_best_model.hdf5
Epoch 3/40

Epoch 00003: val_loss improved from 0.49340 to 0.44975, saving model to my_best_model.hdf5
Epoch 4/40

Epoch 00004: val_loss did not improve from 0.44975
Epoch 5/40

Epoch 00005: val_loss did not improve from 0.44975
Epoch 6/40

Epoch 00006: val_loss did not improve from 0.44975
Epoch 7/40

Epoch 00007: val_loss did not improve from 0.44975
Epoch 8/40

Epoch 00008: val_loss did not improve from 0.44975
Epoch 9/40

Epoch 00009: val_loss did not improve from 0.44975
Epoch 10/40

Epoch 00010: val_loss did not improve from 0.44975
Epoch 11/40

Epoch 00011: val_loss did not improve from 0.44975
Epoch 12/40

Epoch 00012: val_loss did not improve from 0.44975
Epoch 13/40

Epoch 00013: val_loss did not improve from 0.44975
Epoch 14/40

Epoch 00014: val_loss 