In [5]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [35]:
class Preprocessing:
    """Load a CSV file and prepare scaled train/test splits for modelling.

    The pipeline run by :meth:`data_processing_call` is:
    read CSV -> impute missing values -> drop columns / split off the target
    -> one-hot encode features -> train/test split -> min-max scaling.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    columns_to_drop : list
        Column labels removed from the data before modelling.
    target_column : str
        Name of the dependent variable (coerced with ``str``).
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.

    Attributes
    ----------
    data : pandas.DataFrame or None
        The loaded (and imputed) frame; ``None`` until the pipeline has run.
    X_train, X_test, y_train, y_test
        Split data; ``None`` until the pipeline has run. After scaling,
        ``X_train`` / ``X_test`` are whatever the scaler's
        ``fit_transform`` / ``transform`` return.
    """

    def __init__(self, filepath, columns_to_drop, target_column, test_size):
        self.data = None
        self.filepath = filepath
        self.col_to_drop = columns_to_drop
        self.trgt_col = str(target_column)
        self.test_size = test_size
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def __read_file(self):
        """Read ``self.filepath`` as CSV and return the resulting DataFrame."""
        return pd.read_csv(self.filepath)

    def __fill_missing_values(self):
        """Impute missing values in ``self.data``, column by column.

        * Columns with fewer than 10 distinct values, or holding
          object/string data, are filled with their most frequent value.
        * Remaining numeric columns (any NumPy numeric dtype, not only
          ``int64`` / ``float64``) are filled with their mean.
        * Anything else (e.g. datetimes) and columns that are entirely
          missing are left untouched.

        NOTE(review): this runs before the train/test split, so the fill
        statistics are computed on rows that later end up in the test set,
        and the target column is imputed like any other column.
        """
        null_cols = self.data.columns[self.data.isna().any()].tolist()
        for col in null_cols:
            series = self.data[col]
            is_textual = (series.dtype == object
                          or pd.api.types.is_string_dtype(series.dtype))
            if series.nunique() < 10 or is_textual:
                modes = series.mode()
                if modes.empty:
                    # Every value is missing: there is nothing to impute from.
                    continue
                fill_value = modes.iloc[0]
            elif (isinstance(series.dtype, np.dtype)
                  and np.issubdtype(series.dtype, np.number)):
                fill_value = series.mean()
            else:
                continue
            # Assign the filled column back instead of calling
            # ``fillna(inplace=True)`` on a column selection: the in-place
            # form is chained assignment and does not reliably update
            # ``self.data`` on recent pandas versions.
            self.data[col] = series.fillna(fill_value)

    def __dependent_independent_split(self):
        """Drop the configured columns and separate features from the target.

        Side effect: ``self.data`` is replaced by the frame without
        ``self.col_to_drop``.

        Returns
        -------
        tuple
            ``(X, y)`` where ``X`` is the frame without the target column and
            ``y`` is the target column.
        """
        self.data = self.data.drop(self.col_to_drop, axis=1)
        X = self.data.drop(self.trgt_col, axis=1)
        y = self.data[self.trgt_col]
        return X, y

    def __label_encode(self, y):
        """Return ``y`` encoded with a freshly fitted ``LabelEncoder``."""
        return LabelEncoder().fit_transform(y)

    def __one_hot_encode(self, x):
        """Return ``x`` passed through ``pd.get_dummies(drop_first=True)``."""
        return pd.get_dummies(x, drop_first=True)

    def __train_test_split(self, X, y):
        """Split ``X`` / ``y`` using ``self.test_size`` and a fixed seed (0).

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        return train_test_split(X, y, test_size=self.test_size, random_state=0)

    def _apply_scaler(self, scaler):
        """Fit ``scaler`` on ``self.X_train`` and transform both feature sets.

        The scaler is fitted on the training features only and then applied
        to the test features.
        """
        self.X_train = scaler.fit_transform(self.X_train)
        self.X_test = scaler.transform(self.X_test)

    def standard_scaling(self):
        """Scale the features with ``StandardScaler``."""
        self._apply_scaler(StandardScaler())

    def min_max_scaling(self):
        """Scale the features with ``MinMaxScaler``."""
        self._apply_scaler(MinMaxScaler())

    def robust_scaling(self):
        """Scale the features with ``RobustScaler`` over the 25-75 quantile range."""
        self._apply_scaler(RobustScaler(quantile_range=(25.0, 75.0)))

    def data_processing_call(self):
        """Run the full preprocessing pipeline and populate the split attributes.

        After this call ``self.X_train``, ``self.X_test``, ``self.y_train``
        and ``self.y_test`` are set, with the features min-max scaled.
        """
        self.data = self.__read_file()
        self.__fill_missing_values()
        X, y = self.__dependent_independent_split()
        X = self.__one_hot_encode(X)
        # Order the feature columns by label so the column layout does not
        # depend on the order in which the dummy columns were generated.
        X = X.sort_index(axis=1)
        self.X_train, self.X_test, self.y_train, self.y_test = self.__train_test_split(X, y)
        self.min_max_scaling()
        

In [45]:
class Models(Preprocessing):
    """Train several regressors on the preprocessed data and keep the best one.

    Parameters
    ----------
    filepath, columns_to_drop, target_column, test_size
        Forwarded unchanged to :class:`Preprocessing`.
    model_list : list of str
        Names of the models to train. Recognised names are listed in
        ``ML_MODEL_NAMES`` and ``DL_MODEL_NAMES``.
    hyperparameters : bool, default False
        When true, the scikit-learn models are fitted through
        :meth:`hyperparameter_tuning` instead of a plain ``fit``.

    Attributes
    ----------
    params : dict
        Per-model search spaces keyed by model name (``"LR"``, ``"KNN"``,
        ``"RF"``, ``"ANN"``). An entry is created as an empty dict the first
        time the model is built; entries set beforehand are kept.
    classifiers : list of str
        The ``model_list`` given at construction.
    """

    # Names accepted in ``model_list``, grouped by the routine that trains them.
    ML_MODEL_NAMES = ("LR", "KNN", "RF")
    DL_MODEL_NAMES = ("ANN",)
    SCORE_COLUMNS = ("Model_Name", "MAE", "RMSE", "R2")

    def __init__(self, filepath, columns_to_drop, target_column, test_size, model_list, hyperparameters=False):
        super().__init__(filepath, columns_to_drop, target_column, test_size)
        self.params = {}
        self.classifiers = model_list
        self.hyperparameters = hyperparameters

    def data_processing(self):
        """Run the inherited preprocessing pipeline."""
        self.data_processing_call()

    def _fit_or_tune(self, key, model):
        """Fit ``model`` directly, or via randomized search when tuning is on.

        ``key`` is the model's name in ``self.params``; its search space is
        what gets handed to :meth:`hyperparameter_tuning`.
        """
        search_space = self.params.setdefault(key, {})
        if not self.hyperparameters:
            model.fit(self.X_train, self.y_train)
            return model
        return self.hyperparameter_tuning(model, search_space)

    def lr_model(self):
        """Return a fitted ``LinearRegression`` (or its fitted search object)."""
        return self._fit_or_tune("LR", LinearRegression())

    def knn_model(self):
        """Return a fitted ``KNeighborsRegressor`` (or its fitted search object)."""
        return self._fit_or_tune("KNN", KNeighborsRegressor())

    def rf_model(self):
        """Return a fitted ``RandomForestRegressor`` (or its fitted search object).

        The forest is seeded so repeated runs give the same scores.
        """
        return self._fit_or_tune("RF", RandomForestRegressor(random_state=0))

    def ann_model(self):
        """Build, compile and train the dense network; return the trained model.

        The network is ``Dense(32) -> Dense(16) -> Dense(8) -> Dense(1)`` with
        ReLU on the hidden layers. Training settings come from
        :meth:`set_dl_parameters`, which is invoked here if it has not been
        called yet.

        Raises
        ------
        NotImplementedError
            If ``self.hyperparameters`` is true: the Keras model is not a
            scikit-learn estimator, so it cannot be handed to
            ``RandomizedSearchCV``.

        NOTE(review): the test split is passed as ``validation_data`` and the
        checkpoint keeps the weights with the lowest ``val_loss``, so the test
        metrics reported for this model are selected on the test set itself.
        """
        self.params.setdefault("ANN", {})
        if self.hyperparameters:
            raise NotImplementedError(
                "Hyperparameter search is not supported for the ANN model."
            )
        if getattr(self, "callback", None) is None:
            self.set_dl_parameters()

        model = Sequential()
        model.add(Dense(32, input_shape=self.input_shape, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(1))

        # This is a regression network, so track mean absolute error;
        # classification accuracy is meaningless for a continuous target.
        model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['mae'])

        model.fit(self.X_train, self.y_train, epochs=self.epochs,
                  batch_size=self.batch_size, callbacks=self.callback,
                  validation_data=(self.X_test, self.y_test))
        return model

    def set_dl_parameters(self):
        """Set the training configuration used by :meth:`ann_model`.

        Populates ``input_shape``, ``loss``, ``optimizer``, ``epochs``,
        ``batch_size``, ``checkpoint_filepath`` and ``callback``.
        """
        if self.X_train is not None:
            # Derive the input width from the data so that a different
            # dataset or column selection does not break the first layer.
            self.input_shape = (self.X_train.shape[1],)
        else:
            # No data loaded yet: keep the previously hard-coded width.
            self.input_shape = (12,)
        self.loss = 'mean_squared_error'
        self.optimizer = "adam"
        self.epochs = 40
        self.batch_size = 10
        self.checkpoint_filepath = "my_best_model.hdf5"
        model_checkpoint_callback = ModelCheckpoint(filepath=self.checkpoint_filepath,
                                                    save_weights_only=True,
                                                    monitor='val_loss',
                                                    mode='min',
                                                    verbose=1,
                                                    save_best_only=True)

        self.callback = [model_checkpoint_callback]

    def _scores_frame(self, records, positions):
        """Build a score table from ``records``, indexed by ``positions``."""
        return pd.DataFrame(records, columns=list(self.SCORE_COLUMNS), index=positions)

    def dl_modelling(self):
        """Train and score the deep-learning models named in ``self.classifiers``.

        Names other than ``"ANN"`` are skipped silently. After training, the
        weights saved at ``self.checkpoint_filepath`` are loaded back into the
        model before it is evaluated on the test split.

        Returns
        -------
        tuple
            ``(scores_df, model_obj)``: a DataFrame with the columns
            ``Model_Name``, ``MAE``, ``RMSE``, ``R2`` (indexed by the model's
            position in ``self.classifiers``) and a dict mapping model name to
            trained model.
        """
        records, positions, model_obj = [], [], {}
        self.set_dl_parameters()
        for idx, model_name in enumerate(self.classifiers):
            if model_name != "ANN":
                continue
            model_trained = self.ann_model()
            model_obj[model_name] = model_trained

            model_trained.load_weights(self.checkpoint_filepath)

            y_pred = model_trained.predict(self.X_test)
            mae, rmse, r2 = self.performance_metrics(self.y_test, y_pred)
            records.append([model_name, mae, rmse, r2])
            positions.append(idx)

        return self._scores_frame(records, positions), model_obj

    def ml_modelling(self):
        """Train and score the scikit-learn models named in ``self.classifiers``.

        Names listed in ``DL_MODEL_NAMES`` are skipped silently because
        :meth:`dl_modelling` handles them; any other unrecognised name is
        reported on stdout and skipped.

        Returns
        -------
        tuple
            ``(scores_df, model_obj)`` with the same layout as
            :meth:`dl_modelling`.
        """
        trainers = {
            "LR": self.lr_model,
            "KNN": self.knn_model,
            "RF": self.rf_model,
        }
        records, positions, model_obj = [], [], {}
        for idx, model_name in enumerate(self.classifiers):
            trainer = trainers.get(model_name)
            if trainer is None:
                if model_name not in self.DL_MODEL_NAMES:
                    print(f"Not a valid ml model: {model_name}")
                continue

            model_trained = trainer()
            model_obj[model_name] = model_trained

            y_pred = model_trained.predict(self.X_test)
            mae, rmse, r2 = self.performance_metrics(self.y_test, y_pred)
            records.append([model_name, mae, rmse, r2])
            positions.append(idx)

        return self._scores_frame(records, positions), model_obj

    def hyperparameter_tuning(self, model, param_distributions=None):
        """Fit ``model`` on the training split through ``RandomizedSearchCV``.

        Parameters
        ----------
        model : estimator
            Unfitted scikit-learn regressor.
        param_distributions : dict, optional
            Search space for ``model``. ``None`` is treated as an empty
            space; presumably the search then evaluates only the estimator's
            default settings - confirm against the scikit-learn docs.

        Returns
        -------
        The object returned by ``RandomizedSearchCV.fit``.

        Notes
        -----
        The target is continuous, so the folds come from ``RepeatedKFold``
        and candidates are ranked by negative mean absolute error;
        stratified folds and ``accuracy`` only apply to classification.
        """
        if param_distributions is None:
            param_distributions = {}
        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        search = RandomizedSearchCV(model, param_distributions, cv=cv,
                                    scoring='neg_mean_absolute_error',
                                    random_state=1)
        return search.fit(self.X_train, self.y_train)

    def performance_metrics(self, y_true, y_pred):
        """Return ``(mae, rmse, r2)`` for the given truth and predictions."""
        mae = mean_absolute_error(y_true, y_pred)
        rmse = sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        return mae, rmse, r2

    def ml_model_save(self, model):
        """Pickle ``model`` to ``best_model.pickle`` in the working directory."""
        with open("best_model.pickle", "wb") as file:
            pickle.dump(model, file)

    def dl_model_save(self, model):
        """Save ``model`` to ``best_model.h5`` via its own ``save`` method."""
        model.save("best_model.h5")

    def save_model_scores(self, model_scores_df):
        """Write the score table to ``All_Model_Performances.xlsx``."""
        model_scores_df.to_excel("All_Model_Performances.xlsx", index=False)

    def model_calls(self):
        """Train every requested model, rank them, and persist the winner.

        The combined score table is sorted by MAE, then RMSE (both ascending)
        and finally R2 (descending, since higher is better). The first row's
        model is saved with :meth:`dl_model_save` or :meth:`ml_model_save`,
        the table is printed and then written by :meth:`save_model_scores`.

        Raises
        ------
        ValueError
            If no model in ``self.classifiers`` could be trained.
        """
        ml_scores, model_obj = self.ml_modelling()
        dl_scores, dl_model_obj = self.dl_modelling()

        # ``DataFrame.append`` no longer exists in pandas 2.x; concatenate
        # the non-empty tables instead.
        frames = [frame for frame in (ml_scores, dl_scores) if not frame.empty]
        if not frames:
            raise ValueError("No valid model names in model_list; nothing was trained.")
        all_scores = pd.concat(frames, ignore_index=True)
        model_obj.update(dl_model_obj)

        all_scores = all_scores.sort_values(by=["MAE", "RMSE", "R2"],
                                            ascending=[True, True, False],
                                            ignore_index=True)
        best_model_name = all_scores["Model_Name"].iloc[0]
        best_model = model_obj[best_model_name]
        print(all_scores)
        if best_model_name in dl_model_obj:
            self.dl_model_save(best_model)
        else:
            self.ml_model_save(best_model)

        self.save_model_scores(all_scores)

In [46]:
# Run configuration: dataset location, target column and the models to compare.
filepath, target_column = "dataset/boston.csv", "MEDV"
columns_to_drop = list()   # no columns are excluded from the features
test_size = 0.3            # share of rows held out for evaluation
model_list = [
    "LR",
    "KNN",
    "RF",
    "ANN",
]

In [48]:
# Build the pipeline from the configuration above, preprocess the data,
# then train, rank and persist the models.
processing_obj = Models(
    filepath=filepath,
    columns_to_drop=columns_to_drop,
    target_column=target_column,
    test_size=test_size,
    model_list=model_list,
)
processing_obj.data_processing()
processing_obj.model_calls()

Not a valid ml model: ANN
Epoch 1/40

Epoch 00001: val_loss improved from inf to 548.59637, saving model to my_best_model.hdf5
Epoch 2/40

Epoch 00002: val_loss improved from 548.59637 to 489.75674, saving model to my_best_model.hdf5
Epoch 3/40

Epoch 00003: val_loss improved from 489.75674 to 324.67755, saving model to my_best_model.hdf5
Epoch 4/40

Epoch 00004: val_loss improved from 324.67755 to 160.41087, saving model to my_best_model.hdf5
Epoch 5/40

Epoch 00005: val_loss improved from 160.41087 to 137.52812, saving model to my_best_model.hdf5
Epoch 6/40

Epoch 00006: val_loss improved from 137.52812 to 116.75148, saving model to my_best_model.hdf5
Epoch 7/40

Epoch 00007: val_loss improved from 116.75148 to 99.77189, saving model to my_best_model.hdf5
Epoch 8/40

Epoch 00008: val_loss improved from 99.77189 to 84.14669, saving model to my_best_model.hdf5
Epoch 9/40

Epoch 00009: val_loss improved from 84.14669 to 73.17559, saving model to my_best_model.hdf5
Epoch 10/40

Epoch 000


Epoch 00034: val_loss did not improve from 29.10553
Epoch 35/40

Epoch 00035: val_loss improved from 29.10553 to 28.87598, saving model to my_best_model.hdf5
Epoch 36/40

Epoch 00036: val_loss improved from 28.87598 to 28.66074, saving model to my_best_model.hdf5
Epoch 37/40

Epoch 00037: val_loss improved from 28.66074 to 28.14260, saving model to my_best_model.hdf5
Epoch 38/40

Epoch 00038: val_loss improved from 28.14260 to 27.65762, saving model to my_best_model.hdf5
Epoch 39/40

Epoch 00039: val_loss improved from 27.65762 to 27.54874, saving model to my_best_model.hdf5
Epoch 40/40

Epoch 00040: val_loss improved from 27.54874 to 27.27873, saving model to my_best_model.hdf5
  Model_Name       MAE      RMSE        R2
0         RF  2.572336  3.940197  0.813546
1        ANN  3.603087  5.222905  0.672389
2        KNN  3.643026  5.774754  0.599501
3         LR  3.703292  5.305355  0.661963
