In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor

from sklearn.linear_model import LinearRegression,ElasticNet,Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold,RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



In [None]:
import logging

# Notebook-wide logger used by every class and cell below.
logger = logging.getLogger(__name__)

# Set level of logging
logger.setLevel(logging.INFO)

# Attach the file handler only once. Re-running this cell on a live kernel
# returns the same logger object, so unconditionally adding a handler would
# stack FileHandlers and write every message to ModelTraining.log repeatedly.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    # Create handlers
    f_handler = logging.FileHandler('ModelTraining.log')

    # Create formatters and add it to handlers
    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)

    # Add handlers to the logger
    logger.addHandler(f_handler)

In [None]:
class FeatureEngineering:
    '''
    Static helpers that clean the raw data: re-format columns, handle
    missing data and drop outliers.

    Every method returns a new DataFrame and leaves its input untouched.
    '''
    @staticmethod
    def rearrange_features(df):
        '''
        Re-format the raw data so that every remaining column can be cast to int.

        Steps, in order:
          1. drop every column whose name starts with "Unnamed"
          2. expand "K" in house_price into "000" (e.g. "450K" -> "450000")
          3. drop the raw amenities column `am`
          4. cast the whole frame to int

        Per the original author's note, bedrooms, baths and the amenities
        columns are already re-formatted by the data collector.

        df: DataFrame containing at least the columns `house_price` and `am`.
        returns: a new, all-int DataFrame.
        raises: KeyError if `house_price` or `am` is missing (for instance when
                this function already ran on df); ValueError if a remaining
                value cannot be converted to int.
        '''
        # drop all columns that contain Unnamed at the start of their names;
        # copy so the caller's frame is never modified
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

        # Address house_price by name (not by position) and compare on a string
        # view, so numeric or missing cells simply do not match instead of
        # breaking the boolean mask.
        price_as_text = df['house_price'].astype(str)
        has_k = price_as_text.str.contains('K', regex=False)
        if has_k.any():
            df.loc[has_k, 'house_price'] = price_as_text[has_k].str.replace('K', '000', regex=False)

        # after arranging amenities as columns, drop am column which contains all amenities
        df = df.drop(columns=['am'])
        # change the type of values in the df
        df = df.astype('int')
        logger.info('data has been arranged in suitable format')
        return df

    @staticmethod
    def na_percentage_in_rows(df):
        '''
        Report the share of missing values for every row that has any.

        df: DataFrame to inspect.
        returns: (stat, rows_to_drop)
            stat         - DataFrame with columns 'row' (index label) and
                           'na percentage' (fraction of the row's cells that
                           are missing, 0..1), one line per row with a NaN
            rows_to_drop - Series of the index labels whose fraction is >= 0.5
        '''
        # keep only the rows that contain at least one NaN
        row_has_nan = df.isnull().any(axis=1)
        rows_with_nan = df[row_has_nan]

        # Divide by the frame's real width rather than a hard-coded column count.
        n_cols = df.shape[1]

        # create stat with rows index and its NaN value percentage
        stat = pd.DataFrame()
        stat['row'] = rows_with_nan.index
        stat['na percentage'] = (rows_with_nan.isnull().sum(axis=1) / n_cols).tolist()

        logger.info('Index of rows which have missings values : {}'.format(rows_with_nan.index))
        # return all index has a percentage of missing more than or equal 0.5
        return stat, stat.loc[stat['na percentage'] >= 0.5, 'row']

    @staticmethod
    def na_percentage_in_cols(df):
        '''
        Report the share of missing values for every column.

        df: DataFrame to inspect.
        returns: (stat, cols_to_drop)
            stat         - DataFrame with columns 'col' (column name) and
                           'na percentage' (fraction of missing cells, 0..1)
            cols_to_drop - Series of the column names whose fraction is >= 0.5
        '''
        # create stat with columns and its NaN value percentage
        stat = pd.DataFrame()
        stat['col'] = df.columns
        stat['na percentage'] = df.isna().mean().tolist()

        mostly_missing = stat.loc[stat['na percentage'] >= 0.5, 'col']
        logger.info('features which have missings values : {}'.format(mostly_missing))
        # return all columns name has a percentage of missing more than or equal 0.5
        return stat, mostly_missing

    @staticmethod
    def handle_missings(df):
        '''
        Drop every row and every column in which at least half of the values
        are missing.

        Both percentages are measured on the incoming frame, i.e. before
        anything is dropped.

        df: DataFrame to clean.
        returns: a new DataFrame without the mostly-empty rows and columns.
        '''
        _, rows_to_drop = FeatureEngineering.na_percentage_in_rows(df)
        _, cols_to_drop = FeatureEngineering.na_percentage_in_cols(df)

        # drop all index has a percentage of missing more than or equal 0.5
        df = df.drop(index=rows_to_drop.tolist())
        if cols_to_drop.size != 0:
            # pass the column names themselves, not a list wrapping the Series
            df = df.drop(columns=cols_to_drop.tolist())
        logger.info('Missing data has been handled in dataframe')
        return df

    @staticmethod
    def drop_outliers(df, data_series):
        '''
        Remove the rows whose value in one column is an outlier by the IQR rule.

        A value is an outlier when it lies below Q1 - 1.5*IQR or above
        Q3 + 1.5*IQR.

        df: DataFrame to filter.
        data_series: name of the column the rule is applied to.
        returns: a new DataFrame without the outlier rows.
        '''
        values = df[data_series]
        # calculate Q1,Q3
        Q1, Q3 = values.quantile([0.25, 0.75]).values
        # IQR value
        IQR = Q3 - Q1
        # find limits of the data
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR

        # finding all data outside the limits
        outliers = values[(values < lower_limit) | (values > upper_limit)]
        # drop data outside the limits (by index label)
        df = df.drop(outliers.index)
        logger.info('Outliers Treatment')
        return df


In [None]:
class FeatureSelection:
    '''
    Class FeatureSelection picks the features a random forest considers
    important for predicting the target variable.
    '''
    @staticmethod
    def select_features(df, target_variable):
        '''
        Select important features with sklearn's SelectFromModel and plot the
        importance of every feature.

        df: DataFrame holding the features and the target column.
        target_variable: name of the target column.
        returns: pandas Index with the names of the features whose importance
                 is at least 0.02.

        Side effects: sets the global matplotlib font size to 7, shows a bar
        plot and writes the selection to the log.
        '''
        # split data into train and test; only the train part drives the selection
        x = df.drop(columns=[target_variable])
        y = df[target_variable]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=37)

        # random forest classifier model used for feature selection
        # NOTE(review): a classifier treats every distinct target value as its own
        # class; for a continuous price a RandomForestRegressor may be the better
        # fit - confirm before changing, it would alter the selected features.
        model = rfc(n_estimators=300, n_jobs=-1, random_state=37, min_samples_leaf=50)

        # keep the features whose importance is at least the 0.02 threshold
        sfm = SelectFromModel(model, threshold=0.02)
        sfm.fit(x_train, y_train)
        # get columns name of important features
        selected_features = x_train.columns[(sfm.get_support())]

        # Creating a bar plot to show the importance of every feature.
        # sfm.estimator_ is the forest SelectFromModel already fitted, so the
        # 300 trees do not have to be trained a second time.
        font = {'size'   : 7}
        matplotlib.rc('font', **font)
        feature_imp = pd.Series(sfm.estimator_.feature_importances_,
                                index=x.columns.values).sort_values(ascending=False)
        sns.barplot(x=feature_imp, y=feature_imp.index)
        # Add labels to your graph
        plt.xlabel('Feature Importance Score')
        plt.ylabel('Features')
        plt.title("Visualizing Important Features")
        plt.show()

        logger.info('the important features have been selected as : {}'.format(selected_features))
        # return the name of the important features
        return selected_features
    

In [None]:
# Load the raw dataset (still containing missing values) from Excel
df = pd.read_excel('nadataset.xlsx')
# column names, dtypes and non-null counts of df
df.info()

In [None]:
df.head(10)

In [None]:
df.tail(10)

In [None]:
# random rows; no seed is passed, so the rows shown differ on every run
df.sample(10)

In [None]:
# number of NaN values per column
df.isna().sum()

In [None]:
# fraction of missing values per column;
# `features` holds the names of the columns with >= 0.5 missing
feature_stat,features = FeatureEngineering.na_percentage_in_cols(df) 

In [None]:
feature_stat

In [None]:
# distribution of the per-column missing fraction
feature_stat['na percentage'].hist(bins=10, figsize=(4,4))

In [None]:
features

In [None]:
# fraction of missing values per row (only rows that contain a NaN);
# `rows` holds the index labels of the rows with >= 0.5 missing
rows_stat,rows = FeatureEngineering.na_percentage_in_rows(df) 

In [None]:
rows_stat

In [None]:
rows

In [None]:
# distribution of the per-row missing fraction
rows_stat['na percentage'].hist(bins=10, figsize=(4,4))

In [None]:
# feature engineering >> handle missing values, then arrange features
# (outliers are treated in a later cell)
# NOTE: df is reassigned here and rearrange_features drops the `am` column,
# so running this cell a second time on the same df raises a KeyError.
df=FeatureEngineering.handle_missings(df)
df=FeatureEngineering.rearrange_features(df)

In [None]:
# summary statistics of the first three columns
df.iloc[:,0:3].describe()

In [None]:
# get a plot for each feature
# shows the values of each feature and the corresponding number of observations
# (matplotlib.rc changes the font size globally, for all later figures too)
font = {'size'   : 26}
matplotlib.rc('font', **font)
df.hist(bins=10, figsize=(40,40))
plt.savefig('hist.png')

In [None]:
# outliers detection: one box plot per numeric column of interest
font = {'size'   : 14}

matplotlib.rc('font', **font)
fig, axs = plt.subplots(1,3, figsize = (12,3))
# Pass the column as x= explicitly: a bare positional Series is read as `x`
# by older seaborn releases and as `data` by newer ones, which changes the
# orientation of the plot depending on the installed version.
plt1 = sns.boxplot(x=df['house_price'], ax = axs[0])
plt2 = sns.boxplot(x=df['bedrooms'], ax = axs[1])
plt3 = sns.boxplot(x=df['baths'], ax = axs[2])
plt.tight_layout()

In [None]:
# outliers treatment: remove the rows whose house_price is an IQR outlier,
# then redraw the same three box plots to check the effect
df = FeatureEngineering.drop_outliers(df,'house_price')

font = {'size'   : 14}

matplotlib.rc('font', **font)
fig, axs = plt.subplots(1,3, figsize = (12,3))
# Pass the column as x= explicitly: a bare positional Series is read as `x`
# by older seaborn releases and as `data` by newer ones, which changes the
# orientation of the plot depending on the installed version.
plt1 = sns.boxplot(x=df['house_price'], ax = axs[0])
plt2 = sns.boxplot(x=df['bedrooms'], ax = axs[1])
plt3 = sns.boxplot(x=df['baths'], ax = axs[2])
plt.tight_layout()

In [None]:
# get a plot for each feature (now without the house_price outliers)
# shows the values of each feature and the corresponding number of observations
# NOTE: this writes to 'hist.png' again and therefore overwrites the
# histogram image saved before the outliers were removed.
font = {'size'   : 26}
matplotlib.rc('font', **font)
df.hist(bins=10, figsize=(40,40))
plt.savefig('hist.png')

In [None]:
# check correlation between features as heatmap
font = {'size'   : 10}
matplotlib.rc('font', **font)
plt.figure(figsize = (20, 10))
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")
# Save BEFORE showing: in a notebook the figure is finished once plt.show()
# has displayed it, so a savefig issued afterwards writes an empty image.
plt.savefig('corr1.png')
plt.show()

In [None]:
# pairwise correlation matrix of all columns (including house_price)
corr_matrix = df.corr()

In [None]:
# correlation of every feature with house_price, sorted ascending
corr_matrix[corr_matrix.index == 'house_price'].T.sort_values('house_price')

In [None]:
# draw the full correlation matrix as a heatmap
f, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr_matrix, square=True)

In [None]:
# draw only the correlations above 0.5 as a heatmap
# (every other cell is masked to NaN by the boolean indexing)
f, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr_matrix[corr_matrix> 0.5], square=True)

In [None]:
# features whose correlation with house_price is 0.3 or higher
t = corr_matrix[corr_matrix.index == 'house_price'].T.sort_values('house_price')
t[t['house_price']>=0.3]

In [None]:
# features whose correlation with house_price is -0.3 or lower
t[t['house_price']<=-0.3]

In [None]:
# names of the features whose correlation with house_price is NaN
# (presumably constant columns, for which no correlation is defined - verify)
t[t['house_price'].isna()].index

In [None]:
# drop the columns whose correlation with house_price is NaN and redraw the heatmap
df1 = df.drop(columns=t[t['house_price'].isna()].index)
plt.figure(figsize = (20, 10))
sns.heatmap(df1.corr(), annot = True, cmap="YlGnBu")
# Save BEFORE showing: in a notebook the figure is finished once plt.show()
# has displayed it, so a savefig issued afterwards writes an empty image.
plt.savefig('corr2.png')
plt.show()

In [None]:
# distribution of the target variable house_price
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot / displot - confirm against the installed seaborn version.
sns.distplot(df['house_price'])

In [None]:

# house_price distribution per number of bedrooms (top) and baths (bottom)
# One axes per plot: the previous 2x2 grid left two panels empty.
fig, axs = plt.subplots(2,1, figsize = (12,7))
# x and y are passed by keyword; recent seaborn releases accept only `data`
# positionally and reject two positional Series.
plt1 = sns.violinplot(x=df['bedrooms'], y=df['house_price'], ax = axs[0])
plt2 = sns.violinplot(x=df['baths'], y=df['house_price'], ax = axs[1])

# Figure-level title: plt.title() would only label the last axes created.
fig.suptitle("Violin plot for bedrooms and baths to price")
plt.tight_layout()

#plt.xticks(rotation=45)
plt.savefig('violinplot.png')

In [None]:
# get the most important features for predicting house_price
# (select_features also shows a bar plot of all feature importances;
# compare the selection with the correlation table above)
selected_feat=FeatureSelection.select_features(df,'house_price')

In [None]:
# names of the selected features
selected_feat

In [None]:
class dataSplitter:
    '''
    Class dataSplitter to split data into train and test using selected features.

    Attributes:
        x, y             - the selected feature columns and the target column
        x_train, x_test  - 80% / 20% split of x (random_state=37)
        y_train, y_test  - matching split of y
        scaler           - the fitted scaling pipeline, None until
                           scale_features() has been called
    '''
    def __init__(self,df,target_variable,selected_feat):
        '''
        df: DataFrame holding the features and the target column.
        target_variable: name of the target column.
        selected_feat: names of the feature columns to keep.
        '''
        #self.x = df.drop(target_variable, 1)
        self.x = df[selected_feat]
        self.y = df[target_variable]
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x,self.y,test_size=0.2,random_state=37)
        # filled in by scale_features()
        self.scaler = None

    def scale_features(self):
        '''
        Standardize the features: each column is shifted and scaled to zero
        mean and unit variance (the result is NOT bounded to [-1, 1]).

        The scaler is fitted on x_train only and then applied to x_test, so
        no information from the test split leaks into the scaling.
        x_train and x_test are replaced by the scaled numpy arrays.
        '''
        pipeline = Pipeline([
            ('std_scalar', StandardScaler())
        ])

        # standardize x_train, x_test
        self.x_train = pipeline.fit_transform(self.x_train)
        self.x_test = pipeline.transform(self.x_test)
        # Keep the fitted pipeline so that new data can later be scaled exactly
        # like the training data instead of fitting a fresh scaler on it.
        self.scaler = pipeline

In [None]:
# split df into train/test on the selected features and standardize them
data= dataSplitter(df,'house_price',selected_feat)
data.scale_features()

In [None]:
# distribution of the standardized test features
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot / displot - confirm against the installed seaborn version.
sns.distplot((data.x_test), bins=50);

In [None]:
class Model:
    '''
    Class Model have all validation and evaluation methods with save method
    '''
    def cross_val(self,model,x,y):
        '''
        Mean score of a 10-fold, shuffled cross validation of `model` on (x, y).

        The score is the estimator's default scorer, as used by cross_val_score.
        returns: the mean of the 10 fold scores.
        '''
        cv = KFold(n_splits=10, random_state=1, shuffle=True)
        # evaluate model
        scores = cross_val_score(model, x, y, cv=cv)
        mean_score = scores.mean()
        logger.info('cross_val_score: {}'.format(mean_score))
        return mean_score

    def print_evaluate(self,true, predicted):
        '''
        Compute the evaluation metrics and write them to the log and to stdout.

        true: the observed target values.
        predicted: the model's predictions for the same samples.
        '''
        mae, mse, rmse, r2_square = self.evaluate(true, predicted)
        logger.info('MAE: {}'.format(mae))
        logger.info('MSE: {}'.format(mse))
        logger.info('RMSE: {}'.format(rmse))
        logger.info('R2 Square: {}'.format(r2_square))
        logger.info('__________________________________')

        print('MAE:', mae)
        print('MSE:', mse)
        print('RMSE:', rmse)
        print('R2 Square', r2_square)
        print('__________________________________')

    def evaluate(self,true, predicted):
        '''
        Calculate evaluation metrics for a model.

        returns: tuple (MAE, MSE, RMSE, R2) in that order.
        '''
        mae = mean_absolute_error(true, predicted)
        mse = mean_squared_error(true, predicted)
        # RMSE is the square root of the MSE computed just above
        rmse = np.sqrt(mse)
        r2_square = r2_score(true, predicted)
        return mae, mse, rmse, r2_square

    def SaveModel(self,model,filename='finalized_model.sav'):
        '''
        Save the model as a pickle file on disk.

        model: any picklable object (here: a fitted estimator).
        filename: path of the file to write; an existing file is overwritten.

        To load it again later:
            with open(filename, 'rb') as fh:
                loaded_model = pickle.load(fh)
        '''
        # The with-block closes the file even if pickling fails, so the handle
        # is never leaked and the data is flushed before the method returns.
        with open(filename, 'wb') as fh:
            pickle.dump(model, fh)

In [None]:
class ModelTraining(Model):
    '''
    ModelTraining inherits the evaluation metrics and the save method from
    class Model.

    It trains 7 regression models (linear, random forest, gradient boosting,
    ElasticNet, Lasso, Ridge, k-nearest-neighbours). All but the linear model
    are hyper-parameter tuned on `dataTune` first and then refitted on `data`.
    One row of metrics per trained model is collected in `results_df`.
    '''
    # column layout of one row of results_df
    _RESULT_COLUMNS = ['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation']

    def __init__(self, data, dataTune):
        '''
        data: object with x_train, x_test, y_train, y_test used for training
              and evaluation (a dataSplitter in this notebook).
        dataTune: same kind of object, used only for hyper-parameter tuning.
        '''
        # results of all metrics for all models
        self.results_df = pd.DataFrame()
        # data to use in training
        self.data = data
        # data to use in hyper tuning
        self.dataTune = dataTune

    def _train_and_record(self, model, name, filename, split_word='set', reset_results=False):
        '''
        Fit `model` on the training split, report its metrics, store one
        result row and pickle the model. Shared by every *Train method.

        model: unfitted or pre-tuned estimator; it is (re)fitted here.
        name: label written to the 'Model' column of results_df.
        filename: file the fitted model is pickled to.
        split_word: word used in the printed headings ('set' or 'dataset'),
                    kept so each model prints exactly what it always printed.
        reset_results: when True, results_df is replaced by this model's row
                       instead of being extended.
        returns: the fitted model.
        '''
        # train the model
        model.fit(self.data.x_train, self.data.y_train)
        # predictions on both splits
        test_pred = model.predict(self.data.x_test)
        train_pred = model.predict(self.data.x_train)

        # recording evaluation metrics
        print('Test {} evaluation:\n_____________________________________'.format(split_word))
        self.print_evaluate(self.data.y_test, test_pred)
        print('Train {} evaluation:\n_____________________________________'.format(split_word))
        self.print_evaluate(self.data.y_train, train_pred)

        # NOTE(review): the cross validation score is computed on the test
        # split (x_test / y_test), not on the training data - confirm that
        # this is intended.
        row = pd.DataFrame(
            data=[[name,
                   *self.evaluate(self.data.y_test, test_pred),
                   self.cross_val(model, self.data.x_test, self.data.y_test)]],
            columns=self._RESULT_COLUMNS)

        # add all metrics data to the dataframe results_df
        # (pd.concat replaces DataFrame.append, which pandas 2 removed)
        if reset_results or self.results_df.empty:
            self.results_df = row
        else:
            self.results_df = pd.concat([self.results_df, row], ignore_index=True)

        # save the model
        self.SaveModel(model, filename)
        return model

    ##################### Linear Regression #########################
    def linearRegTrain(self):
        '''
        Train a linear regression model.

        Unlike the other *Train methods this one REPLACES results_df with its
        own row, so it starts a fresh results table.
        returns: the fitted model (also pickled to LinReg.sav).
        '''
        logger.info('Training linear regression model')
        # The `normalize` argument was dropped: False was its default and the
        # parameter no longer exists in current scikit-learn releases.
        model = LinearRegression()
        return self._train_and_record(model, "Linear Regression", 'LinReg.sav',
                                      split_word='dataset', reset_results=True)

    ##################### Random Forest Regressor #########################
    def RandomForestTrain(self):
        '''
        Train a Random Forest Regressor with tuned hyper parameters.
        returns: the fitted model (also pickled to RandForest.sav).
        '''
        logger.info('Training Random Forest Regressor model')
        # get the best hypertuned model for Random Forest Regressor
        model = self.HyperTuneRandomForest()
        return self._train_and_record(model, "Random Forest Regressor", 'RandForest.sav')

    def HyperTuneRandomForest(self):
        '''
        Random Forest Regressor hyper parameters tuning.

        Runs a randomized search (100 candidates, 3-fold CV, all cores) on the
        tuning data.
        returns: the best estimator found.
        '''
        # Number of trees in random forest
        n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
        # Number of features to consider at every split.
        # 1.0 (= all features) replaces 'auto', which meant the same for a
        # regressor and is no longer accepted by current scikit-learn.
        max_features = [1.0, 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap}
        # define object of RandomForestRegressor to hyper tune the parameter
        rf = RandomForestRegressor()
        # Random search of parameters, using 3 fold cross validation,
        # search across 100 different combinations, and use all available cores
        rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
        # Fit the random search model
        rf_random.fit(self.dataTune.x_train, self.dataTune.y_train)
        logger.info('best param for: {}'.format(rf_random.best_params_))
        return rf_random.best_estimator_

    ###################### Gradient Boosting Regressor ####################
    def GradientBoostingTrain(self):
        '''
        Train a Gradient Boosting Regressor with tuned hyper parameters.
        returns: the fitted model (also pickled to GradBoost.sav).
        '''
        logger.info('Training Gradient Boosting Regressor model')
        # get the best model of Gradient Boosting Regressor after hyper parameters tuning
        model = self.HyperTuneGradientBoosting()
        return self._train_and_record(model, "Gradient Boosting Regressor", 'GradBoost.sav',
                                      split_word='dataset')

    def HyperTuneGradientBoosting(self):
        '''
        Gradient Boosting Regressor hyper parameters tuning.

        Exhaustive grid search with 5-fold CV on the tuning data.
        returns: the best estimator found.
        '''
        # hyper parameters
        params = {'n_estimators':[500, 1000, 1500, 2000], 'max_depth':[3, 5, 8],'random_state':[22,37,50]}
        # define object of GradientBoostingRegressor model
        gbr = GradientBoostingRegressor()
        # create GridSearchCV object to search for the best estimator
        gbr_grid = GridSearchCV(gbr, params, cv=5)
        gbr_grid.fit(self.dataTune.x_train, self.dataTune.y_train)
        logger.info('best param for: {}'.format(gbr_grid.best_params_))
        return gbr_grid.best_estimator_

    #################### ElasticNet ####################
    def ElasticNetTrain(self):
        '''
        Train an ElasticNet model; alpha is tuned by grid search on the tuning data.
        returns: the fitted model (also pickled to Elastic.sav).
        '''
        logger.info('Training ElasticNet model')
        # hyperparameters to be tuned
        elastic_params = {'alpha':np.arange(0, 1, 0.01)}
        # hyperparameters tuning using GridSearchCV
        search = GridSearchCV(ElasticNet(), param_grid=elastic_params)
        model = search.fit(self.dataTune.x_train, self.dataTune.y_train).best_estimator_
        return self._train_and_record(model, "ElasticNet Regressor", 'Elastic.sav')

    ##################### Lasso ####################
    def LassoTrain(self):
        '''
        Train a Lasso model; alpha is tuned by grid search on the tuning data.
        returns: the fitted model (also pickled to Lasso.sav).
        '''
        logger.info('Training Lasso model')
        # hyperparameters to be tuned
        lasso_params = {'alpha':np.arange(0, 1, 0.01)}
        # hyperparameters tuning using GridSearchCV
        search = GridSearchCV(Lasso(), param_grid=lasso_params)
        model = search.fit(self.dataTune.x_train, self.dataTune.y_train).best_estimator_
        return self._train_and_record(model, "Lasso Regressor", 'Lasso.sav')

    ##################### Ridge ####################
    def RidgeTrain(self):
        '''
        Train a Ridge model; alpha is tuned by grid search on the tuning data.
        returns: the fitted model (also pickled to Ridge.sav).
        '''
        logger.info('Training Ridge model')
        # hyperparameters to be tuned
        ridge_params = {'alpha':[200, 230, 250,265, 270, 275, 290, 300, 500]}
        # hyperparameters tuning using GridSearchCV
        search = GridSearchCV(Ridge(), param_grid=ridge_params)
        model = search.fit(self.dataTune.x_train, self.dataTune.y_train).best_estimator_
        return self._train_and_record(model, "Ridge Regression", 'Ridge.sav')

    ##################### KNN ####################
    def KNNTrain(self):
        '''
        Train a KNeighbors Regressor; n_neighbors and weights are tuned by
        grid search (scored with r2) on the tuning data.
        returns: the fitted model (also pickled to KNR.sav).
        '''
        logger.info('Training KNeighbors Regressor model')
        # Define hyperparameters
        hp_params = {'n_neighbors': [100,200,300], 'weights': ['uniform','distance']}
        # Search for best hyperparameters and get the best estimator
        search = GridSearchCV(estimator= KNeighborsRegressor(), param_grid=hp_params, scoring='r2')
        model = search.fit(self.dataTune.x_train, self.dataTune.y_train).best_estimator_
        return self._train_and_record(model, "KNeighbors Regressor", 'KNR.sav')

    #######################################################
    def predict(self,model,x,y):
        '''
        Use a trained model to predict and compare against the true values.

        model: fitted estimator.
        x: features to predict on.
        y: true target values for x (must offer .tolist()).
        returns: DataFrame with the columns 'True' and 'Prediction'.
        Side effects: prints and logs the evaluation metrics and logs the frame.
        '''
        pred_y = model.predict(x)
        self.print_evaluate(y,pred_y)
        df = pd.DataFrame()
        df['True'] = y.tolist()
        df['Prediction'] = pred_y.tolist()
        logger.info('Prediction: {}'.format(df))
        return df
        


In [None]:
# prepare data for tuning: a separate workbook is loaded, its Excel index
# column is removed, everything is cast to int and the IQR outliers of
# house_price, bedrooms and baths are dropped one column after the other
d = pd.read_excel('data.xlsx')
d=d.drop(columns=['Unnamed: 0'])
d=d.astype('int')
d=FeatureEngineering.drop_outliers(d,'house_price')
d=FeatureEngineering.drop_outliers(d,'bedrooms')
d=FeatureEngineering.drop_outliers(d,'baths')
dataTune = dataSplitter(d,'house_price',selected_feat)


In [None]:
# scale features of the tuning data
dataTune.scale_features()

In [None]:
# prepare data for training
# (this rebinds `data`, replacing the splitter created in an earlier cell)
data = dataSplitter(df,'house_price',selected_feat)
# scale features
data.scale_features()

In [None]:
# define object of ModelTraining: trains on `data`, tunes on `dataTune`
training = ModelTraining(data,dataTune)

In [None]:
# train Linear Regressor
# NOTE: linearRegTrain replaces training.results_df with its own row, so it
# has to run before the other models for their rows to be kept.
training.linearRegTrain()

In [None]:
# train Gradient Boosting Regressor (grid search first)
training.GradientBoostingTrain()

In [None]:
# train Random Forest Regressor (randomized search first)
training.RandomForestTrain()

In [None]:
# train ElasticNet Regressor
training.ElasticNetTrain()

In [None]:
# train Lasso Regressor
training.LassoTrain()

In [None]:
# train Ridge Regressor
training.RidgeTrain()

In [None]:
# train KNeighbors Regressor
training.KNNTrain()

In [None]:
# show the results of metrics for each model #R2 Square
training.results_df.set_index('Model', inplace=True)
training.results_df['R2 Square'].plot(kind='barh', figsize=(12, 8))

In [None]:
training.results_df

In [None]:
# the best model to use 
#training.results_df.sort_values(by=['R2 Square']).iloc[-1,0]

# Making Predictions

In [None]:
# prepare data: load the cleaned workbook, remove the Excel index column,
# cast to int and drop the IQR outliers of the three numeric columns
d = pd.read_excel('cleanDataSet.xlsx')
d=d.drop(columns=['Unnamed: 0'])
d=d.astype('int')
d=FeatureEngineering.drop_outliers(d,'house_price')
d=FeatureEngineering.drop_outliers(d,'bedrooms')
d=FeatureEngineering.drop_outliers(d,'baths')
# Seeded sample so that the predictions below are the same on every run
# (37 is the seed already used for the train/test splits in this notebook).
d = d.sample(10, random_state=37)
dat = dataSplitter(d,'house_price',selected_feat)
# scale features
# NOTE(review): this fits a new scaler on the 8 training rows of the sample,
# which is not the scaling the saved models were trained with - confirm that
# this is acceptable for the comparison below.
dat.scale_features()

In [None]:
# Load every model saved during training and predict on the sample.
# The gradient boosting model is saved as 'GradBoost.sav', so that is the
# name that has to be loaded here.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by this notebook.
for model_file in ["Elastic.sav", "KNR.sav", "Lasso.sav", "LinReg.sav", "RandForest.sav", "Ridge.sav", "GradBoost.sav"]:
    print(model_file)
    # the with-block closes each file as soon as the model is loaded
    with open(model_file, 'rb') as fh:
        loaded_model = pickle.load(fh)
    print(training.predict(loaded_model,dat.x_test,dat.y_test))
    print('<><><><><><><><><><><><><><><><><><><><><><><><>')
    