XGBoost model trained on data from 3 buildings only

# 1. Import and Load

In [1]:
import pandas as pd
import numpy as np

# models
from sklearn.ensemble import RandomForestRegressor
# uncomment to install the three models below >>>>>
# !pip3 install catboost
# !pip install lightgbm
# !pip3 install xgboost
# <<<<<<<<<<<<<<<<<<<<<<<<
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# processing
from sklearn.model_selection import train_test_split

# parameters search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# scoring
import math
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# CSV that main() passes to Data as the train/validation set.
TRAIN_PATH = "../Data/microclimate_model/Combined/three_bldgs_dropped.csv"
# CSV that main() passes to Data as the held-out test set
# (described in Data.get_xy_test as the June 9th test data).
TEST_PATH = "../Data/microclimate_model/Combined/three_bldgs_J9_dropped.csv"


# 2. Data Class

In [3]:

"""
This class encapsulates the data that we will need for training and testing.
It only contains getters for the train and test data.
"""
class Data(object):
    """Load the train/validation and test CSV files and split each into X and y.

    Each file is read with its first column as the index. The target column
    is separated from the remaining columns, which are kept as the features.
    """

    def __init__(self, train_path, test_path, target_col='CHWTON/SQM'):
        """
        Parameters:
            train_path (str) : The file path for the training csv file.
            test_path (str) : The path for the test csv file.
            target_col (str) : Name of the column to predict. Defaults to
                'CHWTON/SQM', so existing two-argument calls are unchanged.

        Raises:
            KeyError : if target_col is missing from either csv file.

        Per the original notes, both files are expected to have 16 columns
        with buildings already encoded (this is not checked here).
        """
        self.target_col = target_col

        # - Train and validation data
        (self.train_val_df,
         self.X_train_val,
         self.y_train_val) = self._load_xy(train_path)

        # - Test data
        self.test_df, self.X_test, self.y_test = self._load_xy(test_path)

    def _load_xy(self, csv_path):
        """Read csv_path and return (full frame, feature frame, target series)."""
        df = pd.read_csv(csv_path, index_col=0)
        if self.target_col not in df.columns:
            # Fail with the file and column named instead of pandas' bare
            # "not found in axis" message.
            raise KeyError("Target column {!r} not found in {!r}".format(
                self.target_col, csv_path))
        return df, df.drop(columns=[self.target_col]), df[self.target_col]

    def get_xy_trainval(self):
        """
        Return the X and y for training data which we can split to train and validation data later.
        """
        return self.X_train_val, self.y_train_val

    def get_xy_test(self):
        """
        Return the X and y for June 9th test data
        """
        return self.X_test, self.y_test
        

# 3. Train Test Class

In [None]:

"""
This class encapsulates the training and testing process.
It stores the train and test data that are already split into X and y.
"""
class TrainTest(object):
    """Train regressors, score them, and collect the results in one table.

    The table (scores_df) has one row per recorded model with the columns
    'model', 'r2_val', 'r2_test', 'rmse_test' and 'mbe_test'. Metrics that
    have not been computed yet are stored as NaN.
    """

    def __init__(self, X_train_val, y_train_val, X_test, y_test):
        """
        Parameters:
            X_train_val : features that are later split into train and validation sets
            y_train_val : target values matching X_train_val
            X_test : features of the held-out test set
            y_test : target values matching X_test
        """
        # - scores_df to display the scores for all our models
        self.columns = ['model', 'r2_val', 'r2_test', 'rmse_test', 'mbe_test']
        self.scores_df = pd.DataFrame(columns=self.columns)

        # - train and test data
        self.X_train_val = X_train_val
        self.y_train_val = y_train_val
        self.X_test = X_test
        self.y_test = y_test

    def get_scores_df(self):
        """Return the DataFrame holding the scores recorded so far."""
        return self.scores_df

    def _add_score_row(self, model_name, r2_val):
        """Append a row for model_name with r2_val set and the test metrics left as NaN."""
        row = dict.fromkeys(self.columns, np.nan)
        row['model'] = model_name
        row['r2_val'] = r2_val
        new_row = pd.DataFrame([row], columns=self.columns)
        if self.scores_df.empty:
            # Concatenating onto an empty frame is deprecated in recent pandas
            # and would force object dtype, so start the table from this row.
            self.scores_df = new_row
        else:
            # ignore_index keeps the row labels unique (0, 1, 2, ...) instead
            # of every row being labelled 0.
            self.scores_df = pd.concat([self.scores_df, new_row],
                                       ignore_index=True)

    def train_and_get_score(self, model, model_name, test_size=0.3, random_state=20):
        """
        This function will use the training data to train the model, get the r2 validation score
        and append a new row to scores_df.

        Parameters:
            model (regressor model) : The model object that will be trained and used in validation.
                It can be RF, XGB, LGBM, or catboost regressor

            model_name (str) : the name of the model displayed in scores_df

            test_size (float) : fraction of the train/val data held out for validation.
                Defaults to 0.3.

            random_state (int) : seed for the train/validation split. Defaults to 20.
        """
        # 1. Train-Val Split
        X_train, X_val, y_train, y_val = train_test_split(self.X_train_val,
                                                          self.y_train_val,
                                                          test_size=test_size,
                                                          random_state=random_state)

        # 2. fit model that already has parameters
        model.fit(X_train, y_train)

        # 3. get validation R2 score
        val_r2 = model.score(X_val, y_val)

        # 4. store score; the test metrics stay NaN until test_and_get_score
        #    fills them in, so an unevaluated model cannot be mistaken for one
        #    that really scored 0.
        self._add_score_row(model_name, val_r2)

    def get_MBE(self, y_true, y_pred):
        '''
        Compute the mean bias error, i.e. the mean of (y_pred - y_true).

        Parameters:
            y_true (array): Array of observed values
            y_pred (array): Array of prediction values

        Returns:
            mbe (float): Bias score
        '''
        # Flatten both inputs so that an (n,) array and an (n, 1) array are
        # subtracted element-wise instead of broadcasting to an (n, n) matrix.
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        return (y_pred - y_true).mean()

    def test_and_get_score(self, model, model_name):
        """
        This function will use the test data and a trained model to compute the y_pred
        and get the mbe, r2, and rmse result and insert it to scores_df

        If scores_df has no row for model_name yet (the model was not recorded through
        train_and_get_score), a new row is added with r2_val left as NaN so that the
        test results are not silently dropped.

        Parameters:
            model (regressor model): The model that has been trained and will be used to predict y using the test data
                It can be RF, XGB, LGBM, or catboost regressor
            model_name (string): the name of the model displayed in scores_df

        """
        # 1. Get prediction for the test data
        y_pred = model.predict(self.X_test)

        # 2. get the three scores
        r2 = r2_score(self.y_test, y_pred)
        rmse = math.sqrt(mean_squared_error(self.y_test, y_pred))
        mbe = self.get_MBE(self.y_test, y_pred)

        # 3. update scores_df with the 3 scores above
        row_to_update = self.scores_df["model"] == model_name
        if not row_to_update.any():
            self._add_score_row(model_name, np.nan)
            row_to_update = self.scores_df["model"] == model_name
        col_to_update = ['r2_test', 'rmse_test', 'mbe_test']
        self.scores_df.loc[row_to_update, col_to_update] = [r2, rmse, mbe]
In [None]:
def main():
    """Train and evaluate the four baseline regressors, then print the score table."""
    # Load both csv files and hand the X/y pairs to the train/test harness
    dataset = Data(TRAIN_PATH, TEST_PATH)
    features_trainval, target_trainval = dataset.get_xy_trainval()
    features_test, target_test = dataset.get_xy_test()
    tt = TrainTest(features_trainval, target_trainval, features_test, target_test)

    # (name shown in scores_df, regressor class, constructor arguments),
    # listed in the order the models are trained and tested
    baselines = [
        ("RF_base", RandomForestRegressor, {"n_estimators": 100, "random_state": 42}),
        ("XGB_base", XGBRegressor, {"n_estimators": 100, "random_state": 42}),
        ("LGBM_base", LGBMRegressor, {"random_state": 42}),
        ("catboost_base", CatBoostRegressor, {"random_state": 42, "verbose": False}),
    ]

    # Each model is built right before it is trained, then scored on the test data
    for display_name, regressor_cls, params in baselines:
        regressor = regressor_cls(**params)
        tt.train_and_get_score(regressor, display_name)
        tt.test_and_get_score(regressor, display_name)

    # display scores_df
    print(tt.get_scores_df())

In [None]:
# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()