Use the final csv files for train and test and use various models to train and score them

# 0. Import and Load

In [1]:
import pandas as pd
import numpy as np

# models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import lightgbm

# processing
from sklearn.model_selection import train_test_split

# parameters search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# scoring
import math
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# viz
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.pyplot import figure

# explain
import shap
import datetime

# save model
import pickle
import os
import joblib


# Import constants
import config


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# 1. Get the Train and Test Dataframe

## 1.1 Combine the Three Buildings Data and Save as CSV.

In [2]:
# Function to load and add dummy variable
def load_and_add_dummy(dir_path, file_name, dummy_name):
    print(f"loading from....: {dir_path}/{file_name}.csv")
    df = pd.read_csv(f"{dir_path}/{file_name}.csv")
    df[dummy_name] = 1
    return df

def combine_csv(save_dir_name, save_file_name):
    """
    Function to combine separate buildings csv into 1 csv and add extra 5 columns that sums each of the facade's
    short and long wave radiation. 
    I
    """
    # Load files and add dummy variables
    psychology_north_df = load_and_add_dummy(save_dir_name, "Psychology_North", config.PSYCHOLOGY_NORTH)
    istb_4_df = load_and_add_dummy(save_dir_name, "Istb_4", config.ISTB4)
    psychology_df = load_and_add_dummy(save_dir_name, "Psychology", config.PYSCHOLOGY)
    combined_df = pd.concat([istb_4_df, psychology_north_df, psychology_df], ignore_index=True)

    # Fill NaNs in dummy columns with 0
    dummy_columns = [ config.ISTB4, config.PYSCHOLOGY, config.PSYCHOLOGY_NORTH]
    combined_df[dummy_columns] = combined_df[dummy_columns].fillna(0).astype(int)

    # Convert the 'DateTime' column to datetime type
    combined_df[config.DATE_TIME] = pd.to_datetime(combined_df[config.DATE_TIME])

    # Remove the bldgname column and CHWTON since we are predicting CHWTON/SQM
    combined_df.drop([config.BLDGNAME, "CHWTON"], axis=1, inplace=True)

    # Add columns for the sum of Short and Long wave
    # Sum of shortwave and longwave for Top
    combined_df['SumW_Top'] = combined_df['ShortW_Top'] + combined_df['LongW_Top']

    # Sum of shortwave and longwave for East
    combined_df['SumW_East'] = combined_df['ShortW_East'] + combined_df['LongW_East']

    # Sum of shortwave and longwave for South
    combined_df['SumW_South'] = combined_df['ShortW_South'] + combined_df['LongW_South']

    # Sum of shortwave and longwave for West
    combined_df['SumW_West'] = combined_df['ShortW_West'] + combined_df['LongW_West']

    # Sum of shortwave and longwave for North
    combined_df['SumW_North'] = combined_df['ShortW_North'] + combined_df['LongW_North']

    # Save as csv
    combined_df.to_csv(f"{save_dir_name}/{save_file_name}.csv", index=False)

    
combine_csv(config.BASE_PATH, config.THREE_BLDGS_FILENAME)

loading from....: ../data/dataset2/Psychology_North.csv
loading from....: ../data/dataset2/Istb_4.csv
loading from....: ../data/dataset2/Psychology.csv


## 1.2 Save Train and Test Data as CSV (Test will be all data from July 7th). 

In [3]:
# Load the original csv before broken down to test, train.
three_bldgs_df = pd.read_csv(f"{config.BASE_PATH}/{config.THREE_BLDGS_FILENAME}.csv")

# Set the 'DateTime' column as the index of the DataFrame.
three_bldgs_df.set_index(config.DATE_TIME, inplace=True)

# Get data for July 7th 2023 and save as test_df. 
three_bldgs_df.index = pd.to_datetime(three_bldgs_df.index)
test_df = three_bldgs_df.loc[config.TEST_DATE]

# Get the remaining data for training (all data that's not in test_df).
train_df = three_bldgs_df.drop(test_df.index)

# Save the test and training sets to new CSV files
test_df.to_csv(config.TEST_FILE_PATH)
train_df.to_csv(config.TRAIN_FILE_PATH)

train_df.columns

Index(['KW', 'KW/SQM', 'CHWTON/SQM', 'HTmmBTU', 'HTmmBTU/SQM', 'AirT_North',
       'AirT_East', 'AirT_South', 'AirT_West', 'AirT_Mean', 'RelH_Mean',
       'AbsH_Mean', 'ShortW_North', 'ShortW_East', 'ShortW_South',
       'ShortW_West', 'ShortW_Top', 'LongW_North', 'LongW_East', 'LongW_South',
       'LongW_West', 'LongW_Top', 'Shade_North', 'Shade_East', 'Shade_South',
       'Shade_West', 'Shade_Top', 'Area_North', 'Area_East', 'Area_South',
       'Area_West', 'Area_Top', 'SumW_North', 'SumW_East', 'SumW_South',
       'SumW_West', 'SumW_Top', 'bldgname_ISTB 4', 'bldgname_Psychology North',
       'bldgname_Psychology'],
      dtype='object')

# 2. Data Class

In [4]:
class Data(object):
    """
    This class encapsulates the datas that we will need for training, testing, and scoring.
    It contains getters for the train and test data.
    It also contains a method that creates a df with bldgname column (undummified df) and prediction, actual columns
    """
    def __init__(self, train_file_path, test_file_path, dropped_cols=[]):
        """
        Parameters:
            train_file_path (str) : The file path for the trainning csv file. 
            test_file_path (str) : The path for the test csv file. 
        
        Both train and test datas have columns with buildings already encoded.
        """
        
        # 0. Load training data from csv and drop the Y variable for training.
        self.train_val_df = pd.read_csv(train_file_path, index_col=0)
        dropped_cols.extend([config.CHWTON_SQM])
            
        # 1. Get X and y train and validation dataframe.
        self.X_train_val = self.train_val_df.drop(columns=dropped_cols)
        self.y_train_val = self.train_val_df[config.CHWTON_SQM]  
        X_train, X_val, y_train, y_val = train_test_split(self.X_train_val, 
                                                      self.y_train_val, 
                                                      test_size=0.3, 
                                                      random_state=20)
        
        # 2. Get X and y test data frame.
        self.test_df = pd.read_csv(test_file_path, index_col=0)
        self.X_test = self.test_df.drop(columns=dropped_cols)
        self.y_test = self.test_df[config.CHWTON_SQM] 
        
        # 3. Undummify df to get data frame that only has "index" and "bldgname" which we will use for plotting.
        # Drop all the columns apart from 'bldgname' and use date_time as index.
        cols_to_keep = [config.BLDGNAME]
        
        # A. Undummify for Validation
        self.df_undum_val = self.undummify(X_val)
        self.df_undum_val = self.df_undum_val.drop(columns=self.df_undum_val.columns.difference(cols_to_keep))
        
        # B. Undummify for Test
        self.df_undum_test = self.undummify(self.X_test)
        self.df_undum_test = self.df_undum_test.drop(columns=self.df_undum_val.columns.difference(cols_to_keep))

        
        # 4. Finalise X train and X test data by removing redundant ISTB4 column since it's dummified.
#         dropped_cols = [ISTB4]
#         self.X_train_val = self.train_val_df.drop(columns=dropped_cols)
#         self.X_test = self.test_df.drop(columns=dropped_cols)
            
        print("final X train val=", self.X_train_val.columns)

        
    def undummify(self, df, prefix_sep="_"):
            """
            Inner function to undummify pandas df.
            
            Return:
                undummified_df: the undummied df containing only date_time as index and bldgname column
            """
            # 1. drop all columns apart from thos that has bldgname.
            # List of all column names
            all_columns = df.columns

            # List of columns that start with 'bldgname'
            bldgname_columns = [col for col in all_columns if col.startswith('bldgname')]

            # Drop all columns except those that start with 'bldgname'
            df = df[bldgname_columns]
            
            cols2collapse = {
                item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
            }
            series_list = []
            for col, needs_to_collapse in cols2collapse.items():
                if needs_to_collapse:
                    undummified = (
                        df.filter(like=col)
                        .idxmax(axis=1)
                        .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                        .rename(col)
                    )
                    series_list.append(undummified)
                else:
                    series_list.append(df[col])
            undummified_df = pd.concat(series_list, axis=1)
            return undummified_df
        
    def get_xy_trainval(self):
        """
        Return the X and y for training data which we can split to train and validation data later.
        """
        return self.X_train_val, self.y_train_val
    
    def get_xy_test(self):
        """
        Return the X and y for test data
        """
        return self.X_test, self.y_test
    
    def create_prediction_actual_df_input(self, y_pred, y_actual, df):
        df_undum = self.undummify(df)

        # 1. Add new predict column.
        first_plot_name = "Predicted"
        df_undum[first_plot_name] = y_pred
        
        # 2
        second_plot_name = "Actual"
        df_undum[second_plot_name] = y_actual
 

        df_bldgs = []
        # 4. split df by bldgname
        for bldgname in df_undum[config.BLDGNAME].unique():
            # - get the df by name
            df_bldg = df_undum[df_undum[config.BLDGNAME] == bldgname]

            # - only get bldgname, predicted, and actual column.
            df_bldg = df_bldg[[config.BLDGNAME, first_plot_name, second_plot_name]]

            # - save this df in list
            df_bldgs.append(df_bldg)

        return [df_undum, df_bldgs]
    
    def create_prediction_actual_df(self, y_pred, y_actual, is_val=False):
        """
        Create 4 dataframes that consist of "bldgname, base_prediction, scenario_prediction/actual" columns. 
        it will be returned in the following format:
        [df_combined, [df_bldg1, df_bldg2, df_bldg3]]
        """
        
        # 1. Create undumify df we want to use.
        if is_val:
            df_undum = self.df_undum_val.copy()
        else:
            df_undum = self.df_undum_test.copy()


        # 2. Add new predict column.
        first_plot_name = "Predicted"
        df_undum[first_plot_name] = y_pred

        
        # 3. set the second column
        second_plot_name = "Actual"
        df_undum[second_plot_name] = y_actual
        
        # 4. split df by bldgname
        df_bldgs = []
        for bldgname in df_undum[config.BLDGNAME].unique():
            # - get the df by name
            df_bldg = df_undum[df_undum[config.BLDGNAME] == bldgname]
            
            # - only get bldgname, predicted, and actual column.
            df_bldg = df_bldg[[config.BLDGNAME, first_plot_name, second_plot_name]]
            
            # - save this df in list
            df_bldgs.append(df_bldg)
            
            
        return [df_undum, df_bldgs]

# 3. Train Test Class

In [5]:
class TrainTest(object):
    """
    This class encapsulates the the training, testing, and plotting process.
    It stores the train and test datas that's already split to X and y.
    It also stores all the models that has been trained with this dataset.
    """
    def __init__(self, X_train_val, y_train_val, X_test, y_test):
        
        # - scores_df to display the scores for all our models
        self.columns=['model','r2_val', 'r2_test', 'rmse_test','mbe_test']
        self.scores_df= pd.DataFrame(columns=self.columns)
        
        # - train and test data
        self.X_train_val = X_train_val
        self.y_train_val = y_train_val
        self.X_test = X_test
        self.y_test = y_test
        self.model_list = []
            
    def get_scores_df(self):
        return self.scores_df
    
    
    def train_and_store_score(self, model, model_name):
        """
        This function will train the model given as parameter using the training data. It will compute the r2 validation score
        and append this as a new row to scores_df.
        
        Parameters:
            model (regressor model) : The model object that will be trained and used in validation.
                It can be RF, XGB, LGBM, or catboost regressor
                
            model_name (str) : the name of the model displayed in scores_df
            
        Return:
            model : This is relevant if we are doing randomized search. it will return the best model
        """
        print("\nmodel_name:", model_name)
        # 1. Train-Val Split
        X_train, X_val, y_train, y_val = train_test_split(self.X_train_val, 
                                                          self.y_train_val, 
                                                          test_size=0.3, 
                                                          random_state=20)
 
        # 2. Fit model and time the training time.
        start_time = datetime.datetime.now()

        model.fit(X_train, y_train)
        end_time = datetime.datetime.now()
        
        dur_s = (end_time - start_time).total_seconds()

        print("training duration:", dur_s)
        
        # - Get best params if it's a random or grid search
        if(("random" in model_name)) or ("grid" in model_name):
            model = model.best_estimator_
            
        # 3. Get validation R2 score.
        val_r2 = model.score(X_val, y_val)
        
        # 4. Store the training scores.
        new_row_data = {'model':model_name, 
                        "r2_val":val_r2, 
                        "r2_test":0, 
                        'rmse_test':0, 
                        'mbe_test':0, 
                        'train_time_s': dur_s}
        new_row = pd.DataFrame.from_records([new_row_data])
        self.scores_df = pd.concat([self.scores_df, new_row])
        
        return model
    
    def get_MBE(self, y_true, y_pred):
        '''
        Parameters:
            y_true (array): Array of observed values
            y_pred (array): Array of prediction values

        Returns:
            mbe (float): Bias score
        '''
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        y_true = y_true.reshape(len(y_true),1)
        y_pred = y_pred.reshape(len(y_pred),1)   
        diff = (y_pred-y_true)
        mbe = diff.mean()
        return mbe

        
    def test_and_store_score(self, model, model_name):
        """
        This function will use the given trained model to compute the y_pred using the X_test data.
        It will then compute the mbe, r2, and rmse result and insert it to scores_df. 
        
        Parameters:
            model (regressor model): The model that has been trained and will be used to predict y using the test data
                It can be RF, XGB, LGBM, or catboost regressor
            model_name (string): the name of the model displayed in scores_df
        """
        
        # print(model.get_params())
        # 1. Get prediction for the test data
        y_pred = model.predict(self.X_test)
        
        # 2. Get the three scores
        r2 = r2_score(self.y_test, y_pred)
        rmse = math.sqrt(mean_squared_error(self.y_test, y_pred))
        mbe = self.get_MBE(self.y_test, y_pred)
        
        # 3. Update scores_df with the 3 scores above
        row_to_update = self.scores_df["model"] == model_name
        
        print(f"test_score: r2={r2}, rmse={rmse}, mbe={mbe}")
        
        if row_to_update.empty:
            # Insert new row with all the values and test scores if it doesn't exist yet.
            new_row_data = {
                'model':model_name, 
                "r2_val":0, 
                "r2_test":r2, 
                'rmse_test':rmse, 
                'mbe_test':mbe, 
                'train_time_s': 0}
            new_row = pd.DataFrame.from_records([new_row_data])
            self.scores_df = pd.concat([self.scores_df, new_row])
            
        else:
            # Append just the test score.
            col_to_update = ['r2_test','rmse_test', 'mbe_test']
            self.scores_df.loc[row_to_update, col_to_update] = [r2, rmse, mbe]
        
        
    def train_test_models(self, model_list):
        """
        This function will take a list of models and train them. if the model is from randomized search, it will
        return the model with best params. This best model will be used to test and store score.
        
        Parameters:
            model_list (regressor models): list of all models that will be trained, tested, and plot the SHAP values
                It can be RF, XGB, LGBM, or catboost regressor        
        """
        
        for model_name, model in model_list:
            # - We need to do this in case its randomized search.
            best_model = self.train_and_store_score(model, model_name)
            
            # - Assign the model to model_list.
            self.model_list.append((best_model, model_name))
            
            self.test_and_store_score(best_model, model_name)
            
    
    def get_all_models(self):
        return self.model_list
        

# 5. Model Class

In [6]:
class Model():
    def __init__(self, name, base_model, random_grid, cv, n_iter):
        # - base model
        self.base_name = name + "_base"
        self.base = base_model
        
        # - randomized search model
        self.random_name = name + "_random"
        self.random = RandomizedSearchCV(
                            estimator = self.base,
                            param_distributions = random_grid,
                            n_iter = n_iter,
                            cv = cv,
                            verbose = 0,
                            random_state = config.RANDOM_STATE,
                            scoring ='r2',
                            n_jobs = -1)
        
        # - grid search model
        self.grid_name = name + "_grid"
        self.reg_grid = None
        
        
    def set_get_reg_grid(self, grid_param):
        self.reg_grid = GridSearchCV(
                        estimator =self.base,
                        param_grid = grid_param,
                        cv = self.cv,
                        scoring ='r2',
                        n_jobs = -1)
        
        return self.reg_grid
    
    
    # model getters
    def get_base_model(self):
        return self.base

    def get_random_model(self):
        return self.random
    
    def get_grid_model(self):
        return self.grid
    
    # name getters
    def get_base_name(self):
        return self.base_name
    
    def get_random_name(self):
        return self.random_name

    
    def get_grid_name(self):
        return self.grid_name
    
    

# 7. Train, Test, and Save Scores for All Models.

## Note:
1. Drop istb_4 column since its redundant for all models apart from RF.
2. For each of the models, we have base model and randomized tuned model.

## 7.1. Train, Test, and Save Scores for Random Forest.

### 7.1.0 Prepare the Combinations of Columns to Drop.

In [7]:
# Additional variables on top of those from first publication.
EXTRA_VARIABLES=[
    'KW/SQM', 
    'HTmmBTU/SQM',
    'RelH_Mean',   
    'ShortW_Top', 
    'Shade_Top', 
    'AirT_North', 'AirT_East', 'AirT_South', 'AirT_West',
    'LongW_North', 'LongW_East', 'LongW_South', 'LongW_West', 'LongW_Top', 
    'SumW_North', 'SumW_East', 'SumW_South', 'SumW_West', 'SumW_Top',
    'Area_North', 'Area_East', 'Area_South','Area_West', 'Area_Top', 
]


# Declare lists of columns that we want to drop for each train-test execution and 
# the title to summarize the columns we are dropping.
# Rel Humidity
REL_HUMID_COLUMN = ['RelH_Mean']

# KW and HTTMBTU
KW_SQM_COLUMN = ['KW/SQM']
HTMMBTU_SQM_COLUMN = ['HTmmBTU/SQM']
KW_COLUMN = ['KW']
HTMMBTU_COLUMN = ['HTmmBTU']

# Top
SHORT_WAVE_TOP_COLUMN = ['ShortW_Top']
SHADE_TOP_COLUMN = ['Shade_Top']

# AirTemp
AIR_TEMP_FACADE_COLUMNS = ['AirT_North', 'AirT_East', 'AirT_South', 'AirT_West']
AREA_COLUMNS = ['Area_North', 'Area_East', 'Area_South', 'Area_West', 'Area_Top']

# Long and Short wave
LONG_WAVE_COLUMNS = ['LongW_North', 'LongW_East', 'LongW_South', 'LongW_West', 'LongW_Top']
SHORT_WAVE_COLUMNS = ['ShortW_North','ShortW_East', 'ShortW_South', 'ShortW_West']
SUM_WAVE_COLUMNS = ['SumW_North', 'SumW_East', 'SumW_South', 'SumW_West', 'SumW_Top']

# We also want to test not using short wave and just use sum of short and long wave instead.
EXTRA_AND_SHORT_WAVE = EXTRA_VARIABLES + SHORT_WAVE_COLUMNS

# To include.
sumWave_area_airtempFacade_shortWaveTop_shadeTop = SUM_WAVE_COLUMNS + AREA_COLUMNS + AIR_TEMP_FACADE_COLUMNS + SHORT_WAVE_TOP_COLUMN + SHADE_TOP_COLUMN


dropped_title_columns_list = [

    # OPTION1: Use SQM for HTMMBTU and KW.    
    # ("area_dropped", AREA_COLUMNS),
    
    # ("airtemp_facade_dropped", AIR_TEMP_FACADE_COLUMNS + AREA_COLUMNS),

    ("ali_proposal", AIR_TEMP_FACADE_COLUMNS + AREA_COLUMNS + SUM_WAVE_COLUMNS + LONG_WAVE_COLUMNS + HTMMBTU_COLUMN + HTMMBTU_SQM_COLUMN + KW_COLUMN + REL_HUMID_COLUMN ),
    
    # Use sumwave columns.
    # ("area_airtemp_facade_dropped_3", AIR_TEMP_FACADE_COLUMNS + AREA_COLUMNS + HTMMBTU_COLUMN + KW_COLUMN ),

    # OPTION2: Use NON-SQM for HTMMBTU and KW (As in the previous publication).
    # 3. Use non sqm for htmmbtu and kw
    # ("area_airtemp_facade_dropped_4", AIR_TEMP_FACADE_COLUMNS + AREA_COLUMNS + HTMMBTU_SQM_COLUMN + KW_SQM_COLUMN),

    # 4. Drop all columns in extra variables except for relative humidity.
    # ("match_previous_data_incl_rel_humid", [col for col in EXTRA_VARIABLES if col !=REL_HUMID_COLUMN[0]]),
    
    # 5. Drop all columns in extra variables except for the sumwave columns.
    # ("match_previous_data_incl_sumwave", [col for col in EXTRA_VARIABLES if col not in SUM_WAVE_COLUMNS]),
    
    # 6. Drop all columns in extra variables except for sumwave, area, airtemp facade, shortWave top and shade top. 
    # ("match_previous_data_incl_sumwave_area_airtempFacade_shortWTop_shadeTop", [col for col in EXTRA_VARIABLES if col not in sumWave_area_airtempFacade_shortWaveTop_shadeTop]),

    # 7. Drop all columns in extra variables.
    # ("match_previous_data", EXTRA_VARIABLES ), 
]


In [8]:
# Intialize all models with base and randomized search version.
RF = Model(config.rf_name, config.rf_base, config.rf_param, config.CV, config.N_ITER)
XGB = Model(config.xgb_name, config.xgb_base, config.xgb_param, config.CV,config. N_ITER)
LGBM = Model(config.lgbm_name, config.lgbm_base, config.lgbm_param, config.CV, config.N_ITER)
CB = Model(config.cb_name, config.cb_base, config.cb_param, config.CV, config.N_ITER)
# model_objects = [RF, XGB, LGBM, CB]
model_objects = [CB]

# Iterate through each of the dropped columns list and execute train, test, and save scores to csv.
for dropped_title, dropped_cols in dropped_title_columns_list:
    print("\nDropped title: ", dropped_title)
    
    # 0. Initialize data objects.
    data_obj = Data(config.TRAIN_FILE_PATH, config.TEST_FILE_PATH, dropped_cols)
        
    # 1. Get the validation and test data.
    X_train_val, y_train_val = data_obj.get_xy_trainval()
    X_test, y_test = data_obj.get_xy_test()
    print(X_train_val)
    
    # 2. Init the trainTest object with the loaded datas.
    tt = TrainTest(X_train_val, y_train_val, X_test, y_test)
    
    # 3. Get all models.
    all_base_random = []
    for model in model_objects:
        # Get the base and randomized search version of each model.
        base_name, base_model = model.get_base_name(), model.get_base_model()
        random_name, random_model = model.get_random_name(), model.get_random_model()
        all_base_random.extend([(base_name, base_model), (random_name, random_model)])


    # 4. Train and test.
    tt.train_test_models(all_base_random)
    
    # 5. Save results. 
    scores_df = tt.get_scores_df()
    scores_df.to_csv(f"{config.RESULT_DIR_PATH}/scores_{dropped_title}.csv", index=False)
    


Dropped title:  ali_proposal
final X train val= Index(['KW/SQM', 'AirT_Mean', 'AbsH_Mean', 'ShortW_North', 'ShortW_East',
       'ShortW_South', 'ShortW_West', 'ShortW_Top', 'Shade_North',
       'Shade_East', 'Shade_South', 'Shade_West', 'Shade_Top',
       'bldgname_ISTB 4', 'bldgname_Psychology North', 'bldgname_Psychology'],
      dtype='object')
                       KW/SQM  AirT_Mean  AbsH_Mean  ShortW_North  \
Date_Time                                                           
2023-05-03 05:00:00  0.041512  22.848720   4.987361           0.0   
2023-05-03 05:15:00  0.042342  22.670915   5.452283           0.0   
2023-05-03 05:30:00  0.041332  22.377459   5.329339           0.0   
2023-05-03 05:45:00  0.041755  22.221263   5.312129           0.0   
2023-05-03 06:00:00  0.041877  22.055966   5.288323           0.0   
...                       ...        ...        ...           ...   
2023-07-18 13:30:00  0.025110  43.867376   9.304210           0.0   
2023-07-18 13:45:00  0.02

The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


training duration: 78.863169
test_score: r2=0.9509618951809503, rmse=0.0011825350641268, mbe=0.0001800143597989586


# Test getting important features.

In [None]:
# # 3. init model object
# RF = Model(rf_name, rf_base, rf_random_grid, CV, N_ITER)
# XGB = Model(xgb_name, xgb_base, xgb_random_grid, CV, N_ITER)
# LGBM = Model(lgbm_name, lgbm_base, lgbm_random_grid, CV, N_ITER)
# # CB = Model(catboost_name, catboost_base, catboost_random_grid, CV, N_ITER)

# all_base_random = []

# # 5. add the rest
# # model_objects = [XGB, LGBM, CB]
# model_objects = [RF, XGB, LGBM]
# for model in model_objects:
#     base_name, base_model = model.get_base_name(), model.get_base_model()
#     random_name, random_model = model.get_random_name(), model.get_random_model()
#     all_base_random.extend([(base_name, base_model), (random_name, random_model)])


# # 6. train, test all models
# tt.train_test_models(all_base_random)

# 9. Save all Models

In [None]:
all_models = tt.get_all_models()
# set file 
lgbm = None
for model, name in all_models:
    print("name=", name)
    # 1. Print the actual scores.
    tt.test_and_store_score(model, name)

    # 2. Create the path to save the models.
    save_path = config.MODEL_DIR_PATH
    isExist = os.path.exists(save_path)

    if not isExist:
       # Create a new directory if it does not exist
       os.makedirs(save_path)

    # 3. Save the models.
    if "lgbm" in name:
        print("lgbm>>")
        # Set the filename.
        filename = name + ".pkl"

        #  Save to the model to filepath.
        joblib.dump(model, f"{save_path}/{filename}")


        # Reload and check if the scores matches.
        model = joblib.load(save_path + filename)
        tt.test_and_store_score(model, name)

    else:
        # Set the filename.
        filename = name + ".sav"

        # Save to the model to filepath
        pickle.dump(model, open(f"{save_path}/{filename}", 'wb'))


    