In [1]:
import pandas as pd
import numpy as np
import datetime

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import lightgbm

# Processing
from sklearn.model_selection import train_test_split

# parameters search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Scoring
import math
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.pyplot import figure
import os


# Local config
import config


In [2]:
class Data(object):
    """
    Train/test split of a single building's data.

    Rows whose timestamp (column ``config.DATE_TIME``) falls on July 7 of any
    year form the test set; every other row goes to the training set.

    Attributes:
        bldg_name: Name of the building, as given by the caller.
        X_train, y_train: Feature frame and target series for all non-July-7 rows.
        X_test, y_test: Feature frame and target series for the July-7 rows.
        df: Copy of the input frame with ``config.DATE_TIME`` parsed to datetime.
    """
    def __init__(self, bldg_name, df, dropped_cols=None):
        """
        Parameters:
            bldg_name (str): Name of the building this data belongs to.
            df (pd.DataFrame): Data for one building. Must contain the
                ``config.DATE_TIME`` column, the ``config.CHWTON_SQM`` target
                column and every feature column listed below.
            dropped_cols (list, optional): Accepted for backward compatibility;
                currently not used by this class.
        """
        # Work on a copy so the caller's frame is never modified.
        df = df.copy()
        df[config.DATE_TIME] = pd.to_datetime(df[config.DATE_TIME])

        # Build the July 7 mask once and reuse it for both splits.
        timestamps = df[config.DATE_TIME]
        is_test_day = (timestamps.dt.month == 7) & (timestamps.dt.day == 7)

        # Feature columns fed to the models (everything kept except the target).
        feature_cols = ['AirT_Mean', 'AbsH_Mean', 'ShortW_North',
                        'ShortW_East', 'ShortW_South', 'ShortW_West', 'ShortW_Top',
                        'Shade_North', 'Shade_East', 'Shade_South', 'Shade_West',
                        'Shade_Top', 'KW/SQM']

        # Prepare features and target variable. The target is always addressed
        # through config.CHWTON_SQM so the selection and the split agree.
        self.bldg_name = bldg_name
        self.X_train = df.loc[~is_test_day, feature_cols]
        self.y_train = df.loc[~is_test_day, config.CHWTON_SQM]
        self.X_test = df.loc[is_test_day, feature_cols]
        self.y_test = df.loc[is_test_day, config.CHWTON_SQM]
        self.df = df
        

In [3]:
class Model():
    '''
    Pairs a named base estimator with a hyper-parameter search object.

    Attributes:
        name: Label for this model.
        base: The untuned base estimator.
        search_mode: The search mode string passed by the caller.
        best: Slot for the best estimator found; starts as None.
        clf: RandomizedSearchCV when search_mode is "random", otherwise
            GridSearchCV.
    '''
    def __init__(self, name, base_model, param, cv, n_iter, search_mode):
        self.name = name
        self.base = base_model
        self.search_mode = search_mode
        self.best = None

        # Settings common to both kinds of search.
        shared_kwargs = dict(
            estimator=self.base,
            cv=cv,
            verbose=0,
            scoring=config.SCORING,
            n_jobs=config.N_JOBS,
        )

        # Anything other than "random" is treated as a grid search.
        if search_mode != "random":
            self.clf = GridSearchCV(param_grid=param, **shared_kwargs)
            return

        # Randomized search: n_iter samples drawn with a fixed seed.
        self.clf = RandomizedSearchCV(
            param_distributions=param,
            n_iter=n_iter,
            random_state=config.RANDOM_STATE,
            **shared_kwargs,
        )
    

In [4]:
class Scores(object):
    """
    Accumulates one row of scores per (model, building) that is trained.

    Attributes:
        columns: Column names of the score table.
        scores_df: DataFrame holding one row per call to
            train_test_and_store_score, indexed 0..n-1.
    """
    def __init__(self):
        # Initialized scores dataframe to store the scores for all the models trained.
        self.columns = ['model', 'bldg', 'r2_train', 'r2_test', 'rmse_test', 'mbe_test']
        self.scores_df = pd.DataFrame(columns=self.columns)

    def get_MBE(self, y_true, y_pred):
        '''
        Mean bias error: the mean of (prediction - observation).

        Parameters:
            y_true (array-like): Observed values
            y_pred (array-like): Predicted values, same number of elements

        Returns:
            mbe (float): Bias score; positive means over-prediction on average.
        '''
        # Flatten both inputs so a column vector and a flat array subtract
        # element-wise instead of broadcasting to an n-by-n matrix.
        y_true = np.asarray(y_true).reshape(-1)
        y_pred = np.asarray(y_pred).reshape(-1)
        return (y_pred - y_true).mean()

    def train_test_and_store_score(self, model, model_name, data):
        '''
        Fit the model on data.X_train / data.y_train, evaluate it on
        data.X_test / data.y_test and append the scores to scores_df.

        Parameters:
            model: Estimator or hyper-parameter search object exposing fit,
                predict and score.
            model_name (str): Label stored in the 'model' column.
            data: Object exposing bldg_name, X_train, y_train, X_test, y_test.

        Returns:
            The fitted model. For a search object this is its best_estimator_.
        '''
        print("\nmodel_name:", model_name)

        # Train and get r2 scores.
        model.fit(data.X_train, data.y_train)

        # Detect a fitted search object by what it exposes, not by its label:
        # a plain estimator whose name happens to contain "random" or "grid"
        # has no best_params_, and a search object with any other name must
        # still be unwrapped.
        if hasattr(model, "best_estimator_"):
            print("best_params=", model.best_params_)
            # reassign using the best model.
            model = model.best_estimator_

        # Training score from the model's own score() (R^2 for scikit-learn regressors).
        r2_train = model.score(data.X_train, data.y_train)

        # Test and get r2, rmse, and mbe scores.
        y_pred = model.predict(data.X_test)
        r2 = r2_score(data.y_test, y_pred)
        rmse = math.sqrt(mean_squared_error(data.y_test, y_pred))
        mbe = self.get_MBE(data.y_test, y_pred)

        # Store all the scores.
        new_score_data = {
            'model': model_name,
            'bldg': data.bldg_name,
            'r2_train': r2_train,
            'r2_test': r2,
            'rmse_test': rmse,
            'mbe_test': mbe}
        new_score_row = pd.DataFrame([new_score_data], columns=self.columns)

        # Concatenating onto an empty frame triggers a pandas FutureWarning,
        # so the first row simply becomes the table. ignore_index keeps row
        # labels unique instead of every row being labelled 0.
        if self.scores_df.empty:
            self.scores_df = new_score_row
        else:
            self.scores_df = pd.concat([self.scores_df, new_score_row], ignore_index=True)

        # Return the best model.
        return model
    


In [5]:
def plot_by_bldg_and_model(bldg_data, model_name, model, scenarios=None):
    '''
    Plot actual, baseline-predicted and scenario-predicted CHWTON/SQM for
    July 7, one figure per scenario, and save each figure as a PNG.

    Parameters:
        bldg_data: Data object for the building (uses bldg_name, df, X_test, y_test).
        model_name (str): Label used in the plot title and as the output sub-directory.
        model: Fitted estimator exposing predict().
        scenarios (iterable of str, optional): Scenario names to evaluate.
            Defaults to the module-level ``scens`` list.

    Side effects:
        Reads {config.SCENARIOS_DIR_PATH}/{scenario}/{bldg_name}.csv for each
        scenario, prints totals and the percent difference, and writes
        {config.RESULT_DIR_PATH}/{model_name}/{bldg_name}_{scenario}.png.

    Raises:
        ValueError: If a scenario does not provide the same number of July 7
            rows as the baseline test set.
    '''
    if scenarios is None:
        # Fall back to the notebook-level scenario list.
        scenarios = scens

    # Get the prediction for the baseline data (not scenario).
    y_pred = model.predict(bldg_data.X_test)

    # Date_Time for the test set does not depend on the scenario, so compute it once.
    base_timestamps = bldg_data.df[config.DATE_TIME]
    date_time_test = base_timestamps[(base_timestamps.dt.month == 7) & (base_timestamps.dt.day == 7)]

    model_dir = os.path.join(config.RESULT_DIR_PATH, model_name)

    # Get the prediction for scenarios for the bldg.
    for scen in scenarios:
        scen_data = pd.read_csv(f'{config.SCENARIOS_DIR_PATH}/{scen}/{bldg_data.bldg_name}.csv')
        scen_data[config.DATE_TIME] = pd.to_datetime(scen_data[config.DATE_TIME])
        scen_timestamps = scen_data[config.DATE_TIME]
        scen_data = scen_data[(scen_timestamps.dt.month == 7) & (scen_timestamps.dt.day == 7)]
        scen_data = scen_data.drop_duplicates(subset=[config.DATE_TIME])

        X_scen = scen_data[bldg_data.X_test.columns]

        # Get the prediction for scenario data.
        scen_pred = model.predict(X_scen)

        # The scenario is plotted against the baseline timestamps, so both
        # must cover the same number of rows; fail with a clear message.
        if len(scen_pred) != len(date_time_test):
            raise ValueError(
                f"Scenario '{scen}' has {len(scen_pred)} July 7 rows for "
                f"{bldg_data.bldg_name}, but the baseline test set has {len(date_time_test)}.")

        # Create a new DataFrame for plotting
        plot_df = pd.DataFrame({
            config.DATE_TIME: date_time_test,
            'Actual': bldg_data.y_test,
            'Predicted': y_pred,
            'Scenario': scen_pred
        })

        # Calculations
        sum_p = plot_df['Predicted'].sum()
        sum_s = plot_df['Scenario'].sum()
        print("\033[1m" + scen + "\033[0m")
        print('Predicted Total (CHWTON/SQM): ', sum_p)
        print('Scenario Total (CHWTON/SQM):  ', sum_s)
        percent_diff = ((sum_s - sum_p)/sum_p)*100
        print('Percent Difference: ', percent_diff)

        # Sort by Date_Time
        plot_df = plot_df.sort_values(by=config.DATE_TIME)

        # Plotting
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(plot_df[config.DATE_TIME], plot_df['Actual'], label='Actual')
        ax.plot(plot_df[config.DATE_TIME], plot_df['Predicted'], label='Predicted', alpha=0.7)
        ax.plot(plot_df[config.DATE_TIME], plot_df['Scenario'], label=scen, alpha=0.5)
        ax.set_xlabel(config.DATE_TIME)
        ax.set_ylabel(config.CHWTON_SQM)
        ax.set_title(f"{bldg_data.bldg_name}/{model_name}/{scen} Predicted vs. Scenario Comparison percent_diff={percent_diff}\n")
        ax.legend()

        # Ensure the output directory exists (no error if it already does).
        os.makedirs(model_dir, exist_ok=True)

        # Specify the file path for saving the plot
        file_path = os.path.join(model_dir, f"{bldg_data.bldg_name}_{scen}.png")

        # Save the plot, display it, then release the figure so repeated
        # scenarios and buildings do not accumulate open figures.
        fig.savefig(file_path)
        plt.show()
        plt.close(fig)
        
def load_train_test_score_plot(bldg_name, scores):
    """
    Run the whole pipeline for one building.

    Reads {config.BASE_PATH}/{bldg_name}.csv, wraps it in a Data object,
    builds the search models, then for every selected model scores the base
    estimator, scores the search object and plots the scenarios with the
    best estimator. All scores are recorded in ``scores``.
    """
    search_mode = "grid"

    # 1. Load Data
    data = Data(bldg_name, pd.read_csv(f"{config.BASE_PATH}/{bldg_name}.csv"))

    def make_model(name, base, param):
        # Every model shares the same CV folds, iteration budget and search mode.
        return Model(name, base, param, config.CV, config.N_ITER, search_mode=search_mode)

    # 2. Random Forest
    rf = make_model(config.rf_name, config.rf_base, config.rf_param)

    # 3. XGB
    xgb = make_model(config.xgb_name, config.xgb_base, config.xgb_param)

    # 4. LGBM
    lgbm = make_model(config.lgbm_name, config.lgbm_base, config.lgbm_param)

    # 5. CATBOOST
    cb = make_model(config.cb_name, config.cb_base, config.cb_param)

    # Only the random forest is evaluated for now.
    # models = [rf, xgb, lgbm, cb]
    models = [rf]

    for candidate in models:
        # Base model: train, test, score.
        base_label = candidate.name + "_base"
        scores.train_test_and_store_score(candidate.base, base_label, data)

        # CV search: train, test, score; keep the best estimator it returns.
        search_label = candidate.name + "_" + candidate.search_mode
        candidate.best = scores.train_test_and_store_score(candidate.clf, search_label, data)

        # Plot using the best model.
        plot_label = candidate.name + "_" + search_mode
        plot_by_bldg_and_model(data, plot_label, candidate.best)

In [6]:
# Scenario names evaluated for every building, and the shared score collector.
scens = [
    'high_albedo_walls',
    'cool_pavement',
    'trees_surround',
    'wall_shade',
    'pv_sidewalks',
    'pv_rooftop_and_trees',
    'trees_extreme',
    'pv_rooftop',
]
scores = Scores()


# Psychology

In [None]:
# Train, score and plot the "Psychology" building, then display the score table so far.
load_train_test_score_plot("Psychology", scores)
scores.scores_df


model_name: rf_base

model_name: rf_grid


  self.scores_df = pd.concat([self.scores_df, new_score_row])


# Psychology North

In [None]:
# Train, score and plot the "Psychology_North" building, then display the score table so far.
load_train_test_score_plot("Psychology_North", scores)
scores.scores_df

# ISTB 4

In [None]:
# Train, score and plot the "Istb_4" building, then display the score table so far.
load_train_test_score_plot("Istb_4", scores)
scores.scores_df

In [None]:
# Score table accumulated across every building run above.
scores.scores_df
