In [3]:
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import seaborn
import os
from typing import Any

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

# Feature importance library
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_regression
from feature_engine.selection import SelectByShuffling
from feature_engine.selection import SelectBySingleFeaturePerformance


from joblib import dump
from joblib import load
# Local config
import config_dataset3 as config

In [15]:

import pandas as pd
import os

# Load the per-building DataFrames (with a 'Scenario' column) from the pickle file.
# `dfs` is consumed below through `.items()` as (building name, DataFrame) pairs.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
PKL_PATH = "../data/dataset3/dfs_radx_scen.pkl"
dfs = pd.read_pickle(PKL_PATH)

# Normalise a scenario name for use as a folder name (lower-casing only)
def to_lower_camel(s):
    """Return ``s`` converted to lower case.

    Despite the name, no camel-casing is applied: separators such as spaces
    or underscores are left exactly as they are.
    """
    lowered = s.lower()
    return lowered

# Save every building's dataframe as one CSV per scenario (folder name = lower-cased scenario)
def save_dfs_as_csv(dfs, base_path="../data/dataset3/scenarios"):
    """Split each building's DataFrame by scenario and write every part to CSV.

    Output layout is ``<base_path>/<scenario>/<building>.csv``: the scenario
    folder name is produced by ``to_lower_camel`` and spaces in the building
    name are replaced with underscores.

    Args:
        dfs: Mapping of building name -> DataFrame. Each DataFrame must
            contain a 'Scenario' column.
        base_path: Root directory for the scenario folders; missing folders
            are created.

    The DataFrames held in ``dfs`` are not modified.
    """
    for building_name, building_df in dfs.items():
        building_name = building_name.replace(" ", "_")
        print("bldg_name", building_name)

        # Separate names for the building frame and each scenario slice, so the
        # frame being grouped is never rebound while its groups are iterated.
        for scenario_name, scenario_df in building_df.groupby('Scenario'):
            # Turn the index into a regular column (the CSV is written with
            # index=False) and rename every RadX* column to ProductW*.
            scenario_df = scenario_df.reset_index().rename(
                columns=lambda col: col.replace('RadX', 'ProductW')
            )

            # One directory per scenario, created on first use.
            dir_path = os.path.join(base_path, to_lower_camel(scenario_name))
            os.makedirs(dir_path, exist_ok=True)

            file_path = os.path.join(dir_path, f"{building_name}.csv")
            print(scenario_df.columns)
            scenario_df.to_csv(file_path, index=False)

# Write one CSV per (scenario, building) pair under the function's default base path
save_dfs_as_csv(dfs)

bldg_name Psychology
Index(['Date_Time', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Building',
       'Scenario', 'KW', 'KW/SQM', 'CHWTON', 'CHWTON/SQM', 'HTmmBTU',
       'HTmmBTU/SQM', 'AirT_Top', 'AirT_North', 'AirT_East', 'AirT_South',
       'AirT_West', 'AirT_Mean', 'RelH_Top', 'RelH_North', 'RelH_East',
       'RelH_South', 'RelH_West', 'RelH_Mean', 'AbsH_Top', 'AbsH_North',
       'AbsH_East', 'AbsH_South', 'AbsH_West', 'AbsH_Mean', 'Wind_Top',
       'Wind_North', 'Wind_East', 'Wind_South', 'Wind_West', 'Wind_Mean',
       'AirP_Top', 'AirP_North', 'AirP_East', 'AirP_South', 'AirP_West',
       'ShortW_Top', 'ShortW_North', 'ShortW_East', 'ShortW_South',
       'ShortW_West', 'LongW_Top', 'LongW_North', 'LongW_East', 'LongW_South',
       'LongW_West', 'RadT_Top', 'RadT_North', 'RadT_East', 'RadT_South',
       'RadT_West', 'Shade_Top', 'Shade_North', 'Shade_East', 'Shade_South',
       'Shade_West', 'ProductW_Top', 'ProductW_North', 'ProductW_East',
       'ProductW_South', '

In [16]:
# Load the per-building DataFrames from the pickle file.
# This rebinds `PKL_PATH` and `dfs`, replacing the values set by the previous cell.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
PKL_PATH = "../data/dataset3/dfs_radx.pkl"
dfs = pd.read_pickle(PKL_PATH)

# Save every building's dataframe as a single CSV (no scenario split).
# NOTE: this definition replaces the earlier save_dfs_as_csv of the same name.
def save_dfs_as_csv(dfs, base_path="../data/dataset3"):
    """Write each building's DataFrame to ``<base_path>/<building>.csv``.

    Spaces in the building name are replaced with underscores, the index is
    written as a regular column, and every column containing 'RadX' is renamed
    to use 'ProductW' instead.

    Args:
        dfs: Mapping of building name -> DataFrame.
        base_path: Output directory; created if it does not exist.

    The DataFrames held in ``dfs`` are not modified.
    """
    # An empty base_path means "current directory", which needs no creation.
    if base_path:
        os.makedirs(base_path, exist_ok=True)

    for building_name, building_df in dfs.items():
        building_name = building_name.replace(" ", "_")

        # reset_index() returns a new frame, so the caller's DataFrame keeps
        # its index and re-running the export yields the same columns.
        exported = building_df.reset_index().rename(
            columns=lambda col: col.replace('RadX', 'ProductW')
        )

        file_path = os.path.join(base_path, f"{building_name}.csv")
        exported.to_csv(file_path, index=False)

# Write one CSV per building, using the save_dfs_as_csv defined in this cell
save_dfs_as_csv(dfs)

Index(['Date_Time', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Building', 'KW',
       'KW/SQM', 'CHWTON', 'CHWTON/SQM', 'HTmmBTU', 'HTmmBTU/SQM', 'AirT_Top',
       'AirT_North', 'AirT_East', 'AirT_South', 'AirT_West', 'AirT_Mean',
       'RelH_Top', 'RelH_North', 'RelH_East', 'RelH_South', 'RelH_West',
       'RelH_Mean', 'AbsH_Top', 'AbsH_North', 'AbsH_East', 'AbsH_South',
       'AbsH_West', 'AbsH_Mean', 'Wind_Top', 'Wind_North', 'Wind_East',
       'Wind_South', 'Wind_West', 'Wind_Mean', 'AirP_Top', 'AirP_North',
       'AirP_East', 'AirP_South', 'AirP_West', 'ShortW_Top', 'ShortW_North',
       'ShortW_East', 'ShortW_South', 'ShortW_West', 'LongW_Top',
       'LongW_North', 'LongW_East', 'LongW_South', 'LongW_West', 'RadT_Top',
       'RadT_North', 'RadT_East', 'RadT_South', 'RadT_West', 'Shade_Top',
       'Shade_North', 'Shade_East', 'Shade_South', 'Shade_West',
       'ProductW_Top', 'ProductW_North', 'ProductW_East', 'ProductW_South',
       'ProductW_West'],
      dtype='obj

In [None]:
def load_or_train_selector(load_selector: bool, 
                 selector: Any,
                 selector_file_path: str, 
                 X: pd.DataFrame, 
                 y: pd.DataFrame, 
                ) -> Any:
    """Return a fitted selector/model, loading it from disk when possible.

    Args:
        load_selector: If True, try to load a previously saved object from
            ``selector_file_path``. If False, always fit and overwrite it.
        selector: Unfitted object exposing ``fit(X, y)``; used only when
            training is needed.
        selector_file_path: Location of the joblib file to load from / save to.
        X: Feature matrix passed to ``fit``.
        y: Target passed to ``fit`` (callers in this notebook pass a Series).

    Returns:
        The loaded object, or the result of ``selector.fit(X, y)``.

    When ``load_selector`` is True but no file exists yet, the selector is
    trained and saved instead of raising ``FileNotFoundError``, so a first run
    on a clean checkout works.
    """
    if load_selector:
        if os.path.exists(selector_file_path):
            # NOTE(review): joblib files are pickle-based - only load files
            # produced by a trusted run of this notebook.
            return load(selector_file_path)
        print(f"No saved selector at {selector_file_path}; training a new one.")

    # Train, then persist so later runs can load the result.
    fitted = selector.fit(X, y)
    target_dir = os.path.dirname(selector_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    dump(fitted, selector_file_path)

    return fitted


def _print_kept_features(heading: str, X: pd.DataFrame, selector: Any) -> None:
    """Print ``heading`` followed by every column of X not listed in ``selector.features_to_drop_``."""
    dropped = set(selector.features_to_drop_)  # set lookup instead of scanning a list per column
    print(heading)
    for feature in X.columns:
        if feature not in dropped:
            print(f"{feature}")


def print_result(X: pd.DataFrame, 
                 y: pd.DataFrame, 
                 RFECV_selector: RFECV, 
                 shuffle_selector: SelectByShuffling,
                 single_selector: SelectBySingleFeaturePerformance,
                 model: RandomForestRegressor, N=8,
                 importance_threshold: float = 0.01) -> None:
    """
    Print Important Features result from:
    1) Selector RFECV
    2) Sklearn feature importances
    3) Permutation importance
    4) Mutual Info Regression
    5) Feature Engine's Select by Shuffling
    6) Feature Engine's Select by Single Feature Performance

    Args:
        X: Feature matrix; its columns supply the feature names for every section.
        y: Target values (callers in this notebook pass a Series).
        RFECV_selector: Fitted RFECV; ``n_features_`` and ``ranking_`` are read.
        shuffle_selector: Fitted selector; ``features_to_drop_`` is read.
        single_selector: Fitted selector; ``features_to_drop_`` is read.
        model: Fitted model; ``feature_importances_`` is read and the model is
            passed to ``permutation_importance``.
        N: Number of top features shown in sections 3 and 4.
        importance_threshold: Minimum ``feature_importances_`` value for a
            feature to be listed in section 2.

    Sections 3 and 4 are computed here on the given X and y; both use a fixed
    random_state so repeated calls print the same numbers.
    """
    # Pad to the longest actual feature name so the value column lines up.
    name_width = max((len(str(col)) for col in X.columns), default=0) + 2

    # 1. RFE Result
    print(f"1a) Optimal number of features by RFE: {RFECV_selector.n_features_}")

    # Rankings indexed by feature name, best (lowest) rank first
    rfe_ranking = pd.Series(RFECV_selector.ranking_, index=X.columns).sort_values()
    print("\n1b) Feature Ranking RFE:")
    for feature, rank in rfe_ranking.items():
        print(f"{feature:{name_width}} {rank}")

    # 2. Select features based on importance threshold
    print("\n2) Selected features by feature_importances:")
    model_importances = pd.Series(model.feature_importances_, index=X.columns)
    above_threshold = model_importances[model_importances > importance_threshold]
    for feature, importance in above_threshold.sort_values(ascending=False).items():
        print(f"{feature:{name_width}} {importance:.4f}")

    # 3. Permutation.
    print(f"\n3) Top {N} Features by Permutation Importance:")
    result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
    perm_sorted_idx = result.importances_mean.argsort()[::-1]  # descending order of importance
    for idx in perm_sorted_idx[:N]:
        print(f"{X.columns[idx]:{name_width}} {result.importances_mean[idx]:.4f}")

    # 4. Mutual Info Regression (seeded: the estimator is randomised).
    print(f"\n4) Top {N} Features by Mutual Info Regression:")
    mutual_info = mutual_info_regression(X, y, random_state=0)
    mi_scores = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
    print(mi_scores[:N])

    # 5. Shuffle Selector.
    _print_kept_features("\n5) Good Features by Shuffle Selector:", X, shuffle_selector)

    # 6. Single Feature Selector.
    _print_kept_features("\n6) Good Features by Single Feature Performance Selector:", X, single_selector)



# 1. Feature Selection: Recursive Feature Elimination (RFE), Feature Importance, Permutation Importance, Mutual Information, and Feature-engine Selectors

## 1.1 Separate Buildings (one set of selectors per building)

In [None]:
# Separate: fit (or load cached) selectors for each building and print the report.

# Make sure the cache directory exists before any selector is saved.
os.makedirs(config.SELECTORS_DIR_PATH, exist_ok=True)

# The splitter does not depend on the building, so build it once.
cv = KFold(n_splits=config.CV)

for bldgname in config.BLDGNAMES:
    df = pd.read_csv(f"{config.BASE_PATH}/{bldgname}.csv").set_index('Date_Time')

    # Target is CHWTON/SQM; drop both CHWTON columns and the building label from the features.
    X = df.drop(['CHWTON','CHWTON/SQM', 'Building'], axis=1)
    y = df['CHWTON/SQM']

    # 1. Load Selectors and Model or train them first.
    # Each object is loaded only when its joblib file already exists; otherwise
    # it is trained and saved, so a first run on a clean checkout does not fail.
    print(f"\n=======Result for {bldgname}=======")
    RFECV_selector = RFECV(config.rf_base, step=1, cv=cv, scoring='neg_mean_squared_error')
    RFECV_file_path = f'{config.SELECTORS_DIR_PATH}/RFECV_{bldgname}.joblib'
    RFECV_selector = load_or_train_selector(
        os.path.exists(RFECV_file_path), RFECV_selector, RFECV_file_path, X, y)

    shuffle_selector = SelectByShuffling(estimator=config.rf_base, scoring='neg_mean_squared_error', cv=cv, random_state=42)
    shuffle_file_path = f'{config.SELECTORS_DIR_PATH}/shuffle_{bldgname}.joblib'
    shuffle_selector = load_or_train_selector(
        os.path.exists(shuffle_file_path), shuffle_selector, shuffle_file_path, X, y)

    single_selector = SelectBySingleFeaturePerformance(estimator=config.rf_base, scoring="r2", cv=cv, threshold=0.01)
    single_file_path = f'{config.SELECTORS_DIR_PATH}/single_{bldgname}.joblib'
    single_selector = load_or_train_selector(
        os.path.exists(single_file_path), single_selector, single_file_path, X, y)

    # NOTE(review): when no cached model exists this fits config.rf_base itself,
    # i.e. the shared estimator object from the config module - confirm that is intended.
    model_file_path = f'{config.SELECTORS_DIR_PATH}/model_{bldgname}.joblib'
    model = load_or_train_selector(
        os.path.exists(model_file_path), config.rf_base, model_file_path, X, y)

    print_result(X, y, RFECV_selector, shuffle_selector, single_selector, model)
    

## 1.2 Combined (all buildings in one dataset)

In [None]:
# Build one dataset from all buildings, then run the same selector/report pipeline on it.

# Tag every building's rows with its name so the source survives the concat.
dataframes = [
    pd.read_csv(f"{config.BASE_PATH}/{bldgname}.csv").assign(Building=bldgname)
    for bldgname in config.BLDGNAMES
]

# Combine all building data into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Apply one-hot encoding to the 'Building' column
combined_df = pd.get_dummies(combined_df, columns=['Building'])

# Save the combined DataFrame to a CSV file
COMBINED = "combined"
combined_csv = f"{config.BASE_PATH}/{COMBINED}.csv"
combined_df.to_csv(combined_csv, index=False)

# Read the file back so the features are exactly what is stored on disk.
df = pd.read_csv(combined_csv).set_index('Date_Time')
X = df.drop(['CHWTON','CHWTON/SQM'], axis=1)
y = df['CHWTON/SQM']

# Define the splitter here instead of relying on `cv` left over from the
# per-building cell, so this cell also runs on its own after a kernel restart.
cv = KFold(n_splits=config.CV)

# Make sure the cache directory exists before any selector is saved.
os.makedirs(config.SELECTORS_DIR_PATH, exist_ok=True)

# Each object is loaded only when its joblib file already exists; otherwise it
# is trained and saved.
RFECV_file_path = f'{config.SELECTORS_DIR_PATH}/RFECV_{COMBINED}.joblib'
RFECV_selector = RFECV(config.rf_base, step=1, cv=cv, scoring='neg_mean_squared_error')
RFECV_selector = load_or_train_selector(
    os.path.exists(RFECV_file_path), RFECV_selector, RFECV_file_path, X, y)

shuffle_file_path = f'{config.SELECTORS_DIR_PATH}/shuffle_{COMBINED}.joblib'
shuffle_selector = SelectByShuffling(estimator=config.rf_base, scoring='neg_mean_squared_error', cv=cv, random_state=42)
shuffle_selector = load_or_train_selector(
    os.path.exists(shuffle_file_path), shuffle_selector, shuffle_file_path, X, y)

single_selector = SelectBySingleFeaturePerformance(estimator=config.rf_base, scoring="r2", cv=cv, threshold=0.01)
single_file_path = f'{config.SELECTORS_DIR_PATH}/single_{COMBINED}.joblib'
single_selector = load_or_train_selector(
    os.path.exists(single_file_path), single_selector, single_file_path, X, y)

model_file_path = f'{config.SELECTORS_DIR_PATH}/model_{COMBINED}.joblib'
model = load_or_train_selector(
    os.path.exists(model_file_path), config.rf_base, model_file_path, X, y)

print_result(X, y, 
             RFECV_selector, 
             shuffle_selector, 
             single_selector, 
             model)