# Import and Load

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from functions import print_bold, create_pkl, load_pkl, load_latest_pkl

In [41]:
pkl_file = 'dfs'

## Load latest file for pkl_file
dfs = load_latest_pkl(pkl_file)
PSYCHOLOGY = 0
PSYCHOLOGY_NORTH = 1
ISTB_4 = 2

basePsy = pd.DataFrame(dfs['Scenarios'][ISTB_4])
basePsy
# Iterate over each group
for scenario, group in basePsy.groupby('Scenario'):
    # Format the directory name (lowercase and replace spaces with underscores for consistency)
    dir_name = f'../data/dataset3/{scenario.replace(" ", "_").lower()}'
    
    # Create the directory if it doesn't exist
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        
    # Define the file path
    file_path = os.path.join(dir_name, 'Istb_4.csv')
    
    # Save the group to a CSV file
    group.to_csv(file_path, index=False)  # Set 'index=False' if you don't want to save the index

    
## Define variables
# base,scen='Baseline','Scenarios'
# bld_list = ['Psychology', 'Psychology North', 'ISTB 4']



Latest pickle file (dfs.pkl) loaded successfully.


    Example Use of dfs:

    dfs['Baseline'][bld_list.index('Psychology')]
    dfs['Scenarios'][bld_list.index('Psychology North')]

# Functions and How to Use

**How to select a combination of variables for selected facades**  

Define a list then use add_lists() function, which takes in variable lists below 

    Example:  
      
    X_vars = [KW, AirT, AirP, RelH]
    fac = ['Top', 'North', 'East']
    X_vars = vars_facades(X_vars,fac,var_facades)
    

In [3]:
KWm2   = 'KW/SQM'
KW     = 'KW'
HTm2   = 'HTmmBTU/SQM'
HT     = 'HTmmBTU'
HR     = 'Hour'
AirT   = ['AirT_Top', 'AirT_North', 'AirT_East', 'AirT_South', 'AirT_West', 'AirT_Mean']
AirP   = ['AirP_Top', 'AirP_North', 'AirP_East', 'AirP_South', 'AirP_West']
RelH   = ['RelH_Top', 'RelH_North', 'RelH_East', 'RelH_South', 'RelH_West', 'RelH_Mean']
AbsH   = ['AbsH_Top', 'AbsH_North', 'AbsH_East', 'AbsH_South', 'AbsH_West', 'AbsH_Mean']
Wind   = ['Wind_Top', 'Wind_North', 'Wind_East', 'Wind_South', 'Wind_West', 'Wind_Mean']
ShortW = ['ShortW_Top', 'ShortW_North', 'ShortW_East', 'ShortW_South', 'ShortW_West', 'Shade_Top', 'Shade_North', 'Shade_East', 'Shade_South', 'Shade_West']
LongW  = ['LongW_Top', 'LongW_North', 'LongW_East', 'LongW_South', 'LongW_West']
RadT   = ['RadT_Top', 'RadT_North', 'RadT_East', 'RadT_South', 'RadT_West']
all_vars = [KWm2] + [KW] + [HTm2] + [HT] + [HR] + AirT + AirP + RelH + AbsH + Wind + ShortW + LongW + RadT

def var_facades(feature, facades):
    v_fs = []
    for fac in facades:
        v_fs = v_fs + [item for item in feature if item.split('_', 1)[1] == fac]
    return v_fs
def vars_facades(features,facades,var_facades=None):
    vs_fs = []
    if var_facades:
        for feat in features:
            if isinstance(feat, str):
                vs_fs.append(feat)
            elif isinstance(feat, list) and all(isinstance(item, str) for item in feat):
                vs_fs = vs_fs + var_facades(feat, facades)
            else:
                pass      
    return vs_fs

**How to get X and y**  

**Get X and y by giving the X_y_data() function y_var (the y variable) and X_vars (all the x variables)**  

    EXAMPLE:  
    
    y_var='CHWTON'
    X_vars=['Hour','AirT_Top','RelH_Top']
    X_base_dic,y_base_dic,X_scen_dic,y_scen_dic = X_y_data(X_vars, y_var)


**Selecting Baseline by Building Name** : :   X_base_dic['Building Name']

    EXAMPLE:
    
    X_base_dic['ISTB 4']
    y_base_dic['Psychology North']
**Selecting Scenarios by Building Name and Scenarios Name** : :  X_scen_dic['Building Name']['Scenario_Name']  

    EXAMPLE:    
    
    X_scen_dic['Psychology']['Cool_Pavement']
    X_scen_dic['Psychology']['Wall_Shade']



In [4]:
def get_X_y(dfs,X_vars,y_var,building,scen_base,scen=''):
    bld_list = ['Psychology', 'Psychology North', 'ISTB 4']
    idx = bld_list.index(building)
    df = dfs[scen_base][idx]
    if scen_base == 'Baseline':
        X,y = df[X_vars], df[y_var]
    else:
        df = df[df['Scenario'] == scen]
        X,y = df[X_vars], df[y_var]
        
    return X, y
def X_y_data(X_vars, y_vars):
    X_base_dic = {key: None for key in bld_list}
    y_base_dic = {key: None for key in bld_list}
    X_scen_dic = {key: {} for key in bld_list}
    y_scen_dic = {key: {} for key in bld_list}
    for idx,b in enumerate(bld_list):
        X_base_dic[b], y_base_dic[b] = get_X_y(dfs=dfs, 
                                               X_vars=X_vars, 
                                               y_var=y_var, 
                                               building=b, 
                                               scen_base='Baseline')
        for s in dfs[scen][idx]['Scenario'].unique():
            df = dfs[scen][idx]
            df = df[df['Scenario'] == s]
            X_scen_dic[b][s], y_scen_dic[b][s] = get_X_y(dfs=dfs, 
                                               X_vars=X_vars, 
                                               y_var=y_var, 
                                               building=b, 
                                               scen_base='Scenarios', scen=s)
    return (X_base_dic,y_base_dic,X_scen_dic,y_scen_dic)

# Examples, Feature Selection, and Getting X and y

**Get (X, y) data in (X_dic, y_dic) for list of variables for specific facades**  

Variable Options --> KWm2 , KW , HTm2 , HT , HR , AirT , AirP , RelH , AbsH , Wind , ShortW , LongW , RadT  
For All Variable --> all_vars

In [5]:
X_vars = [KW, HR, AirT, AirP, RelH]
fac = ['Top', 'North', 'East']
X_vars = vars_facades(X_vars,fac,var_facades)
y_var='CHWTON'

X_base_dic,y_base_dic,X_scen_dic,y_scen_dic = X_y_data(X_vars, y_var)

## Loop over X and y dictionaries 
for b in bld_list:
    X = X_base_dic[b]
    y = y_base_dic[b]
    # Perform X and y operations below this line
    ## code here ##
    for s in dfs['Scenarios'][0]['Scenario'].unique():
        X_s = X_scen_dic[b][s]
        y_s = y_scen_dic[b][s]
        # Perform X_s and y_s Operations below this line
        ## code here ##

In [6]:
X_base_dic['Psychology']

Unnamed: 0_level_0,KW,Hour,AirT_Top,AirT_North,AirT_East,AirP_Top,AirP_North,AirP_East,RelH_Top,RelH_North,RelH_East
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-05-03 05:00:00,115.33,5,22.753289,21.967301,22.163870,0.000087,0.000558,0.003277,24.575123,26.039355,25.670397
2023-05-03 05:15:00,116.74,5,22.189748,22.031662,22.443350,0.000090,0.000555,0.003285,27.958344,30.043552,27.913478
2023-05-03 05:30:00,118.65,5,21.922418,21.670921,22.125737,0.000090,0.000555,0.003285,27.753713,29.561067,27.690508
2023-05-03 05:45:00,114.99,5,21.797758,21.493868,21.963489,0.000091,0.000555,0.003283,27.896575,29.616969,27.836425
2023-05-03 06:00:00,130.32,6,21.639051,21.292527,21.783249,0.000092,0.000554,0.003282,27.996461,29.664269,27.950468
...,...,...,...,...,...,...,...,...,...,...,...
2023-07-13 22:45:00,121.01,22,39.866070,39.333752,39.643206,0.000079,0.000682,0.004465,16.617879,16.979914,16.872741
2023-07-13 23:00:00,124.69,23,39.795682,39.257100,39.569846,0.000080,0.000682,0.004452,16.610960,16.972853,16.870894
2023-07-13 23:15:00,125.98,23,39.671685,39.125970,39.452524,0.000080,0.000682,0.004440,16.863582,17.251447,17.096013
2023-07-13 23:30:00,122.12,23,39.481918,38.925144,39.271220,0.000081,0.000682,0.004428,17.417485,17.879763,17.631672


In [7]:
X_scen_dic['Psychology']['Trees_Light']

Unnamed: 0_level_0,KW,Hour,AirT_Top,AirT_North,AirT_East,AirP_Top,AirP_North,AirP_East,RelH_Top,RelH_North,RelH_East
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-07-07 05:00:00,120.06,5,33.192825,31.752097,32.200205,0.000072,0.000420,0.002837,16.547022,18.744312,18.030808
2023-07-07 05:15:00,122.69,5,31.536327,30.429439,31.106952,0.000074,0.000415,0.002714,19.530020,22.060448,20.256823
2023-07-07 05:30:00,129.83,5,31.384425,30.225176,30.945300,0.000075,0.000414,0.002696,19.229157,21.555420,19.904036
2023-07-07 05:45:00,119.98,5,31.265410,30.067066,30.822145,0.000075,0.000414,0.002680,19.115522,21.368934,19.766133
2023-07-07 06:00:00,133.21,6,31.142929,29.939922,30.712110,0.000075,0.000413,0.002665,18.997878,21.179887,19.626329
...,...,...,...,...,...,...,...,...,...,...,...
2023-07-07 22:30:00,124.97,22,37.696020,36.644255,37.174315,0.000033,0.000296,0.002086,10.717249,11.505262,11.073083
2023-07-07 22:45:00,122.86,22,37.529240,36.458664,37.005471,0.000034,0.000296,0.002075,10.673709,11.457724,11.036489
2023-07-07 23:00:00,123.10,23,37.365820,36.283051,36.842097,0.000034,0.000295,0.002065,10.609534,11.384406,10.977022
2023-07-07 23:15:00,119.67,23,37.218550,36.117061,36.688579,0.000034,0.000295,0.002055,10.576805,11.348026,10.942418


# Choosing the Right Method:

**Problem Specificity:** The best feature selection method depends on your specific problem, the nature of your data, and the type of model you're building. For predictive modeling where accuracy is paramount, and multicollinearity is not a concern, model-based methods like feature importances or permutation importance can be very effective.

**Exploration:** Often, the most successful approach involves trying multiple feature selection methods and comparing their impact on model performance. This exploratory phase can also offer insights into the data and how different features influence the prediction.

**Computational Resources:** Some methods, especially wrapper methods like RFE, can be computationally intensive. Consider your available computational resources when choosing an approach.

Given the variety of methods at your disposal, starting with the ones you mentioned should be adequate for most models. However, incorporating additional techniques like mutual information or Boruta can further enhance your feature selection process, especially if you seek to understand the underlying data structure or ensure that you're not missing important features that could improve model performance.

# Sklearn Methods

**Implementation Tips:
Cross-Validation: Always use cross-validation to evaluate feature selection methods to ensure that the selected features generalize well to unseen data.
Experiment: There is no one-size-fits-all method for feature selection. It's often beneficial to experiment with multiple approaches and compare their performance on a validation set or via cross-validation.
By focusing on these approaches, you can iteratively refine your feature set to improve model accuracy without worrying about multicollinearity.**

## 1. Wrapper Methods Recursive Feature Elimination (RFE):
This method fits a model and removes the weakest feature (or features) until the specified number of features is reached. With cross-validation (RFECV), it can find the optimal number of features that maximize the model performance.

In [8]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [9]:
dfs['Baseline'][0].columns

Index(['Year', 'Month', 'Day', 'Hour', 'Minute', 'Building', 'KW', 'KW/SQM',
       'CHWTON', 'CHWTON/SQM', 'HTmmBTU', 'HTmmBTU/SQM', 'AirT_Top',
       'AirT_North', 'AirT_East', 'AirT_South', 'AirT_West', 'AirT_Mean',
       'RelH_Top', 'RelH_North', 'RelH_East', 'RelH_South', 'RelH_West',
       'RelH_Mean', 'AbsH_Top', 'AbsH_North', 'AbsH_East', 'AbsH_South',
       'AbsH_West', 'AbsH_Mean', 'Wind_Top', 'Wind_North', 'Wind_East',
       'Wind_South', 'Wind_West', 'Wind_Mean', 'AirP_Top', 'AirP_North',
       'AirP_East', 'AirP_South', 'AirP_West', 'ShortW_Top', 'ShortW_North',
       'ShortW_East', 'ShortW_South', 'ShortW_West', 'LongW_Top',
       'LongW_North', 'LongW_East', 'LongW_South', 'LongW_West', 'RadT_Top',
       'RadT_North', 'RadT_East', 'RadT_South', 'RadT_West', 'Shade_Top',
       'Shade_North', 'Shade_East', 'Shade_South', 'Shade_West'],
      dtype='object')

In [11]:
## Get X and y data for X:[KW, HT, AirT] variables for all facades
add_lists(X_vars, [KW, HT, AirT])
y_var='CHWTON'
X_vars=['Hour','AirT_Top','RelH_Top']
X_base_dic,y_base_dic,X_scen_dic,y_scen_dic = X_y_data(X_vars, y_var)

estimator = RandomForestRegressor()
selector = RFECV(estimator, step=1, cv=KFold(n_splits=5), scoring='neg_mean_squared_error')

## Perform for each building and save results

RFE_dic = {key: None for key in bld_list}
for b in bld_list:
    X = X_base_dic[b]
    y = y_base_dic[b]
    selector = selector.fit(X, y)
    X_selected = selector.transform(X)
    RFE_dic[b] = X_selected
    


NameError: name 'add_lists' is not defined

In [10]:
RFE_dic['Psychology']

NameError: name 'RFE_dic' is not defined

## 2. Embedded Methods
Feature Importance from Tree-based Models: Models like Random Forest, Extra Trees, and Gradient Boosting can inherently provide feature importances based on how each feature contributes to the model.

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X, y)
importances = model.feature_importances_

# Select features based on importance threshold
indices = np.argsort(importances)[::-1]
selected_features = [X.columns[i] for i in indices if importances[i] > threshold]  # Define your threshold


## 3. Model-Agnostic Methods
Permutation Feature Importance: This technique involves randomly shuffling individual features and measuring the change in the model's performance. Features that significantly decrease model performance when shuffled are considered important.

In [None]:
from sklearn.inspection import permutation_importance

result = permutation_importance(model, X, y, n_repeats=10)
perm_sorted_idx = result.importances_mean.argsort()

X_selected = X[X.columns[perm_sorted_idx]]  # Adjust based on importance threshold or top N features


4. Algorithm-Specific Techniques
Lasso Regression (L1 Regularization): For linear models, Lasso can shrink some coefficients to zero, effectively performing feature selection.

In [None]:
from sklearn.linear_model import LassoCV

model = LassoCV(cv=5).fit(X, y)
importance = np.abs(model.coef_)
print(importance)


# High-level Libraries for Feature Selection

**Several Python libraries offer sophisticated tools and methods for feature selection, simplifying the process by providing high-level APIs. These can be particularly useful in automating the selection process, handling multicollinearity, and improving model accuracy. Here are details on three notable libraries:**

## 1. Feature-engine  
Feature-engine is a feature selection library that offers a wide array of techniques, including selection methods based on statistical tests, feature importance, and correlation strategies.

        Installation:
           pip install feature-engine
        
        Key Features:
           DropCorrelatedFeatures: Removes correlated features.
           SelectByShuffling: Evaluates feature importance through shuffling.
           SmartCorrelatedSelection: Selects a representative from a group of correlated features based on performance metrics.

In [None]:
from feature_engine.selection import DropCorrelatedFeatures

# Initialize the selector
sel = DropCorrelatedFeatures(variables=None, method="pearson", threshold=0.8)

# Fit the selector
sel.fit(X_train)

# Transform the data
X_train_selected = sel.transform(X_train)
X_test_selected = sel.transform(X_test)


## 2. BorutaPy  
An all-relevant feature selection method that uses random forests. Boruta tries to find all features carrying information usable for prediction, thus providing a more comprehensive set.

        Installation: 
            pip install Boruta
        
        Key Features:
            Works with any regressor that supports the feature_importances_ or coef_ attribute.
            Employs a statistical test to decide on the importance of features.

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
boruta = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=1)
boruta.fit(X.values, y.values)


## 3. Scikit-learn  
Scikit-learn itself provides a comprehensive suite of feature selection methods, including recursive feature elimination, feature selection based on importance, and univariate statistical tests.

        Key Methods:
            SelectFromModel: Meta-transformer for selecting features based on importance weights.
            RFECV: Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Initialize the model
logistic = LogisticRegression()

# Initialize SelectFromModel
sel = SelectFromModel(logistic)

# Fit the selector
sel.fit(X_train, y_train)

# Transform the dataset
X_train_selected = sel.transform(X_train)
X_test_selected = sel.transform(X_test)


# Additional Techniques Worth Considering

## 1. Mutual Information 
A filter method that measures the dependency between variables. Unlike correlation, mutual information can capture any kind of relationship, not just linear ones.

In [None]:
from sklearn.feature_selection import mutual_info_regression

mutual_info = mutual_info_regression(X, y)


## 2. Sequential Feature Selection
This is another wrapper method that adds or removes features to form a feature subset in a greedy fashion. SequentialFeatureSelector from scikit-learn can be used for both forward selection and backward elimination.

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=10, direction='forward').fit(X, y)
