In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings

# Scikit-learn imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

warnings.filterwarnings('ignore')


def transform_data_with_columntransformer(df: pd.DataFrame):
    """
    This function takes a pandas DataFrame, applies a series of transformations
    using ColumnTransformer, and returns features (X) and target variables (y).
    """

    df['data_received_on'] = pd.to_datetime(df['data_received_on'])
    df['data_received_on_naive'] = df['data_received_on'].dt.tz_localize(None)

    df.sort_values('data_received_on_naive', inplace=True)

    converted_df = df.pivot_table(
        index=['data_received_on_naive', 'site', 'system_type'],
        columns='datapoint',
        values='monitoring_data',
        aggfunc='first'
    )
    converted_df.reset_index(inplace=True)

    numeric_cols = [
        "RA Damper feedback", "SA Pressure setpoint", "OA Humid", "RA Temp",
        "RA CO2", "RA CO2 setpoint", "SA Fan Speed feedback", "SA Fan Speed control",
        "RA Temp control( Valve Feedback)", "SA pressure", "Fan Power meter (KW)",
        "RA damper control", "OA Temp", "OA Flow", "SA temp", "RA  temperature setpoint"
    ]
    present_numeric_cols = [col for col in numeric_cols if col in converted_df.columns]
    converted_df[present_numeric_cols] = converted_df[present_numeric_cols].apply(pd.to_numeric, errors='coerce')

    cols_to_drop = [
        'site', 'system_type', 'Bag filter dirty status', 'Plant enable',
        'Trip status', 'airflow Status', 'auto Status', 'pre Filter dirty staus'
    ]
    converted_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    if "Sup fan cmd" in converted_df.columns:
        mappings = {'active': 1, 'inactive': 0}
        converted_df["Sup fan cmd"] = converted_df["Sup fan cmd"].replace(mappings)

    target_columns = ["RA damper control", "RA Temp control( Valve Feedback)", "SA Fan Speed control", "Fan Power meter (KW)"]
    present_target_cols = [col for col in target_columns if col in converted_df.columns]
    
    converted_df.dropna(subset=present_target_cols, inplace=True)

    y = converted_df[present_target_cols]
    X = converted_df.drop(columns=present_target_cols, errors='ignore')

    converted_df['hour'] = converted_df['data_received_on_naive'].dt.hour
    converted_df['dayofweek'] = converted_df['data_received_on_naive'].dt.dayofweek
    converted_df['month'] = converted_df['data_received_on_naive'].dt.month
    converted_df['dayofyear'] = converted_df['data_received_on_naive'].dt.dayofyear
    X = converted_df.drop(columns=['data_received_on_naive', 'data_received_on'], errors='ignore')
    
    numeric_features = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col]) and col != 'Sup fan cmd']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features)
        ],
        remainder='passthrough'  
    )

    X_transformed = preprocessor.fit_transform(X)


    passthrough_cols = [col for col in X.columns if col not in numeric_features]
    transformed_cols = numeric_features + passthrough_cols
    X = pd.DataFrame(X_transformed, columns=transformed_cols, index=X.index)


    combined = pd.concat([X, y], axis=1)
    combined.dropna(inplace=True)
    X = combined[X.columns]
    y = combined[y.columns]


    return X, y,preprocessor

def load_and_preprocess_data(df : pd.DataFrame = None, file_path: str = 'C:/Users/debas/OneDrive/Desktop/output.csv'):
    """
    Loads data, preprocesses it, and splits it into training and testing sets.
    """
    print("--- Loading and Preprocessing Data ---")
    if not df:
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"Error: '{file_path}' not found. Please ensure the dataset is in the correct directory.")
            exit()
    
    
    print(f"Initial data shape: {df.shape}")    
    df = df[(df['site'] == "Ground Floor") & (df['system_type'] == "AHU")]
    
    X, y, preprocessor = transform_data_with_columntransformer(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    
    print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Testing data shape: X={X_test.shape}, y={y_test.shape}")
    
    return X_train, X_test, y_train, y_test, preprocessor

# --- 2. MODEL TRAINING AND TUNING ---

def tune_sklearn_models(X_train, y_train):
    """
    Defines and tunes scikit-learn models using RandomizedSearchCV.
    """
    print("\n--- Starting Training for Scikit-learn Models ---")
    models_to_tune = {
        # 'RandomForest': {
        #     'estimator': RandomForestRegressor(random_state=42),
        #     'params': {
        #         'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30, None],
        #         'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]
        #     }
        # },
        # 'GradientBoosting': {
        #     'estimator': GradientBoostingRegressor(random_state=42),
        #     'params': {
        #         'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]
        #     }
        # },
        'XGBoost': {
            'estimator': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
            'params': {
                'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7], 'colsample_bytree': [0.7, 0.8, 1.0],
            }
        }
    }

    best_sklearn_models = {}
    for name, config in models_to_tune.items():
        print(f"\nTuning {name}...")
        random_search = RandomizedSearchCV(
            estimator=config['estimator'], param_distributions=config['params'],
            n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1
        )
        search_wrapper = MultiOutputRegressor(random_search)
        search_wrapper.fit(X_train, y_train)
        best_sklearn_models[name] = search_wrapper
        print(f"Finished tuning {name}.")
    
    return best_sklearn_models

def create_keras_model_builder(input_shape, output_shape):
    """
    Factory function to create the Keras model builder with specific input/output shapes.
    """
    def build_model(hp):
        inputs = keras.Input(shape=(input_shape,))
        x = inputs
        for i in range(hp.Int('num_layers', 1, 3)):
            x = layers.Dense(
                units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
                activation=hp.Choice('activation', ['relu', 'tanh'])
            )(x)
            x = layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1))(x)
        outputs = layers.Dense(output_shape)(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            loss="mean_squared_error",
            metrics=["mean_absolute_error"],
        )
        return model
    return build_model

def tune_keras_model(X_train, y_train):
    """
    Tunes and trains a deep learning model using Keras Tuner.
    """
    print("\n--- Starting Training for Deep Learning Model ---")
    model_builder = create_keras_model_builder(X_train.shape[1], y_train.shape[1])
    tuner = kt.RandomSearch(
        model_builder, objective='val_loss', max_trials=10, executions_per_trial=2,
        directory='keras_tuner_dir', project_name='multi_output_regression'
    )
    tuner.search_space_summary()
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    print("\nRunning Keras Tuner search...")
    tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stopping], verbose=1)
    
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    keras_model = tuner.get_best_models(num_models=1)[0]
    print(f"\nOptimal Keras hyperparameters found: Learning Rate={best_hps.get('lr'):.4f}")
    return keras_model

# --- 3. MODEL EVALUATION AND SAVING ---

def evaluate_models(models: dict, X_test, y_test):
    """
    Evaluates a dictionary of trained models on the test set and returns a results DataFrame.
    """
    print("\n--- Evaluating All Models on Test Set ---")
    evaluation_results = {}
    for name, model in models.items():
        predictions = model.predict(X_test)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        evaluation_results[name] = {'MAE': mae, 'MSE': mse, 'R2 Score': r2}
        print(f"\n{name} Metrics: MAE={mae:.4f}, MSE={mse:.4f}, R2 Score={r2:.4f}")
        
    results_df = pd.DataFrame(evaluation_results).T
    print("\n--- Model Comparison ---")
    print(results_df)
    return results_df

def save_best_model(best_model_name, best_model, preprocessor):
    """
    Saves the best performing model and its preprocessor to disk.
    """
    print(f"\nBest performing model is: {best_model_name}")
    if 'Keras' in best_model_name:
        best_model.save('best_model.keras')
        joblib.dump(preprocessor, 'preprocessor.joblib')
        print("Saved Keras model to 'best_model.keras' and preprocessor to 'preprocessor.joblib'")
    else:
        pipeline_to_save = {'preprocessor': preprocessor, 'model': best_model}
        joblib.dump(pipeline_to_save, f'best_model_{best_model_name}.joblib')
        print(f"Saved complete pipeline to 'best_model_{best_model_name}.joblib'")

# --- 4. MAIN ORCHESTRATOR ---

def main(data_file_path: str='', df: pd.DataFrame = None):
    """
    Main function to run the entire training pipeline.
    """
    # Step 1: Load and preprocess data
    X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data(data_file_path,df=df)

    # Step 2: Train and tune models
    best_sklearn_models = tune_sklearn_models(X_train, y_train)
    keras_model = tune_keras_model(X_train, y_train)

    # Step 3: Evaluate models
    all_models = {**best_sklearn_models, 'Keras_Functional_API': keras_model}
    results_df = evaluate_models(all_models, X_test, y_test)

    # Step 4: Save the best model
    best_model_name = results_df['R2 Score'].idxmax()
    best_model = all_models[best_model_name]
    save_best_model(best_model_name, best_model, preprocessor)



In [6]:
df = pd.read_csv('C:/Users/debas/OneDrive/Desktop/output.csv')

In [8]:
# # Define the path to your dataset
# DATA_FILE_PATH = 'C:/Users/debas/OneDrive/Desktop/output.csv'
# main(df=df)

In [9]:
X_train.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'

In [None]:
X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data(df=df,file_path=DATA_FILE_PATH)

--- Loading and Preprocessing Data ---


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [10]:
df

Unnamed: 0,service_id,asset_code,site,system_type,device_id,device_ip,object_name,equipment_name,equipment_id,data_received_on,datapoint,monitoring_data,service_status,subsystem,system_id
0,FIR-FCU-01On/Off Cmd,FIR-FCU-01,First Floor,FCU,170101,192.168.170.101,First Floor\\FCU-1F-02 On/Off Cmd,FCU-1F-02,FCU-1F-02,2025-08-21T08:11:48.133 UTC,On/Off Cmd,active,normal,-,-
1,FIR-FCU-01On/Off Cmd,FIR-FCU-01,First Floor,FCU,170101,192.168.170.101,First Floor\\FCU-1F-02 On/Off Cmd,FCU-1F-02,FCU-1F-02,2025-08-21T08:08:47.990 UTC,On/Off Cmd,active,normal,-,-
2,FIR-FCU-01On/Off Cmd,FIR-FCU-01,First Floor,FCU,170101,192.168.170.101,First Floor\\FCU-1F-02 On/Off Cmd,FCU-1F-02,FCU-1F-02,2025-08-21T08:05:48.796 UTC,On/Off Cmd,active,normal,-,-
3,FIR-FCU-01On/Off Cmd,FIR-FCU-01,First Floor,FCU,170101,192.168.170.101,First Floor\\FCU-1F-02 On/Off Cmd,FCU-1F-02,FCU-1F-02,2025-08-21T08:02:48.708 UTC,On/Off Cmd,active,normal,-,-
4,FIR-FCU-01On/Off Cmd,FIR-FCU-01,First Floor,FCU,170101,192.168.170.101,First Floor\\FCU-1F-02 On/Off Cmd,FCU-1F-02,FCU-1F-02,2025-08-21T07:59:48.964 UTC,On/Off Cmd,active,normal,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14428292,GRO-FCU-05Ret temp,GRO-FCU-05,Ground Floor,FCU,170101,192.168.170.101,Ground Floor\\FCU\\FCU-GF-05 Ret temp,FCU-GF-05,FCU-GF-05,2025-05-23T12:57:34.673 UTC,Ret temp,41.72909164428711,{},-,-
14428293,GRO-FCU-05Ret temp,GRO-FCU-05,Ground Floor,FCU,170101,192.168.170.101,Ground Floor\\FCU\\FCU-GF-05 Ret temp,FCU-GF-05,FCU-GF-05,2025-05-23T12:54:28.494 UTC,Ret temp,40.109561920166016,{},-,-
14428294,ROO-CHI-63Secondary pump,ROO-CHI-63,Rooftop,Chiller system,170101,192.168.170.101,Rooftop\\Chiller system\\Pumps\\Secondary pump...,Pumps,Pumps,2025-05-23T13:03:40.368 UTC,Secondary pump,inactive,{},-,-
14428295,ROO-CHI-63Secondary pump,ROO-CHI-63,Rooftop,Chiller system,170101,192.168.170.101,Rooftop\\Chiller system\\Pumps\\Secondary pump...,Pumps,Pumps,2025-05-23T13:00:38.727 UTC,Secondary pump,inactive,{},-,-


In [None]:
X_train[0]

array([ 0.87157246, 10.01367664,  0.        ,  0.0954916 ,  0.0954916 ,
        0.        ,  0.75      ,  1.        ,  0.        ,  0.16543484,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.09090909,  0.75      ,  0.1654348 ,
        0.56521739,  0.83333333,  0.        ,  0.08888889,  0.        ])

In [None]:
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

In [None]:
params= {
                'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7], 'colsample_bytree': [0.7, 0.8, 1.0],
            }
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42,**params)
model.fit(X_train, y_train)

TypeError: 'list' object cannot be interpreted as an integer

In [11]:
def load_and_preprocess_data(df : pd.DataFrame = None, file_path: str = 'C:/Users/debas/OneDrive/Desktop/output.csv'):
    """
    Loads data, preprocesses it, and splits it into training and testing sets.
    """
    print("--- Loading and Preprocessing Data ---")
    if df is None:
        if file_path is None:
            raise ValueError("Either 'df' or 'file_path' must be provided.")
        df = pd.read_csv(file_path)
    
    
    print(f"Initial data shape: {df.shape}")    
    df = df[(df['site'] == "Ground Floor") & (df['system_type'] == "AHU")]
    
    X, y, preprocessor = transform_data_with_columntransformer(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    
    print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Testing data shape: X={X_test.shape}, y={y_test.shape}")
    
    return X_train, X_test, y_train, y_test, preprocessor

In [40]:
def transform_data_with_columntransformer(df: pd.DataFrame):
    """
    This function takes a pandas DataFrame, applies a series of transformations
    using ColumnTransformer, and returns features (X) and target variables (y).
    """

    df['data_received_on'] = pd.to_datetime(df['data_received_on'])
    df['data_received_on_naive'] = df['data_received_on'].dt.tz_localize(None)

    df.sort_values('data_received_on_naive', inplace=True)

    converted_df = df.pivot_table(
        index=['data_received_on_naive', 'site', 'system_type'],
        columns='datapoint',
        values='monitoring_data',
        aggfunc='first'
    )
    converted_df.reset_index(inplace=True)

    numeric_cols = [
        "RA Damper feedback", "SA Pressure setpoint", "OA Humid", "RA Temp",
        "RA CO2", "RA CO2 setpoint", "SA Fan Speed feedback", "SA Fan Speed control",
        "RA Temp control( Valve Feedback)", "SA pressure", "Fan Power meter (KW)",
        "RA damper control", "OA Temp", "OA Flow", "SA temp", "RA  temperature setpoint"
    ]
    present_numeric_cols = [col for col in numeric_cols if col in converted_df.columns]
    converted_df[present_numeric_cols] = converted_df[present_numeric_cols].apply(pd.to_numeric, errors='coerce')

    cols_to_drop = [
        'site', 'system_type', 'Bag filter dirty status', 'Plant enable',
        'Trip status', 'airflow Status', 'auto Status', 'pre Filter dirty staus'
    ]
    converted_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    if "Sup fan cmd" in converted_df.columns:
        mappings = {'active': 1, 'inactive': 0}
        converted_df["Sup fan cmd"] = converted_df["Sup fan cmd"].replace(mappings)

    target_columns = ["RA damper control", "RA Temp control( Valve Feedback)", "SA Fan Speed control", "Fan Power meter (KW)"]
    present_target_cols = [col for col in target_columns if col in converted_df.columns]
    
    converted_df.dropna(subset=present_target_cols, inplace=True)
    
    y = converted_df[present_target_cols]
    #X = converted_df.drop(columns=present_target_cols, errors='ignore')
    
    print(f"y : {present_target_cols}")
    print(f"y shape : {y.shape}")

    converted_df['hour'] = converted_df['data_received_on_naive'].dt.hour
    converted_df['dayofweek'] = converted_df['data_received_on_naive'].dt.dayofweek
    converted_df['month'] = converted_df['data_received_on_naive'].dt.month
    converted_df['dayofyear'] = converted_df['data_received_on_naive'].dt.dayofyear
    X = converted_df.drop(columns=['data_received_on_naive', 'data_received_on']+present_target_cols, errors='ignore')
    
    numeric_features = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col]) and col != 'Sup fan cmd']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features)
        ],
        remainder='passthrough'  
    )

    X_transformed = preprocessor.fit_transform(X)


    passthrough_cols = [col for col in X.columns if col not in numeric_features]
    transformed_cols = numeric_features + passthrough_cols
    X = pd.DataFrame(X_transformed, columns=transformed_cols, index=X.index)


    combined = pd.concat([X, y], axis=1)
    combined.dropna(inplace=True)
    X = combined[X.columns]
    
    print(f"y cols : {y.columns}")
    print(f"combined cols : {combined.columns}")
    y = combined[y.columns]

    
    return X, y,preprocessor

In [13]:
def transform_dataframe(df: pd.DataFrame):
    """
    Helper function to apply the core data transformation logic.
    """
    print("Starting data transformation...")
    # 1. Handle timestamps
    df['data_received_on'] = pd.to_datetime(df['data_received_on'])
    df['data_received_on_naive'] = df['data_received_on'].dt.tz_localize(None)
    df.sort_values('data_received_on_naive', inplace=True)

    # 2. Pivot the table
    converted_df = df.pivot_table(
        index=['data_received_on_naive'],
        columns='datapoint',
        values='monitoring_data',
        aggfunc='first'
    )
    converted_df.reset_index(inplace=True)

    # 3. Convert data types
    numeric_cols = [
        "RA Damper feedback", "SA Pressure setpoint", "OA Humid", "RA Temp",
        "RA CO2", "RA CO2 setpoint", "SA Fan Speed feedback", "SA Fan Speed control",
        "RA Temp control( Valve Feedback)", "SA pressure", "Fan Power meter (KW)",
        "RA damper control", "OA Temp", "OA Flow", "SA temp", "RA  temperature setpoint"
    ]
    present_numeric_cols = [col for col in numeric_cols if col in converted_df.columns]
    converted_df[present_numeric_cols] = converted_df[present_numeric_cols].apply(pd.to_numeric, errors='coerce')

    # 4. Map categorical features and handle missing values
    if "Sup fan cmd" in converted_df.columns:
        mappings = {'active': 1, 'inactive': 0}
        converted_df["Sup fan cmd"] = converted_df["Sup fan cmd"].replace(mappings).fillna(0)



    # 5. Engineer time-based features from the timestamp
    print("Engineering time-based features...")
    converted_df['hour'] = converted_df['data_received_on_naive'].dt.hour
    converted_df['dayofweek'] = converted_df['data_received_on_naive'].dt.dayofweek
    converted_df['month'] = converted_df['data_received_on_naive'].dt.month
    converted_df['dayofyear'] = converted_df['data_received_on_naive'].dt.dayofyear

    # 6. Define target columns and clean data
    target_columns = ["RA damper control", "RA Temp control( Valve Feedback)", "SA Fan Speed control", "Fan Power meter (KW)"]
    present_target_cols = [col for col in target_columns if col in converted_df.columns]

    # Drop rows where targets are missing, then fill remaining NaNs
    converted_df.dropna(subset=present_target_cols, inplace=True)
    converted_df.fillna(method='ffill', inplace=True)
    converted_df.fillna(method='bfill', inplace=True)

    # 7. Create final X and y AFTER all engineering and cleaning
    y = converted_df[present_target_cols]
    # Drop original timestamp and targets to create a clean feature set
    X = converted_df.drop(columns=present_target_cols + ['data_received_on_naive'], errors='ignore')

    # 8. Create ColumnTransformer for scaling
    encoded_categorical_features = []
    if 'Sup fan cmd' in X.columns:
        encoded_categorical_features.append('Sup fan cmd')

    numeric_features = [
        col for col in X.columns 
        if pd.api.types.is_numeric_dtype(X[col]) and col not in encoded_categorical_features
    ]
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features),
            ('cat', 'passthrough', encoded_categorical_features)
        ],
        remainder='drop'
    )

    X_transformed = preprocessor.fit_transform(X)
    
    transformed_cols = numeric_features + encoded_categorical_features
    X = pd.DataFrame(X_transformed, columns=transformed_cols, index=X.index)

    print("Data transformation complete.")
    # The final alignment/dropna step is no longer needed because of the improved logic
    return X, y, preprocessor

In [14]:
df = df[(df['site'] == "Ground Floor") & (df['system_type'] == "AHU")]
    
#

In [15]:
X,y,preprocessor = transform_dataframe(df)

Starting data transformation...
Engineering time-based features...
Data transformation complete.


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
y_train

datapoint,RA damper control,RA Temp control( Valve Feedback),SA Fan Speed control,Fan Power meter (KW)
374,0.0,0.000000,0.0,9.303454
3643,0.0,0.000000,0.0,10.013677
29561,10.0,94.150986,100.0,7.792364
27503,10.0,95.241127,100.0,7.203832
25909,10.0,95.429337,100.0,10.152745
...,...,...,...,...
16852,10.0,7.170233,100.0,6.345133
6265,0.0,0.000000,0.0,9.500352
11284,0.0,0.000000,0.0,6.345133
860,0.0,0.000000,0.0,7.032096


In [None]:
X_train, X_test, y_train, y_test, preprocessor =load_and_preprocess_data(df=df,file_path=DATA_FILE_PATH)

--- Loading and Preprocessing Data ---
Initial data shape: (14428297, 15)
Training data shape: X=(28856, 25), y=(28856, 8)
Testing data shape: X=(7215, 25), y=(7215, 8)


In [None]:
y_train[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.87157246, 10.01367664])

In [21]:
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

multi_output_model = MultiOutputRegressor(estimator=xgb_regressor, n_jobs=-1)

print("\nTraining the MultiOutputRegressor with XGBoost...")
multi_output_model.fit(X_train, y_train)
print("Training complete.")


Training the MultiOutputRegressor with XGBoost...
Training complete.


In [23]:
y_pred = multi_output_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error on the test set: {mse:.4f}")


Mean Squared Error on the test set: 1.4658


In [24]:
print(f"\nNumber of individual estimators (models) trained: {len(multi_output_model.estimators_)}")
print(f"Shape of predictions: {y_pred.shape}")


Number of individual estimators (models) trained: 4
Shape of predictions: (7215, 4)


In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

warnings.filterwarnings('ignore')


def transform_data_with_columntransformer_old(df: pd.DataFrame):
    

    """
    This function takes a pandas DataFrame, applies a series of transformations
    using ColumnTransformer, and returns features (X) and target variables (y).
    """

    df['data_received_on'] = pd.to_datetime(df['data_received_on'])
    df['data_received_on_naive'] = df['data_received_on'].dt.tz_localize(None)

    df.sort_values('data_received_on_naive', inplace=True)

    converted_df = df.pivot_table(
        index=['data_received_on_naive', 'site', 'system_type'],
        columns='datapoint',
        values='monitoring_data',
        aggfunc='first'
    )
    converted_df.reset_index(inplace=True)

    numeric_cols = [
        "RA Damper feedback", "SA Pressure setpoint", "OA Humid", "RA Temp",
        "RA CO2", "RA CO2 setpoint", "SA Fan Speed feedback", "SA Fan Speed control",
        "RA Temp control( Valve Feedback)", "SA pressure", "Fan Power meter (KW)",
        "RA damper control", "OA Temp", "OA Flow", "SA temp", "RA  temperature setpoint"
    ]
    present_numeric_cols = [col for col in numeric_cols if col in converted_df.columns]
    converted_df[present_numeric_cols] = converted_df[present_numeric_cols].apply(pd.to_numeric, errors='coerce')

    cols_to_drop = [
        'site', 'system_type', 'Bag filter dirty status', 'Plant enable',
        'Trip status', 'airflow Status', 'auto Status', 'pre Filter dirty staus'
    ]
    converted_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    if "Sup fan cmd" in converted_df.columns:
        mappings = {'active': 1, 'inactive': 0}
        converted_df["Sup fan cmd"] = converted_df["Sup fan cmd"].replace(mappings)

    target_columns = ["RA damper control", "RA Temp control( Valve Feedback)", "SA Fan Speed control", "Fan Power meter (KW)"]
    present_target_cols = [col for col in target_columns if col in converted_df.columns]
    
    converted_df.dropna(subset=present_target_cols, inplace=True)

    y = converted_df[present_target_cols]
    X = converted_df.drop(columns=present_target_cols, errors='ignore')

    converted_df['hour'] = converted_df['data_received_on_naive'].dt.hour
    converted_df['dayofweek'] = converted_df['data_received_on_naive'].dt.dayofweek
    converted_df['month'] = converted_df['data_received_on_naive'].dt.month
    converted_df['dayofyear'] = converted_df['data_received_on_naive'].dt.dayofyear
    X = converted_df.drop(columns=['data_received_on_naive', 'data_received_on'], errors='ignore')
    
    numeric_features = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col]) and col != 'Sup fan cmd']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features)
        ],
        remainder='passthrough'  
    )

    X_transformed = preprocessor.fit_transform(X)


    passthrough_cols = [col for col in X.columns if col not in numeric_features]
    transformed_cols = numeric_features + passthrough_cols
    X = pd.DataFrame(X_transformed, columns=transformed_cols, index=X.index)


    combined = pd.concat([X, y], axis=1)
    combined.dropna(inplace=True)
    X = combined[X.columns]
    y = combined[y.columns]


    return X, y,preprocessor

def transform_dataframe(df: pd.DataFrame):
    print(f"{2:-^50}")
    """
    Helper function to apply the core data transformation logic.
    """
    print("Starting data transformation...")
    df['data_received_on'] = pd.to_datetime(df['data_received_on'])
    df['data_received_on_naive'] = df['data_received_on'].dt.tz_localize(None)
    df.sort_values('data_received_on_naive', inplace=True)

    converted_df = df.pivot_table(
        index=['data_received_on_naive'],
        columns='datapoint',
        values='monitoring_data',
        aggfunc='first'
    )
    converted_df.reset_index(inplace=True)

    numeric_cols = [
        "RA Damper feedback", "SA Pressure setpoint", "OA Humid", "RA Temp",
        "RA CO2", "RA CO2 setpoint", "SA Fan Speed feedback", "SA Fan Speed control",
        "RA Temp control( Valve Feedback)", "SA pressure", "Fan Power meter (KW)",
        "RA damper control", "OA Temp", "OA Flow", "SA temp", "RA  temperature setpoint"
    ]
    present_numeric_cols = [col for col in numeric_cols if col in converted_df.columns]
    converted_df[present_numeric_cols] = converted_df[present_numeric_cols].apply(pd.to_numeric, errors='coerce')

    if "Sup fan cmd" in converted_df.columns:
        mappings = {'active': 1, 'inactive': 0}
        converted_df["Sup fan cmd"] = converted_df["Sup fan cmd"].replace(mappings).fillna(0)



    print("Engineering time-based features...")
    converted_df['hour'] = converted_df['data_received_on_naive'].dt.hour
    converted_df['dayofweek'] = converted_df['data_received_on_naive'].dt.dayofweek
    converted_df['month'] = converted_df['data_received_on_naive'].dt.month
    converted_df['dayofyear'] = converted_df['data_received_on_naive'].dt.dayofyear

    target_columns = ["RA damper control", "RA Temp control( Valve Feedback)", "SA Fan Speed control", "Fan Power meter (KW)"]
    present_target_cols = [col for col in target_columns if col in converted_df.columns]

    converted_df.dropna(subset=present_target_cols, inplace=True)
    converted_df.fillna(method='ffill', inplace=True)
    converted_df.fillna(method='bfill', inplace=True)

    y = converted_df[present_target_cols]
    X = converted_df.drop(columns=present_target_cols + ['data_received_on_naive'], errors='ignore')

    encoded_categorical_features = []
    if 'Sup fan cmd' in X.columns:
        encoded_categorical_features.append('Sup fan cmd')

    numeric_features = [
        col for col in X.columns 
        if pd.api.types.is_numeric_dtype(X[col]) and col not in encoded_categorical_features
    ]
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features),
            ('cat', 'passthrough', encoded_categorical_features)
        ],
        remainder='drop'
    )

    X_transformed = preprocessor.fit_transform(X)
    
    transformed_cols = numeric_features + encoded_categorical_features
    X = pd.DataFrame(X_transformed, columns=transformed_cols, index=X.index)

    print("Data transformation complete.")
    return X, y, preprocessor



def load_and_preprocess_data(df : pd.DataFrame = None, file_path: str = 'C:/Users/debas/OneDrive/Desktop/output.csv'):
    print(f"{1:-^50}")
    """
    Loads data, preprocesses it, and splits it into training and testing sets.
    """
    print("--- Loading and Preprocessing Data ---")
    if df is None:
        if file_path is None:
            raise ValueError("Either 'df' or 'file_path' must be provided.")
        df = pd.read_csv(file_path)
    
    
    print(f"Initial data shape: {df.shape}")    
    df = df[(df['site'] == "Ground Floor") & (df['system_type'] == "AHU")]
    
    X, y, preprocessor = transform_dataframe(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    
    print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Testing data shape: X={X_test.shape}, y={y_test.shape}")
    
    return X_train, X_test, y_train, y_test, preprocessor


def tune_sklearn_models(X_train, y_train):
    print(f"{3:-^50}")
    """
    Defines and tunes scikit-learn models using RandomizedSearchCV.
    """
    print("\n--- Starting Training for Scikit-learn Models ---")
    models_to_tune = {
        'RandomForest': {
            'estimator': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]
            }
        },
        'GradientBoosting': {
            'estimator': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]
            }
        },
        'XGBoost': {
            'estimator': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
            'params': {
                'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7], 'colsample_bytree': [0.7, 0.8, 1.0],
            }
        }
    }

    best_sklearn_models = {}
    for name, config in models_to_tune.items():
        print(f"\nTuning {name}...")
        random_search = RandomizedSearchCV(
            estimator=config['estimator'], param_distributions=config['params'],
            n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1
        )
        search_wrapper = MultiOutputRegressor(random_search)
        search_wrapper.fit(X_train, y_train)
        best_sklearn_models[name] = search_wrapper
        print(f"Finished tuning {name}.")
    
    return best_sklearn_models

def create_keras_model_builder(input_shape, output_shape):
    """
    Factory function to create the Keras model builder with specific input/output shapes.
    """
    def build_model(hp):
        inputs = keras.Input(shape=(input_shape,))
        x = inputs
        for i in range(hp.Int('num_layers', 1, 3)):
            x = layers.Dense(
                units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
                activation=hp.Choice('activation', ['relu', 'tanh'])
            )(x)
            x = layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1))(x)
        outputs = layers.Dense(output_shape)(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            loss="mean_squared_error",
            metrics=["mean_absolute_error","mean_squared_error"],
        )
        return model
    return build_model

def tune_keras_model(X_train, y_train):
    print(f"{4:-^50}")
    """
    Tunes and trains a deep learning model using Keras Tuner.
    """
    print("\n--- Starting Training for Deep Learning Model ---")
    model_builder = create_keras_model_builder(X_train.shape[1], y_train.shape[1])
    tuner = kt.RandomSearch(
        model_builder, objective='val_loss', max_trials=10, executions_per_trial=2,
        directory='keras_tuner_dir', project_name='multi_output_regression'
    )
    tuner.search_space_summary()
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    print("\nRunning Keras Tuner search...")
    tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stopping], verbose=0)
    
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    keras_model = tuner.get_best_models(num_models=1)[0]
    print(f"\nOptimal Keras hyperparameters found: Learning Rate={best_hps.get('lr'):.4f}")
    return keras_model


def evaluate_models(models: dict, X_test, y_test):
    print(f"{5:-^50}")
    """
    Evaluates a dictionary of trained models on the test set and returns a results DataFrame.
    """
    print("\n--- Evaluating All Models on Test Set ---")
    evaluation_results = {}
    for name, model in models.items():
        predictions = model.predict(X_test)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        evaluation_results[name] = {'MAE': mae, 'MSE': mse, 'R2 Score': r2}
        print(f"\n{name} Metrics: MAE={mae:.4f}, MSE={mse:.4f}, R2 Score={r2:.4f}")
        
    results_df = pd.DataFrame(evaluation_results).T
    print("\n--- Model Comparison ---")
    print(results_df)
    return results_df

def save_best_model(best_model_name, best_model, preprocessor):
    print(f"{6:-^50}")
    """
    Saves the best performing model and its preprocessor to disk.
    """
    print(f"\nBest performing model is: {best_model_name}")
    if 'Keras' in best_model_name:
        best_model.save('best_model.keras')
        joblib.dump(preprocessor, 'preprocessor.joblib')
        print("Saved Keras model to 'best_model.keras' and preprocessor to 'preprocessor.joblib'")
    else:
        pipeline_to_save = {'preprocessor': preprocessor, 'model': best_model}
        joblib.dump(pipeline_to_save, f'best_model_{best_model_name}.joblib')
        print(f"Saved complete pipeline to 'best_model_{best_model_name}.joblib'")


def main(df : pd.DataFrame = None, file_path: str = 'C:/Users/debas/OneDrive/Desktop/output.csv'):
    print(f"{0:-^50}")
    """
    Main function to run the entire training pipeline.
    """
    print("Dataframe shape before processing:", df.shape if df is not None else "No dataframe provided")
    X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data(df=df,file_path=file_path)

    print(f"{X_train.shape=}")
    print(f"{y_train.shape=}")
    print(f"{X_test.shape=}")
    print(f"{y_test.shape=}") 
    best_sklearn_models = tune_sklearn_models(X_train, y_train)
    keras_model = tune_keras_model(X_train, y_train)

    all_models = {**best_sklearn_models, 'Keras_Functional_API': keras_model}
    results_df = evaluate_models(all_models, X_test, y_test)

    best_model_name = results_df['R2 Score'].idxmax()
    best_model = all_models[best_model_name]
    save_best_model(best_model_name, best_model, preprocessor)



In [2]:
kwargs = {
    'df': None,
    'file_path': 'C:/Users/debas/OneDrive/Desktop/output.csv'
}
main(**kwargs)

------------------------0-------------------------
Dataframe shape before processing: No dataframe provided
------------------------1-------------------------
--- Loading and Preprocessing Data ---
Initial data shape: (14428297, 15)
------------------------2-------------------------
Starting data transformation...
Engineering time-based features...
Data transformation complete.
Training data shape: X=(28860, 17), y=(28860, 4)
Testing data shape: X=(7215, 17), y=(7215, 4)
X_train.shape=(28860, 17)
y_train.shape=(28860, 4)
X_test.shape=(7215, 17)
y_test.shape=(7215, 4)
------------------------3-------------------------

--- Starting Training for Scikit-learn Models ---

Tuning RandomForest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Finished tuning RandomForest.

Tuning GradientBoosting..

In [2]:
import numpy as np
search_space = {
            'RA  temperature setpoint': np.arange(20.0, 27.5, 0.5),
            'RA CO2 setpoint': np.arange(500.0, 825.0, 25.0),
            'SA Pressure setpoint': np.arange(500.0, 1250.0, 50.0)
        }
total_combos = 1
for feature, values in search_space.items():
    total_combos *= len(values)

In [3]:
total_combos

2925

In [None]:
t