In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw consumption log from Drive, then preview its first rows.
df = pd.read_csv(
    filepath_or_buffer=r'/content/drive/MyDrive/Fac/consumption_data.csv',
)
df.head()

Unnamed: 0,timestamp,totalPower_W,pvPower_W,gridDraw_W,battery_soc,lineA_speed_pct,lineB_speed_pct,lineA_mode,lineB_mode,hvac_mode,lighting_mode,prodA_uph,prodB_uph,anomaly_flag,anomaly_factor
0,2025-11-11T07:02:33.542553Z,13170.65,1144.3,11859.69,0.0,108.0,95.0,RUNNING,RUNNING,RUNNING,RUNNING,108.0,76.0,0,1.0
1,2025-11-11T07:03:33.542553Z,5341.05,1113.04,4228.01,0.0,101.0,66.0,RUNNING,IDLE,OFF,RUNNING,101.0,26.4,0,1.0
2,2025-11-11T07:04:33.542553Z,5121.75,1170.13,3951.62,0.0,98.0,0.0,RUNNING,OFF,OFF,RUNNING,98.0,0.0,0,1.0
3,2025-11-11T07:05:33.542553Z,12697.69,1169.45,11528.24,0.0,104.0,87.0,RUNNING,RUNNING,RUNNING,RUNNING,104.0,69.6,0,1.0
4,2025-11-11T07:06:33.542553Z,1807.09,1149.42,657.67,0.0,0.0,0.0,OFF,OFF,OFF,RUNNING,0.0,0.0,0,1.0


In [None]:
# Print each column's name, dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        10000 non-null  object 
 1   totalPower_W     10000 non-null  float64
 2   pvPower_W        10000 non-null  float64
 3   gridDraw_W       10000 non-null  float64
 4   battery_soc      10000 non-null  float64
 5   lineA_speed_pct  10000 non-null  float64
 6   lineB_speed_pct  10000 non-null  float64
 7   lineA_mode       10000 non-null  object 
 8   lineB_mode       10000 non-null  object 
 9   hvac_mode        10000 non-null  object 
 10  lighting_mode    10000 non-null  object 
 11  prodA_uph        10000 non-null  float64
 12  prodB_uph        10000 non-null  float64
 13  anomaly_flag     10000 non-null  int64  
 14  anomaly_factor   10000 non-null  float64
dtypes: float64(9), int64(1), object(5)
memory usage: 1.1+ MB


In [None]:
!pip install pandas numpy xgboost scikit-learn joblib



In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# ==========================================
# CONFIGURATION
# ==========================================
import os

# CSV that the training routine reads.
DATA_PATH = r'/content/drive/MyDrive/Fac/factory_simulation_data.csv' # Make sure your file is named this
# Folder that receives the trained models, encoders and last-state snapshot.
MODEL_DIR = r'/content/drive/MyDrive/Fac/' # Folder to save models

# exist_ok=True creates the folder only when it is missing, without the
# separate exists() check (which can race with another process creating it).
os.makedirs(MODEL_DIR, exist_ok=True)

def _temporal_split(frame, split_idx):
    """Cut ``frame`` at row position ``split_idx`` into (earlier, later) parts, no shuffling."""
    return frame.iloc[:split_idx], frame.iloc[split_idx:]


def _fit_regressor(X_train, y_train):
    """Fit an XGBoost regressor with the settings shared by the solar and load models."""
    model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=-1)
    model.fit(X_train, y_train)
    return model


def train_factory_digital_twin():
    """Train the solar and consumption models and save them for the chatbot.

    Reads the CSV at ``DATA_PATH``, derives calendar, label-encoded and lag
    features, fits one XGBoost regressor for ``pvPower_W`` and one for
    ``totalPower_W`` on the first 80% of the time-ordered rows, prints
    MAE / R2 measured on the remaining 20%, and writes four joblib files
    into ``MODEL_DIR``: ``solar_model.pkl``, ``consumption_model.pkl``,
    ``label_encoders.pkl`` and ``last_known_state.pkl``.

    Returns:
        None. If the CSV is missing, an error message is printed and the
        function returns without training anything.

    Raises:
        ValueError: if too few rows remain after feature engineering to
            form a non-empty training set.
    """
    print(f"--- 1. Loading Data from {DATA_PATH} ---")
    try:
        df = pd.read_csv(DATA_PATH)
    except FileNotFoundError:
        print("Error: File not found. Please ensure your CSV is named 'factory_simulation_data.csv'")
        return

    # Ensure timestamp is datetime and sorted (Crucial for Time Series):
    # the lag features and the train/test split below both rely on row order.
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp').reset_index(drop=True)

    print(f"Loaded {len(df)} rows. Time range: {df['timestamp'].min()} to {df['timestamp'].max()}")

    # ==========================================
    # 2. FEATURE ENGINEERING
    # ==========================================
    print("--- 2. Preprocessing & Feature Engineering ---")

    # A. Time Features (The heartbeat of the factory)
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['day_of_year'] = df['timestamp'].dt.dayofyear

    # B. Encode Categorical Modes
    # The fitted encoders are saved with the models so that a consumer can map
    # mode strings to the same integer codes the models were trained on.
    categorical_cols = ['lineA_mode', 'lineB_mode', 'hvac_mode', 'lighting_mode']
    encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        # Convert to string first to be safe, then encode
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
        print(f"Encoded {col}: {le.classes_}")

    # C. Lag Features (Autoregression)
    # Each row receives the target value from earlier rows only.
    target_col = 'totalPower_W'
    df['lag_1'] = df[target_col].shift(1)
    df['lag_5'] = df[target_col].shift(5)
    # shift(1) after the rolling mean keeps the current row's own target out
    # of its window, so the feature only summarises the preceding 15 rows.
    df['rolling_mean_15'] = df[target_col].rolling(window=15).mean().shift(1)

    # Drop the first few rows that now contain NaNs due to shifting
    df = df.dropna()
    print(f"Data shape after engineering: {df.shape}")

    # Split position shared by both models (no shuffle, strictly temporal split).
    split_idx = int(len(df) * 0.8)
    if split_idx < 1:
        # Fail early with a clear message instead of an obscure error from
        # fitting a model on an empty training set.
        raise ValueError(
            f"Not enough rows to train: {len(df)} left after feature engineering (need at least 2)."
        )

    # ==========================================
    # 3. TRAIN SOLAR MODEL (Supply Side)
    # ==========================================
    print("\n--- 3. Training Solar Model (Supply) ---")
    # Solar only cares about Time. It doesn't care about your machines.
    X_train_s, X_test_s = _temporal_split(df[['hour', 'day_of_year', 'month']], split_idx)
    y_train_s, y_test_s = _temporal_split(df['pvPower_W'], split_idx)

    model_solar = _fit_regressor(X_train_s, y_train_s)

    mae_solar = mean_absolute_error(y_test_s, model_solar.predict(X_test_s))
    print(f"Solar Model MAE: {mae_solar:.2f} W (On average, prediction is off by this much)")

    # ==========================================
    # 4. TRAIN CONSUMPTION MODEL (Demand Side)
    # ==========================================
    print("\n--- 4. Training Consumption Model (Demand) ---")
    # Consumption depends on Machines + Time + Recent History
    features_load = [
        'lineA_speed_pct', 'lineB_speed_pct',
        'lineA_mode', 'lineB_mode',
        'hvac_mode', 'lighting_mode',
        'prodA_uph', 'prodB_uph',
        'hour', 'day_of_week',
        'lag_1', 'lag_5', 'rolling_mean_15'
    ]

    X_train_l, X_test_l = _temporal_split(df[features_load], split_idx)
    y_train_l, y_test_l = _temporal_split(df[target_col], split_idx)

    model_load = _fit_regressor(X_train_l, y_train_l)

    # Predict once and reuse the result for both metrics.
    pred_load = model_load.predict(X_test_l)
    mae_load = mean_absolute_error(y_test_l, pred_load)
    r2_load = r2_score(y_test_l, pred_load)

    print(f"Consumption Model MAE: {mae_load:.2f} W")
    print(f"Model Accuracy (R2): {r2_load:.2f} (Closer to 1.0 is better)")

    # ==========================================
    # 5. SAVE ARTIFACTS
    # ==========================================
    print("\n--- 5. Saving Artifacts for Chatbot ---")
    # os.path.join keeps the paths correct whether or not MODEL_DIR ends
    # with a path separator.
    joblib.dump(model_solar, os.path.join(MODEL_DIR, 'solar_model.pkl'))
    joblib.dump(model_load, os.path.join(MODEL_DIR, 'consumption_model.pkl'))
    joblib.dump(encoders, os.path.join(MODEL_DIR, 'label_encoders.pkl'))

    # Also save a tiny sample of data (last row) so the chatbot knows the "Current State"
    # when it first starts up. The row is taken after feature engineering, so
    # it holds the encoded mode codes and the lag columns as well.
    last_state = df.iloc[-1].to_dict()
    joblib.dump(last_state, os.path.join(MODEL_DIR, 'last_known_state.pkl'))

    print(f"Success! Models saved in folder '{MODEL_DIR}'")

# Entry point: run the whole training pipeline when this code is executed directly.
if __name__ == "__main__":
    train_factory_digital_twin()

--- 1. Loading Data from /content/drive/MyDrive/Fac/factory_simulation_data.csv ---
Loaded 10000 rows. Time range: 2025-11-11 07:02:33.542553+00:00 to 2025-11-18 05:41:33.542553+00:00
--- 2. Preprocessing & Feature Engineering ---
Encoded lineA_mode: ['IDLE' 'OFF' 'RUNNING']
Encoded lineB_mode: ['IDLE' 'OFF' 'RUNNING']
Encoded hvac_mode: ['IDLE' 'OFF' 'RUNNING']
Encoded lighting_mode: ['IDLE' 'OFF' 'RUNNING']
Data shape after engineering: (9985, 22)

--- 3. Training Solar Model (Supply) ---
Solar Model MAE: 58.82 W (On average, prediction is off by this much)

--- 4. Training Consumption Model (Demand) ---
Consumption Model MAE: 75.52 W
Model Accuracy (R2): 1.00 (Closer to 1.0 is better)

--- 5. Saving Artifacts for Chatbot ---
Success! Models saved in folder '/content/drive/MyDrive/Fac/'
