<a href="https://colab.research.google.com/github/divyanshsaxena21/Smart_Water_Monitoring/blob/main/Smart_Water_Monitoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
import xgboost as xgb

In [None]:
# Read the raw training table (the path assumes the Colab /content directory).
train_csv_path = "/content/train.csv"
train_df = pd.read_csv(train_csv_path)


In [None]:
# Numeric feature columns. They are coerced to a numeric dtype in the next cell
# (entries that cannot be parsed become NaN and are later filled by the mean
# imputer); no rows are dropped because of them. 'Humidity' is part of this
# list, so it does not need a separate conversion of its own.
num_cols = ['Residents', 'Temperature', 'Humidity', 'Water_Price',
            'Period_Consumption_Index', 'Guests', 'Appliance_Usage']


In [None]:
# Coerce every numeric feature to a numeric dtype; values that cannot be parsed
# become NaN and are filled in later by the mean imputer.
train_df[num_cols] = train_df[num_cols].apply(pd.to_numeric, errors='coerce')

# Give the target the same coercion so that non-numeric labels turn into NaN and
# are removed together with genuinely missing ones: a regressor cannot be
# trained on rows without a valid target value.
train_df['Water_Consumption'] = pd.to_numeric(train_df['Water_Consumption'], errors='coerce')
# Assignment instead of inplace=True keeps the lineage of train_df explicit.
train_df = train_df.dropna(subset=['Water_Consumption'])


In [None]:
# Column to predict, and the full list of model inputs
# (numeric columns first, followed by the three categorical ones).
target = 'Water_Consumption'
features = [*num_cols, 'Apartment_Type', 'Income_Level', 'Amenities']


In [None]:
# Categorical features are exactly the model inputs that are not numeric:
# 'Apartment_Type', 'Income_Level' and 'Amenities'.
cat_features = [col for col in features if col not in num_cols]


In [None]:
# Numeric branch: fill gaps with the column mean, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode; categories not seen during fitting are ignored instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
column_branches = [
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(column_branches)


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
feature_frame = train_df[features]
target_series = train_df[target]
X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Preprocess data
# Fit the imputers, scaler and one-hot encoder on the training split only, then
# apply the already-fitted preprocessor to the validation split.
# NOTE(review): both names are overwritten with the transformed output, so
# re-running this cell without first re-running the split cell presumably fails
# (the preprocessor selects columns by name) — consider separate names for the
# transformed data.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

In [None]:
# Fit the three regressors on the same preprocessed training data. Each uses
# 100 estimators and the same seed; `fit` returns the estimator itself, so
# construction and training are chained.
ada_model = AdaBoostRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Validation predictions, one array per model.
y_val_pred_ada = ada_model.predict(X_val)
y_val_pred_lgb = lgb_model.predict(X_val)
y_val_pred_xgb = xgb_model.predict(X_val)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 22
[LightGBM] [Info] Start training from score 164.187287


In [None]:
# Equal-weight blend: element-wise mean of the three validation predictions.
# NOTE(review): in the recorded run this blend scored R² 0.9453 versus 0.9772
# for LightGBM alone, so equal weighting may not be the best choice.
y_val_pred_ensemble = np.mean([y_val_pred_ada, y_val_pred_lgb, y_val_pred_xgb], axis=0)


In [None]:
# Regression report helper
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, MSE, RMSE and R² for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        None. The report is written to standard output only.
    """
    mse = mean_squared_error(y_true, y_pred)
    report_lines = [
        f"\n📊 {model_name} Model Performance:",
        f"MAE: {mean_absolute_error(y_true, y_pred):.2f}",
        f"MSE: {mse:.2f}",
        # RMSE is derived from the MSE computed above rather than recomputed.
        f"RMSE: {np.sqrt(mse):.2f}",
        f"R² Score: {r2_score(y_true, y_pred):.4f}",
    ]
    print("\n".join(report_lines))

# Report each individual model first, then the averaged ensemble.
validation_runs = [
    (y_val_pred_ada, "AdaBoostRegressor"),
    (y_val_pred_lgb, "LightGBM"),
    (y_val_pred_xgb, "XGBoost"),
    (y_val_pred_ensemble, "Ensemble (AdaBoost + LightGBM + XGBoost)"),
]
for val_predictions, run_label in validation_runs:
    evaluate_model(y_val, val_predictions, run_label)



📊 AdaBoostRegressor Model Performance:
MAE: 34.78
MSE: 1591.16
RMSE: 39.89
R² Score: 0.7156

📊 LightGBM Model Performance:
MAE: 7.00
MSE: 127.53
RMSE: 11.29
R² Score: 0.9772

📊 XGBoost Model Performance:
MAE: 8.38
MSE: 169.59
RMSE: 13.02
R² Score: 0.9697

📊 Ensemble (AdaBoost + LightGBM + XGBoost) Model Performance:
MAE: 14.44
MSE: 305.93
RMSE: 17.49
R² Score: 0.9453


In [None]:
# Load the test data and give its numeric columns the same coercion as the
# training data: values that cannot be parsed become NaN and are filled by the
# fitted imputer at prediction time. 'Humidity' is part of num_cols, so a
# separate conversion for it is unnecessary.
test_df = pd.read_csv('/content/test.csv')
test_df[num_cols] = test_df[num_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Coerce the numeric columns of the test data ('Humidity' included, as it is
# part of num_cols).
# NOTE(review): this repeats the conversion already done in the cell that loads
# test.csv; coercion is idempotent, so the cell is harmless and can be deleted.
test_df[num_cols] = test_df[num_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# Use the test file's 'Timestamp' column as the row identifier when present;
# otherwise fall back to a 0..n-1 running index.
if 'Timestamp' in test_df:
    timestamps = test_df['Timestamp']
else:
    timestamps = pd.Series(range(len(test_df)))

In [None]:
# Apply the preprocessor fitted on the training split to the test features.
X_test = preprocessor.transform(test_df[features])

# Predict with each fitted model, then blend the three with equal weights.
ada_predictions, lgb_predictions, xgb_predictions = (
    fitted_model.predict(X_test)
    for fitted_model in (ada_model, lgb_model, xgb_model)
)
ensemble_predictions = np.mean(
    [ada_predictions, lgb_predictions, xgb_predictions], axis=0
)



In [None]:
# Write the blended predictions to disk. The file name lives in one variable so
# the confirmation message always names the file that was actually written
# (it previously reported a different name than the one passed to to_csv).
submission_path = "submission_ensemble_xgb_lgbm_ada.csv"
submission_df = pd.DataFrame({'Timestamp': timestamps, 'Water_Consumption': ensemble_predictions})
submission_df.to_csv(submission_path, index=False)

print(f"✅ Submission file created: {submission_path}")


✅ Submission file created: submission_ensemble.csv
