In [1]:
# Installing required packages
!pip install xgboost pandas numpy scikit-learn matplotlib



Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb


In [4]:
df = pd.read_csv("C:/Users/user/OneDrive/dataset/train.csv")
df_test = pd.read_csv("C:/Users/user/OneDrive/dataset/test.csv")


In [6]:
def process_datetime(df):
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    df["Hour"] = df["Timestamp"].dt.hour
    df["Day"] = df["Timestamp"].dt.day
    df["Month"] = df["Timestamp"].dt.month
    df["DayOfWeek"] = df["Timestamp"].dt.dayofweek
    df["IsWeekend"] = (df["DayOfWeek"] >= 5).astype(int)
    df["IsMorningPeak"] = ((df["Hour"] >= 6) & (df["Hour"] <= 9)).astype(int)
    df.drop(columns=["Timestamp"], inplace=True)
    return df

df = process_datetime(df)
df_test = process_datetime(df_test)

In [7]:
def fill_missing_values(df):
    num_cols = df.select_dtypes(include=["number"]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    
    cat_cols = df.select_dtypes(include=["object"]).columns
    for col in cat_cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df

df = fill_missing_values(df)
df_test = fill_missing_values(df_test)

In [8]:
def encode_categorical(df, label_encoders=None, is_train=True):
    if label_encoders is None:
        label_encoders = {}
    
    cat_cols = df.select_dtypes(include=["object"]).columns
    for col in cat_cols:
        if is_train:
            label_encoders[col] = LabelEncoder()
            df[col] = label_encoders[col].fit_transform(df[col])
        else:
            df[col] = df[col].map(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)
    return df, label_encoders

df, label_encoders = encode_categorical(df, is_train=True)
df_test, _ = encode_categorical(df_test, label_encoders, is_train=False)

In [9]:
assert df.select_dtypes(include=["object"]).empty, "Data still contains categorical values!"

X = df.drop(columns=["Water_Consumption"], errors='ignore')
y = df["Water_Consumption"] if "Water_Consumption" in df.columns else None
X_test = df_test.copy()

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [15]:
from xgboost import XGBRegressor

In [16]:
# 1. Split data
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 2. Initialize model (compatible with all versions)
params = {
    'objective': 'reg:squarederror',
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'colsample_bytree': 0.7,
    'subsample': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'rmse'
}

# Version detection and training
try:
    # For newer XGBoost (>=1.3.0)
    xgb_model = xgb.XGBRegressor(**params, early_stopping_rounds=50)
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=10
    )
except TypeError:
    # For older XGBoost (<1.3.0)
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=10
    )

# 3. Evaluation
y_val_pred = xgb_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"VALIDATION RMSE: {rmse:.4f}")

# 4. Final predictions
y_test_pred = xgb_model.predict(X_test_scaled)
y_test_pred = np.clip(y_test_pred, a_min=0, a_max=None)

[0]	validation_0-rmse:74.24900
[10]	validation_0-rmse:69.26950
[20]	validation_0-rmse:64.99885
[30]	validation_0-rmse:60.99422
[40]	validation_0-rmse:57.53599
[50]	validation_0-rmse:54.36019
[60]	validation_0-rmse:51.48022
[70]	validation_0-rmse:48.79886
[80]	validation_0-rmse:46.45341
[90]	validation_0-rmse:44.23751
[100]	validation_0-rmse:42.26123
[110]	validation_0-rmse:40.38527
[120]	validation_0-rmse:38.60561
[130]	validation_0-rmse:37.03044
[140]	validation_0-rmse:35.58088
[150]	validation_0-rmse:34.22389
[160]	validation_0-rmse:32.95596
[170]	validation_0-rmse:31.76429
[180]	validation_0-rmse:30.67279
[190]	validation_0-rmse:29.68752
[200]	validation_0-rmse:28.70975
[210]	validation_0-rmse:27.77328
[220]	validation_0-rmse:26.93884
[230]	validation_0-rmse:26.14058
[240]	validation_0-rmse:25.37917
[250]	validation_0-rmse:24.71181
[260]	validation_0-rmse:24.07095
[270]	validation_0-rmse:23.51330
[280]	validation_0-rmse:22.98614
[290]	validation_0-rmse:22.47755
[300]	validation_0-rm

In [27]:
# 1. Generate predictions
y_test_pred = xgb_model.predict(X_test_scaled)
y_test_pred_rounded = np.round(y_test_pred, 2)

# 2. Create submission DataFrame 
submission = pd.DataFrame({
    'Timestamp': pd.read_csv("C:/Users/user/OneDrive/dataset/test.csv")['Timestamp'],
    'Water_Consumption': np.round(y_test_pred, 2)  # Strict 2 decimal rounding
})

# 3. Save without index
submission.to_csv("C:/Users/user/OneDrive/dataset/submission_final.csv", index=False)

# 4. Show first 5 rows 
print(submission.head())

       Timestamp  Water_Consumption
0  11/10/2014 16         311.750000
1  12/10/2014 00         178.289993
2  12/10/2014 08          77.940002
3  12/10/2014 16         127.629997
4  13/10/2014 00         126.970001
