In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


In [2]:
# Read the training and test splits from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the leading rows of the training data
train_df.head()


Unnamed: 0,Timestamp,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage,Water_Consumption
0,01/01/2002 00,1,Studio,15.31,46.61,1.06,0.97,Low,0,Swimming Pool,0.0,64.85
1,01/01/2002 08,4,,21.01,66.11,2.98,0.91,Upper Middle,1,Swimming Pool,1.0,192.5
2,01/01/2002 16,2,Cottage,12.86,60.86,1.44,1.43,Middle,0,,1.0,116.62
3,02/01/2002 00,2,1BHK,20.16,50.58,1.48,0.91,Middle,-1,Garden,0.0,76.96
4,02/01/2002 08,2,Cottage,16.23,52.25,1.14,1.11,Middle,0,Fountain,0.0,104.7


In [5]:
# Count the missing entries per column in each split and print both summaries
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()
print("Missing values in train:\n", train_missing)
print("\nMissing values in test:\n", test_missing)


Missing values in train:
 Timestamp                      0
Residents                      0
Apartment_Type               426
Temperature                  441
Humidity                       0
Water_Price                    0
Period_Consumption_Index       0
Income_Level                 426
Guests                         0
Amenities                   5997
Appliance_Usage              415
Water_Consumption              0
dtype: int64

Missing values in test:
 Timestamp                      0
Residents                      0
Apartment_Type               166
Temperature                  150
Humidity                       0
Water_Price                    0
Period_Consumption_Index       0
Income_Level                 165
Guests                         0
Amenities                   2513
Appliance_Usage              177
dtype: int64


In [9]:
# Print the column names of both splits so they can be compared by eye
print("Train Columns:", list(train_df.columns))
print("Test Columns:", list(test_df.columns))


Train Columns: ['Timestamp', 'Residents', 'Apartment_Type', 'Temperature', 'Humidity', 'Water_Price', 'Period_Consumption_Index', 'Income_Level', 'Guests', 'Amenities', 'Appliance_Usage', 'Water_Consumption']
Test Columns: ['Timestamp', 'Residents', 'Apartment_Type', 'Temperature', 'Humidity', 'Water_Price', 'Period_Consumption_Index', 'Income_Level', 'Guests', 'Amenities', 'Appliance_Usage']


In [11]:
# Fail fast when the regression target is absent from the training data
target_present = 'Water_Consumption' in train_df.columns
if not target_present:
    raise KeyError("Column 'Water_Consumption' is missing in train.csv")


In [13]:
# Handle missing values
# Impute numeric columns with the mean of the TRAINING data. The same training
# statistic is applied to the test split so both splits are preprocessed
# identically, instead of each split being filled with its own mean.
num_cols = train_df.select_dtypes(include=['number']).columns
for col in num_cols:
    # Computed before filling; pandas skips NaN when averaging
    train_mean = train_df[col].mean()
    train_df[col] = train_df[col].fillna(train_mean)
    # Numeric columns that exist only in train (e.g. the target) are skipped for test
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(train_mean)



In [15]:
# Fill categorical (object-typed) columns with the most frequent TRAINING value.
# The training mode is reused for the test split so both splits are imputed with
# the same category, instead of each split being filled with its own mode.
cat_cols = train_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    # mode() returns the most frequent value(s) in sorted order; take the first
    train_mode = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(train_mode)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(train_mode)


In [17]:
# Cast every categorical column to str, in train and (where present) in test
for frame in (train_df, test_df):
    for col in cat_cols:
        if col in frame.columns:
            frame[col] = frame[col].astype(str)


In [19]:
# Encode categorical features using Label Encoding.
# 'Timestamp' is the row identifier: it is dropped from the feature matrices and
# written to the submission file, so it must keep its original values rather
# than being replaced by integer codes (or -1 for values unseen in train).
label_encoders = {}
for col in cat_cols:
    if col == 'Timestamp':
        continue
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    if col in test_df.columns:
        # Build the label -> code lookup once instead of calling le.transform
        # for every row; a label's code is its position in le.classes_.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        # Labels never seen during fitting map to NaN and are encoded as -1
        test_df[col] = test_df[col].map(code_by_label).fillna(-1).astype('int64')
    label_encoders[col] = le


In [20]:
# Add every training column that the test split lacks as a zero-filled column;
# the target is left out because it only exists in train
missing_cols = set(train_df.columns).difference(test_df.columns, {'Water_Consumption'})
for absent_col in missing_cols:
    test_df[absent_col] = 0

In [23]:
# Separate the target from the features and drop the identifier column
X_train = train_df.drop(columns=['Timestamp', 'Water_Consumption'])  # Features
y_train = train_df['Water_Consumption']  # Target variable
# Align the test features to the training column set and order. Columns that
# were appended to test_df after loading do not follow the training order, and
# scikit-learn estimators check feature names (and their order) at predict time.
# A training column absent from test is created filled with 0; a test-only
# column is dropped.
X_test = test_df.drop(columns=['Timestamp']).reindex(columns=X_train.columns, fill_value=0)


In [25]:
# Fit a 150-tree random forest with a fixed seed on the training features
rf_params = {"n_estimators": 150, "random_state": 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)


In [32]:
# Fit a gradient-boosting regressor: 150 stages, learning rate 0.05, fixed seed
gb_params = {"n_estimators": 150, "learning_rate": 0.05, "random_state": 42}
gb_model = GradientBoostingRegressor(**gb_params)
gb_model.fit(X_train, y_train)


In [34]:
# Fit an XGBoost regressor: 150 rounds, learning rate 0.05, fixed seed
xgb_params = {"n_estimators": 150, "learning_rate": 0.05, "random_state": 42}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)


In [36]:
# Score the test features with each of the three fitted models
rf_preds, gb_preds, xgb_preds = (
    fitted.predict(X_test) for fitted in (rf_model, gb_model, xgb_model)
)


In [38]:
# Blend the three prediction vectors with fixed weights: 0.4 for the random
# forest, 0.3 each for gradient boosting and XGBoost
rf_weight, gb_weight, xgb_weight = 0.4, 0.3, 0.3
final_predictions = rf_weight * rf_preds + gb_weight * gb_preds + xgb_weight * xgb_preds


In [40]:
# Build the submission table from the test identifiers and the blended
# predictions, then write it to disk without the index column
submission = test_df[['Timestamp']].assign(Water_Consumption=final_predictions)
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")


Submission file 'submission.csv' created successfully!
