In [1]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [2]:
# Load the dataset
df = pd.read_csv(r"C:\Users\sukha\Downloads\DATA_SCIENCE\DATASET\Cleaned_DATA.csv")

In [3]:
# --- 1. Drop Columns ---
# Drop 'Country' (constant) and 'Month' (high missing values)
df = df.drop(columns=['Country', 'Month'])



In [4]:
# --- 2. Define Features (X) and Target (y) ---
target_column = 'Reservoir_Water_Storage_BCM'
y = df[target_column]
X = df.drop(columns=[target_column])



In [5]:
# --- 3. Handle Categorical Features (One-Hot Encoding) ---
categorical_cols = ['State', 'District', 'Reservoir Basin Name', 'Reservoir Name']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
feature_names = X.columns.tolist()


In [6]:
# Important for XGBoost: Sanitize column names
# XGBoost can have issues with special characters like ':', ',', '[', ']', and '<' in feature names.
# Replacing these characters with underscores is a robust practice.
X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)



In [7]:
# --- 4. Split Data ---
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Data Preparation Complete. X_train shape: {X_train.shape}")

Data Preparation Complete. X_train shape: (42900, 584)


In [8]:
# ðŸ”¥ FIX: Convert Pandas DataFrames/Series to NumPy arrays for XGBoost compatibility
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print(f"Data Preparation Complete. X_train shape: {X_train.shape}")

Data Preparation Complete. X_train shape: (42900, 584)


In [9]:
# --- 1. Initialize Model ---
# Initialize the XGBoost Regressor
# A common starting configuration:
# n_estimators: Number of boosting rounds (trees)
# learning_rate: Controls the step size shrinkage (lower is generally safer)
# objective: Specifies the type of loss function (reg:squarederror is standard for regression)
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=100, 
    learning_rate=0.1, 
    random_state=42, 
    n_jobs=-1 # Use all available cores
)

print("Starting XGBoost Regressor Model Training...")


Starting XGBoost Regressor Model Training...


In [10]:
# --- 2. Train Model ---
# Train the model using the training data
xgb_model.fit(X_train, y_train)

print("XGBoost Regressor Model training complete.")

XGBoost Regressor Model training complete.


In [11]:
# --- 1. Make Predictions ---
# Predict the reservoir water storage on the test set
y_pred = xgb_model.predict(X_test)

In [12]:
# --- 2. Calculate Evaluation Metrics ---
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)


In [13]:

# --- 3. Display Results ---
print("--- XGBoost Regressor Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R^2): {r2:.4f}")

--- XGBoost Regressor Model Evaluation ---
Mean Absolute Error (MAE): 0.0538
Mean Squared Error (MSE): 0.0270
Root Mean Squared Error (RMSE): 0.1644
R-squared (R^2): 0.9334


In [14]:
# Optional: Examine residuals
# The difference between actual and predicted values. Ideally, these should be close to zero.
residuals = y_test - y_pred
print(f"\nMean Residual: {residuals.mean():.4f}")
print(f"Standard Deviation of Residuals: {residuals.std():.4f}")


Mean Residual: -0.0012
Standard Deviation of Residuals: 0.1644


In [15]:
import pickle
import os
os.makedirs('models', exist_ok=True)
with open('models/xgboost.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [16]:
# Run this code in each of your model notebooks to properly save models
# Replace the variables with your actual variable names

import pickle
import pandas as pd
import json
import os

# Create models directory
os.makedirs('models', exist_ok=True)

# ====================
# IDENTIFY YOUR FEATURES
# ====================

# Option 1: If you still have X_train available
if 'X_train' in locals():
    if 'X_train' in locals():
    # If X_train is a DataFrame
        if hasattr(X_train, 'columns'):
            feature_names = X_train.columns.tolist()
            categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
            numeric_columns = X_train.select_dtypes(include=[np.number]).columns.tolist()
        else:
            # If X_train is a numpy array, use the original dataset to recover feature names
            data = pd.read_csv(r"C:\Users\sukha\Downloads\DATA_SCIENCE\DATASET\Cleaned_DATA.csv")
            target_column = 'Production'  # ðŸ”¹ Change this to your target
            feature_columns = [col for col in data.columns if col != target_column]
            
            # Handle encoding just like during training
            X_encoded = pd.get_dummies(data[feature_columns], drop_first=True)
            feature_names = X_encoded.columns.tolist()
    
            categorical_columns = data[feature_columns].select_dtypes(include=['object']).columns.tolist()
            numeric_columns = data[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    
            print("Recovered feature names from original dataset.")

    
    print("Features from X_train:")
    print(f"Categorical: {categorical_columns}")
    print(f"Numeric: {numeric_columns}")

# Option 2: If X_train is not available, get from original data
else:
    # Load your original data
    data = pd.read_csv(r"C:\Users\sukha\Downloads\DATA_SCIENCE\DATASET\Cleaned_DATA.csv")
    
    # Identify your target column (adjust the name!)
    target_column = 'Production'  # CHANGE THIS TO YOUR ACTUAL TARGET
    
    # Get feature columns (everything except target)
    feature_columns = [col for col in data.columns if col != target_column]
    
    categorical_columns = data[feature_columns].select_dtypes(include=['object']).columns.tolist()
    numeric_columns = data[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    
    print("Features from original data:")
    print(f"Categorical: {categorical_columns}")
    print(f"Numeric: {numeric_columns}")
    
    # If you used one-hot encoding, get those feature names
    X_encoded = pd.get_dummies(data[feature_columns], drop_first=True)
    feature_names = X_encoded.columns.tolist()

# ====================
# SAVE YOUR MODELS
# ====================

# FOR LINEAR REGRESSION
if 'linear_model' in locals() or 'model' in locals():  # Adjust variable name
    model_to_save = linear_model if 'linear_model' in locals() else model
    
    model_data = {
        'model': model_to_save,
        'feature_names': feature_names,
        'categorical_columns': categorical_columns,
        'numeric_columns': numeric_columns
    }
    
    with open('models/linear_regression.pkl', 'wb') as f:
        pickle.dump(model_data, f)
    print("âœ“ Linear Regression model saved")

# FOR RANDOM FOREST
if 'random_forest_model' in locals():
    model_data = {
        'model': random_forest_model,
        'feature_names': feature_names,
        'categorical_columns': categorical_columns,
        'numeric_columns': numeric_columns
    }
    
    with open('models/random_forest.pkl', 'wb') as f:
        pickle.dump(model_data, f)
    print("âœ“ Random Forest model saved")

# FOR XGBOOST
if 'xgb_model' in locals():
    model_data = {
        'model': xgb_model,
        'feature_names': feature_names,
        'categorical_columns': categorical_columns,
        'numeric_columns': numeric_columns
    }
    
    with open('models/xgboost.pkl', 'wb') as f:
        pickle.dump(model_data, f)
    print("âœ“ XGBoost model saved")

# FOR KNN
if 'knn_model' in locals():
    model_data = {
        'model': knn_model,
        'feature_names': feature_names,
        'categorical_columns': categorical_columns,
        'numeric_columns': numeric_columns
    }
    
    with open('models/knn.pkl', 'wb') as f:
        pickle.dump(model_data, f)
    print("âœ“ KNN model saved")

# FOR NEURAL NETWORK
if 'model' in locals():
    model_data = {
        'model': model,
        'feature_names': feature_names,
        'categorical_columns': categorical_columns,
        'numeric_columns': numeric_columns
    }
    
    with open('models/neural_network.pkl', 'wb') as f:
        pickle.dump(model_data, f)
    print("âœ“ Neural Network model saved")

# ====================
# SAVE FEATURE INFO
# ====================

# Get unique values for categorical columns
data = pd.read_csv(r"C:\Users\sukha\Downloads\DATA_SCIENCE\DATASET\Cleaned_DATA.csv")
categorical_values = {}
for col in categorical_columns:
    if col in data.columns:
        categorical_values[col] = data[col].dropna().unique().tolist()

feature_info = {
    'categorical_features': categorical_columns,
    'numeric_features': numeric_columns,
    'all_features': feature_names,
    'categorical_values': categorical_values
}

with open('models/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("\nâœ“ All models saved with proper feature information!")
print(f"\nFeature summary:")
print(f"  - {len(categorical_columns)} categorical features")
print(f"  - {len(numeric_columns)} numeric features")
print(f"  - {len(feature_names)} total features after encoding")

Recovered feature names from original dataset.
Features from X_train:
Categorical: ['Country', 'State', 'District', 'Month', 'Reservoir Basin Name', 'Reservoir Name']
Numeric: ['Year', 'Full_Reservoir_Capacity_BCM', 'Reservoir_Water_Level_M', 'Reservoir_Water_Storage_BCM']
âœ“ XGBoost model saved

âœ“ All models saved with proper feature information!

Feature summary:
  - 6 categorical features
  - 4 numeric features
  - 595 total features after encoding


In [17]:
import pickle
import pandas as pd
import numpy as np
import os
import json

os.makedirs('models', exist_ok=True)

# Ensure you have your dataset
# Load original data
data = pd.read_csv(r"C:\\Users\\sukha\\Downloads\\DATA_SCIENCE\\DATASET\\Cleaned_DATA.csv")

# --- FIX: Drop the same columns as in training ---
data = data.drop(columns=['Country', 'Month'])

target_column = 'Reservoir_Water_Storage_BCM'  # change this if different
X = data.drop(columns=[target_column])
y = data[target_column]

# Perform encoding just like during training
X_encoded = pd.get_dummies(X, drop_first=True)

# Important: Sanitize column names for feature list consistency
X_encoded.columns = X_encoded.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

feature_names = X_encoded.columns.tolist()
# NOTE: The feature count should now be 584
print(f"Feature count after fix: {len(feature_names)}")

# Identify feature types for feature_info.json
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()

# Save model + feature info
model_data = {
    "model": xgb_model,  # your trained XGBRegressor
    "feature_names": feature_names,
    "categorical_columns": categorical_columns,
    "numeric_columns": numeric_columns
}

with open('models/xgboost.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("âœ… Model saved successfully with corrected feature mapping.")

# Save categorical value info
# Load original data again to get full categorical values before dropping 'Country', 'Month' (optional, but robust)
original_data = pd.read_csv(r"C:\\Users\\sukha\\Downloads\\DATA_SCIENCE\\DATASET\\Cleaned_DATA.csv")

categorical_values = {
    col: original_data[col].dropna().unique().tolist()
    for col in categorical_columns if col in original_data.columns
}

feature_info = {
    "categorical_features": categorical_columns,
    "numeric_features": numeric_columns,
    "all_features": feature_names,
    "categorical_values": categorical_values
}

with open('models/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("âœ… Feature info saved.")

Feature count after fix: 584
âœ… Model saved successfully with corrected feature mapping.
âœ… Feature info saved.
