In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)



In [2]:
# Load the preprocessed data
try:
    processed_df = pd.read_csv('../data/processed/cleaned_data.csv')
    X = processed_df.drop(columns=['Recycling_Rate'])
    y = processed_df['Recycling_Rate']
    print("Loaded from processed data")
except:
    print("Processed data not found. Loading raw data and preprocessing........")
    from src.data.preprocess import preprocess_data
    raw_df = pd.read_csv('../data/raw/Waste_Management_and_Recycling_India.csv')
    X, y, _ = preprocess_data(raw_df, is_training=True)
    print("Processed raw data")


print(X.shape)
print(y.shape)

Loaded from processed data
(850, 235)
(850,)


In [3]:
# Advanced Train-Test Split (Group based)
print("\n === Advanced Train Test Split =======")

# For robust evaluation, we need to split by City-Year groups to avoid data leakage
# Let's recreate the original dataframe to extract groups
raw_df = pd.read_csv('../data/raw/Waste_Management_and_Recycling_India.csv')

# Create unique group identifiers
raw_df['City_Year_Group'] = raw_df['City/District'] + '-' + raw_df['Year'].astype(str) + '-' + raw_df['Waste Type']

# Get unique groups
unique_groups = raw_df['City_Year_Group'].unique()
print(f"Total unique groups: {len(unique_groups)}")



Total unique groups: 850


In [5]:
# Split groups into train and test
train_groups, test_groups = train_test_split(unique_groups, test_size=0.2, random_state=42, stratify=raw_df['City/District'].value_counts())


ValueError: Found input variables with inconsistent numbers of samples: [850, 34]