In [2]:
# Preprocessing Pipeline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load data
train = pd.read_csv("../DATA/train_v9rqX0R.csv")

# Target variable
y = train['Item_Outlet_Sales']
X = train.drop(columns=['Item_Outlet_Sales'])

# Step 1: Fix inconsistent labels

X['Item_Fat_Content'] = X['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
})


# Step 2: Define columns

numeric_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
categorical_features = ['Item_Fat_Content', 'Item_Type', 
                        'Outlet_Identifier', 'Outlet_Size', 
                        'Outlet_Location_Type', 'Outlet_Type']

# Step 3: Define transformers

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Step 4: Column transformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


# Step 5: Fit + transform
X_processed = preprocessor.fit_transform(X)

# Get feature names after transformation
cat_ohe = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = cat_ohe.get_feature_names_out(categorical_features)

all_feature_names = numeric_features + list(cat_feature_names)

X_processed_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed,
                              columns=all_feature_names)

print("Train shape after preprocessing:", X_processed_df.shape)



Train shape after preprocessing: (8523, 41)


In [3]:
#SPlitting the data
X_train, X_valid, y_train, y_valid = train_test_split(
    X_processed_df, y, test_size=0.2, random_state=42
)

In [4]:
X_processed_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,-0.831187,-0.970732,1.747454,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.630810,-0.908111,-1.489023,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.108727,-0.956917,0.010040,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.510904,-1.281758,0.660050,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-0.918719,-1.281758,-1.399220,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,-1.407246,-0.181193,1.180783,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8519,-1.048835,-0.371154,-0.527301,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8520,-0.523639,-0.599784,-0.897208,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8521,-1.325628,1.532880,-0.607977,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# ML Pipeline with Cross Validation & Hyperparameter Tuning (using preprocessed data)

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd



# Define models with hyperparameter grids (no preprocessing here)
models = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge": {
        "model": Ridge(),
        "params": {"alpha": [0.1, 1.0, 10.0]}
    },
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [10, 20],
            "min_samples_split": [2, 5]
        }
    },
    "XGBoost": {
        "model": XGBRegressor(objective="reg:squarederror", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 6],
            "learning_rate": [0.05, 0.1]
        }
    }
}

# K-Fold setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, mp in models.items():
    print(f" Running GridSearchCV for {name} ...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=kf,
                        scoring="neg_root_mean_squared_error",
                        n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)

    results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "Best RMSE": -grid.best_score_
    })

# Convert results to dataframe
results_df = pd.DataFrame(results)
print("\n Model Comparison Results")
display(results_df)

# Pick best model
best_model_name = results_df.loc[results_df["Best RMSE"].idxmin(), "Model"]
print(f"\n Best Model: {best_model_name}")

# Retrain best model on full training set
best_params = results_df.loc[results_df["Best RMSE"].idxmin(), "Best Params"]
final_model = models[best_model_name]["model"].set_params(**best_params)
final_model.fit(X_train, y_train)



 Running GridSearchCV for LinearRegression ...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
 Running GridSearchCV for Ridge ...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
 Running GridSearchCV for RandomForest ...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
import joblib

# Save the final trained best model
joblib.dump(final_model, "best_model.pkl")
print(" Best model saved as best_model.pkl")

