In [137]:
# Refer to the link below for a detailed walkthrough of basic Linear Regression without the pipeline concept
#https://colab.research.google.com/drive/1rsqTs7PeWNGPfIeWZAlvw6FSJRA7m4S4#scrollTo=luiZzhzyuG_M

In [138]:
# Import the necessary libraries

# Data handling libraries
import pandas as pd          # For loading and manipulating tabular data
import numpy as np           # For numerical operations

# Import Scikit-Learn modules for preprocessing and modeling
from sklearn.model_selection import train_test_split    # Train Test Split
from sklearn.impute import SimpleImputer                # Handle missing values
from sklearn.preprocessing import OneHotEncoder         # Encode categorical data
from sklearn.preprocessing import StandardScaler        # Feature scaling
from sklearn.pipeline import Pipeline                   # Pipeline
from sklearn.compose import ColumnTransformer           # Column Transformer
from sklearn.linear_model import LinearRegression       # Linear Regression model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # Error Metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV   # Cross Validation Grid Search and Random Search

# Suppress warnings
# filterwarnings("ignore") is called without a category or module filter, so it
# silences every warning raised after this cell runs.
# NOTE(review): this also hides deprecation/convergence notices from pandas and
# scikit-learn — consider narrowing the filter once the noisy warning is identified.
import warnings
warnings.filterwarnings("ignore")

In [139]:
# Read the housing CSV into a pandas DataFrame named `data`.
# NOTE(review): the path is a Colab-style absolute location; the file must exist there.
data = pd.read_csv(filepath_or_buffer='/content/sample_data/Housing.csv')

In [140]:
# Check the shape of the dataset (Rows, Columns)
# `data.shape` is a (row_count, column_count) tuple; it is printed as-is.
print(f"Dataset Shape: {data.shape}")

Dataset Shape: (545, 13)


In [141]:
# Show the first two rows to get a feel for the columns and their values.
data.head(n=2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished


In [142]:
# Separate the target from the predictors.
# `price` is the dependent variable (y); every remaining column is a feature (X).
y = data['price']
X = data.drop(columns=['price'])

In [143]:
# Hold out a test set: 80% of the rows are used for training, 20% for testing.
# random_state=42 fixes which rows land in each split, so the split is reproducible.
# The test split is kept aside for final evaluation; the cross-validated searches
# later in the notebook are fitted on the training split only.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shape of each split as a sanity check (train_X, test_X, train_y, test_y).
tuple(split.shape for split in (train_X, test_X, train_y, test_y))

((436, 12), (109, 12), (436,), (109,))

In [144]:
# Partition the feature names by dtype: object-typed columns are handled as
# categorical features, every other column is handled as numerical.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(exclude=['object']).columns

In [145]:
# Preprocessing for numerical features, applied in order:
#   1. 'imputer' - fill missing values using the 'mean' strategy
#   2. 'scaler'  - standardize the values with StandardScaler
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)
# Leave the pipeline as the last expression so the notebook renders it.
num_pipeline

In [146]:
# Preprocessing for categorical features, applied in order:
#   1. 'imputer' - fill missing values using the 'most_frequent' strategy
#   2. 'encoder' - one-hot encode the categories (text -> numeric columns);
#                  handle_unknown='ignore' keeps categories unseen during fit
#                  from raising an error at transform time
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)
# Leave the pipeline as the last expression so the notebook renders it.
cat_pipeline

In [147]:
# Route each group of columns through its own sub-pipeline:
# numerical columns go through `num_pipeline`, categorical ones through `cat_pipeline`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)
# Leave the transformer as the last expression so the notebook renders it.
preprocessor

In [148]:
# Assemble the end-to-end pipeline as a single object:
# raw data -> 'preprocessing' (impute, encode, scale) -> 'model' (LinearRegression).
# The step name 'model' is what the `model__...` keys in the search grids refer to.
model_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ]
)
# Leave the pipeline as the last expression so the notebook renders it.
model_pipeline

In [149]:
# Fit the preprocessing steps and the regression model on the training split only.
model_pipeline.fit(X=train_X, y=train_y)

In [150]:
# Predict prices for the held-out test split with the fitted baseline pipeline.
linear_predict = model_pipeline.predict(X=test_X)

In [151]:
# Score the baseline predictions against the true test prices.
# MSE is computed first because RMSE is derived from it.
mse = mean_squared_error(y_true=test_y, y_pred=linear_predict)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=test_y, y_pred=linear_predict)
r2 = r2_score(y_true=test_y, y_pred=linear_predict)

# Report the scores in a fixed order: MAE, MSE, RMSE, R-squared.
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 970043.4039201644
Mean Squared Error: 1754318687330.6682
Root Mean Squared Error: 1324506.9600914402
R-squared: 0.6529242642153175


In [152]:
# Candidate hyperparameters for the search.
# The `model__` prefix addresses a parameter of the pipeline step named 'model';
# two values for each of the two parameters give 2 x 2 = 4 combinations.
param_grid = {
    'model__fit_intercept': [True, False],
    'model__positive': [True, False],
}

In [153]:
# Configure an exhaustive, cross-validated search over `param_grid`.
#   scoring='r2' -> candidates are ranked by R-squared
#   cv=5         -> each candidate is evaluated with 5-fold cross-validation
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    scoring='r2',
    cv=5,
)

In [154]:
# Run the grid search on the training split: every parameter combination is
# trained and cross-validated, and the best-scoring one is selected.
grid_search.fit(X=train_X, y=train_y)

# Report the winning parameter combination and its cross-validated score.
print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Best Params: {'model__fit_intercept': True, 'model__positive': True}
Best CV Score: 0.6470401792343207


In [155]:
# Keep a handle on the pipeline that the grid search selected as best.
# NOTE(review): `refit` is not overridden when the search is built, so per the
# scikit-learn default this should be the pipeline refit on all of train_X/train_y.
best_model = grid_search.best_estimator_

In [156]:
# Predict prices for the held-out test split with the grid-search winner.
best_pred = best_model.predict(X=test_X)

In [157]:
# Score the grid-search model's predictions against the true test prices.
# MSE is computed first because RMSE is derived from it.
mse = mean_squared_error(y_true=test_y, y_pred=best_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=test_y, y_pred=best_pred)
r2 = r2_score(y_true=test_y, y_pred=best_pred)

# Report the scores in a fixed order: MAE, MSE, RMSE, R-squared.
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 970043.4039201642
Mean Squared Error: 1754318687330.6677
Root Mean Squared Error: 1324506.96009144
R-squared: 0.6529242642153177


In [158]:
# Configure a randomized, cross-validated search over the same parameter space.
# NOTE(review): `param_grid` holds only 4 combinations and n_iter=4, so this
# search should end up evaluating the same candidates as the grid search.
random_search = RandomizedSearchCV(
    model_pipeline,       # Full preprocessing + model pipeline
    param_grid,           # Parameter space to sample from
    n_iter=4,             # Number of sampled combinations to evaluate
    scoring='r2',         # Metric to optimize
    cv=5,                 # 5-fold cross-validation
    random_state=42,      # Reproducible sampling
)

In [159]:
# Run the randomized search on the training split: the sampled parameter
# combinations are trained and cross-validated, and the best-scoring one is selected.
random_search.fit(X=train_X, y=train_y)

# Report the winning parameter combination and its cross-validated score.
print("Best Params:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

Best Params: {'model__positive': True, 'model__fit_intercept': True}
Best CV Score: 0.6470401792343207


In [160]:
# Keep a handle on the pipeline that the randomized search selected as best.
# NOTE(review): `refit` is not overridden when the search is built, so per the
# scikit-learn default this should be the pipeline refit on all of train_X/train_y.
random_best_model = random_search.best_estimator_

In [161]:
# Predict prices for the held-out test split with the randomized-search winner.
random_best_pred = random_best_model.predict(X=test_X)

In [162]:
# Score the randomized-search model's predictions against the true test prices.
# MSE is computed first because RMSE is derived from it.
mse = mean_squared_error(y_true=test_y, y_pred=random_best_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=test_y, y_pred=random_best_pred)
r2 = r2_score(y_true=test_y, y_pred=random_best_pred)

# Report the scores in a fixed order: MAE, MSE, RMSE, R-squared.
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 970043.4039201642
Mean Squared Error: 1754318687330.6677
Root Mean Squared Error: 1324506.96009144
R-squared: 0.6529242642153177
