### Assignment-4

**Objective:**

Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries and Load Data

In [1]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset (you can replace with your own CSV if available)
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame

print("Data shape:", df.shape)
df.head()

Data shape: (20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Step 2: Load Dataset and Prepare Features and Target

In [2]:
# Step 2: Load Dataset and Prepare Features and Target
X = df.drop(columns=["MedHouseVal"])   # features
y = df["MedHouseVal"]                  # target

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (20640, 8)
Target shape: (20640,)


Step 3: Implement Cross-Validation

In [3]:
# Step 3: Implement Cross-Validation
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(random_state=42))
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, scoring="neg_root_mean_squared_error", cv=cv)

print("Cross-validation RMSE scores:", -scores)
print("Mean CV RMSE:", -scores.mean())

Cross-validation RMSE scores: [0.50490551 0.50785638 0.50459087 0.48706648 0.51267768]
Mean CV RMSE: 0.5034193825634511


Step 4: Hyperparameter Tuning with GridSearchCV

In [4]:
# Step 4: Hyperparameter Tuning with GridSearchCV
param_grid = {
    "model__n_estimators": [50, 100],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV RMSE: 0.5034193825634511


Step 5: Evaluate Best Model on Full Dataset

In [5]:
# Step 5: Evaluate Best Model on Full Dataset
best_model = grid.best_estimator_
y_pred = best_model.predict(X)

rmse = mean_squared_error(y, y_pred)
rmse = np.sqrt(rmse) # Calculate RMSE manually
r2 = r2_score(y, y_pred)

print("Final Evaluation on Full Dataset:")
print("RMSE:", rmse)
print("R2 Score:", r2)

Final Evaluation on Full Dataset:
RMSE: 0.18611768744985419
R2 Score: 0.9739853660957807
