<a href="https://colab.research.google.com/github/Shoaib-Siddiq/L/blob/main/Day_7_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# --- Part 1: Data Loading and Preprocessing ---
# Load the USA_Housing_toy.csv dataset.
# The file location is not tied to a single Colab / Google Drive layout: set the
# USA_HOUSING_CSV environment variable to read the CSV from another location.
# When the variable is unset, the original Drive path is used, so existing runs
# behave exactly as before.
DEFAULT_DATA_PATH = r'/content/drive/MyDrive/ML Projects/USA_Housing_toy.csv'
DATA_PATH = os.environ.get('USA_HOUSING_CSV', DEFAULT_DATA_PATH)
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick look at the loaded frame: the first rows followed by the column index.
# A single print call with a newline separator emits the same four lines of
# text as four consecutive print calls would.
print(
    "--- Dataset Info ---",
    df.head(),
    "\nDataset Columns:",
    df.columns,
    sep="\n",
)

--- Dataset Info ---
   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0             79545                    6                          7   
1             79249                    6                          7   
2             61287                    6                          9   
3             63345                    7                          6   
4             59982                    5                          8   

   Avg. Area Number of Bedrooms  Area Population         Price  \
0                          4.09            23087  1.059034e+06   
1                          3.09            40173  1.505891e+06   
2                          5.13            36882  1.058988e+06   
3                          3.26            34310  1.260617e+06   
4                          4.23            26354  6.309435e+05   

                                             Address  
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...  
1  188 Johnson Views Suite 079\nLake Kathleen

In [None]:
# Build the feature matrix (X) and the target vector (y).
# The five numeric columns listed below are the predictors of 'Price';
# the 'Address' column is text and is left out of the model inputs.
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = df[feature_columns]
y = df['Price']

In [None]:
# Hold out 30% of the rows as a test set; random_state is fixed at 42 so the
# same split is produced on every run.
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features; penalized models such as Ridge and Lasso are
# sensitive to feature scale. The scaler is fitted on the training rows only
# and then applied unchanged to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# --- Part 2: Hyperparameter Tuning with Grid Search for a RandomForestRegressor ---
# GridSearchCV is used in the following cells to look for the best-scoring
# combination of hyperparameters.
part2_banner = "\n--- Part 2: Hyperparameter Tuning with Grid Search for RandomForestRegressor ---"
print(part2_banner)


--- Part 2: Hyperparameter Tuning with Grid Search for RandomForestRegressor ---


In [None]:
# Base estimator handed to the grid search; random_state is pinned to 42.
rf_model = RandomForestRegressor(
    random_state=42,
)

In [None]:
# Candidate values for each hyperparameter; the grid search tries every
# combination (3 x 3 x 2 = 18 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
)

In [None]:
# Exhaustive search over param_grid, scored by R-squared with 3-fold
# cross-validation. n_jobs=-1 and verbose=1 are passed through unchanged.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the standardized training features and training targets.
# The call is left as the cell's final expression so the fitted object is
# still shown as the cell output.
grid_search.fit(X=X_train_scaled, y=y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Report the winning hyperparameter combination and its cross-validation score.
best_params = grid_search.best_params_
best_cv_r2 = grid_search.best_score_
print("\nBest parameters found:", best_params)
print(f"Best cross-validation R-squared score: {best_cv_r2:.4f}")


Best parameters found: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation R-squared score: 0.8399


In [None]:
# Evaluate the best estimator found by the grid search on the held-out test set.
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test_scaled)

# Compute both test metrics in one pass: R-squared first, then mean squared error.
r2_rf, mse_rf = (
    score_fn(y_test, y_pred_rf) for score_fn in (r2_score, mean_squared_error)
)

print(f"RandomForestRegressor (Best Model) R-squared on test set: {r2_rf:.4f}")
print(f"RandomForestRegressor (Best Model) MSE on test set: {mse_rf:.4f}")

RandomForestRegressor (Best Model) R-squared on test set: 0.8229
RandomForestRegressor (Best Model) MSE on test set: 21122237575.8023


In [None]:
# --- Part 3: Comparing Regression Models with Regularization ---
# A plain linear regression is compared with Ridge and Lasso regression to
# show what regularization changes.
# `results` collects one metrics entry per model; it starts out empty here.
results = dict()
part3_banner = "\n--- Part 3: Comparing Regression Models with Regularization ---"
print(part3_banner)


--- Part 3: Comparing Regression Models with Regularization ---


In [None]:
# Baseline model: ordinary linear regression with no penalty term.
# It is trained on the standardized features so that its coefficients can be
# compared fairly with those of the other models.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

# Store the test-set metrics under the model's display name.
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
results['Linear Regression'] = {'R-squared': lr_r2, 'MSE': lr_mse}

In [None]:
# Ridge regression: linear model with an L2 penalty.
# alpha sets the penalty strength; 100 was chosen to make the effect more
# pronounced.
ridge_model = Ridge(alpha=100).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Store the test-set metrics under the model's display name.
ridge_r2 = r2_score(y_test, y_pred_ridge)
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
results['Ridge Regression'] = {'R-squared': ridge_r2, 'MSE': ridge_mse}

In [None]:
# Lasso regression: linear model with an L1 penalty, which is able to drive
# coefficients to exactly zero.
# alpha sets the penalty strength; 100 was chosen to make the effect more
# pronounced.
lasso_model = Lasso(alpha=100).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Store the test-set metrics under the model's display name.
lasso_r2 = r2_score(y_test, y_pred_lasso)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
results['Lasso Regression'] = {'R-squared': lasso_r2, 'MSE': lasso_mse}

In [None]:
# Put the collected metrics side by side: one row per model, one column per metric.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
print("\n--- Performance Comparison of Regression Models ---")
print(results_df)


--- Performance Comparison of Regression Models ---
                   R-squared           MSE
Linear Regression   0.883531  1.389077e+10
Ridge Regression    0.880941  1.419971e+10
Lasso Regression    0.883533  1.389056e+10


In [None]:
# --- Part 4: Analyze the Impact of Regularization ---
# The coefficients of the three fitted models are compared in the cells below.
part4_banner = "\n--- Part 4: Analysis of Regularization Impact ---"
print(part4_banner)


--- Part 4: Analysis of Regularization Impact ---


In [None]:
# Column labels of the feature matrix, used as row labels in the coefficient table.
feature_names = list(X.columns)

In [None]:
# Coefficient table: one row per feature, one column per fitted model.
# The feature names are supplied directly as a named index.
coefficients_by_model = {
    'Linear_Coeff': lr_model.coef_,
    'Ridge_Coeff': ridge_model.coef_,
    'Lasso_Coeff': lasso_model.coef_,
}
coef_df = pd.DataFrame(
    coefficients_by_model,
    index=pd.Index(feature_names, name='Feature'),
)

In [None]:
# Print the coefficient table, then summarize what each penalty actually did.
# The remarks are computed from coef_df instead of being hard-coded, because
# fixed text can contradict the table (for example by stating that Lasso set
# coefficients to zero when none of them is zero).
print("\nCoefficient Comparison:")
print(coef_df)

n_features = len(coef_df)
linear_abs = coef_df['Linear_Coeff'].abs()
ridge_abs = coef_df['Ridge_Coeff'].abs()
lasso_abs = coef_df['Lasso_Coeff'].abs()

# Feature names grouped by how each penalized coefficient compares with the
# unpenalized Linear Regression coefficient of the same feature.
ridge_smaller = coef_df.index[(ridge_abs < linear_abs).to_numpy()].tolist()
ridge_larger = coef_df.index[(ridge_abs > linear_abs).to_numpy()].tolist()
lasso_smaller = coef_df.index[(lasso_abs < linear_abs).to_numpy()].tolist()
lasso_zeroed = coef_df.index[(coef_df['Lasso_Coeff'] == 0).to_numpy()].tolist()

# Euclidean length of each model's coefficient vector (one value per column).
coef_norms = (coef_df ** 2).sum() ** 0.5

print("\n- Linear Regression has no penalty; its coefficients are the baseline for this comparison.")

print(f"- Ridge Regression (L2 penalty) reduced the magnitude of {len(ridge_smaller)} of {n_features} coefficients.")
if ridge_larger:
    # Individual coefficients may grow under the L2 penalty even when the
    # coefficient vector as a whole gets shorter, so report them explicitly.
    print(f"  Larger in magnitude than the linear coefficient: {', '.join(map(str, ridge_larger))}.")
print(f"  Coefficient-vector L2 norm: {coef_norms['Ridge_Coeff']:,.2f} (Ridge) vs {coef_norms['Linear_Coeff']:,.2f} (Linear).")

if lasso_zeroed:
    print(f"- Lasso Regression (L1 penalty) reduced the magnitude of {len(lasso_smaller)} of {n_features} coefficients "
          f"and set these to exactly zero, effectively performing feature selection: {', '.join(map(str, lasso_zeroed))}.")
else:
    print(f"- Lasso Regression (L1 penalty) reduced the magnitude of {len(lasso_smaller)} of {n_features} coefficients "
          "but set none of them to exactly zero at this alpha, so no feature selection took place.")

print("Regularization can help to prevent overfitting and improve generalization to new, unseen data; "
      "whether it does so here depends on the penalty strength (alpha) and should be judged from the test-set metrics.")


Coefficient Comparison:
                               Linear_Coeff    Ridge_Coeff    Lasso_Coeff
Feature                                                                  
Avg. Area Income              234813.419448  222035.328842  234716.039675
Avg. Area House Age           161761.295607  152852.193004  161662.263061
Avg. Area Number of Rooms     107326.634577  100600.238991  107260.731033
Avg. Area Number of Bedrooms   12165.599235   14622.766831   12095.957781
Area Population               146935.152670  138928.378428  146830.501438

- Linear Regression coefficients are the largest, as there is no penalty.
- Ridge Regression has reduced the magnitude of all coefficients (L2 penalty).
- Lasso Regression has shrunk coefficients, and for some features, has set them to zero (L1 penalty), effectively performing feature selection.
This regularization helps to prevent overfitting and can improve the model's generalization to new, unseen data.
