# Feature Engineering

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt


In [41]:
# Load the dataset and reduce it to clean, finite, numeric rows.

col_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV', 'BIAS_COL']

# The file is read without a header row; names are assigned explicitly.
raw_df = pd.read_csv('../data/transformed_correlated.csv', header=None)
raw_df.columns = col_names

# Coerce every cell to a number. Anything unparseable becomes NaN, and any
# row containing a NaN (in any column, BIAS_COL included) is discarded.
numeric_df = raw_df.apply(pd.to_numeric, errors='coerce').dropna()

# Drop the trailing column by position, then treat +/-inf as missing and
# discard the affected rows.
df = (
    numeric_df.iloc[:, :-1]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [42]:
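# # Create new features (disabled: the degree-2 PolynomialFeatures expansion below already produces RM*AGE, DIS*TAX and LSTAT^2)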
# df['RM_AGE'] = df['RM'] * df['AGE']
# df['DIS_TAX'] = df['DIS'] * df['TAX']
# df['LSTAT_SQ'] = df['LSTAT'] ** 2


In [43]:

# Degree-2 polynomial expansion of every predictor (target column excluded).
# With include_bias=False there is no constant column; the output holds the
# original terms plus all squares and pairwise products.
predictors = df.drop(columns='MEDV')

poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(predictors)
poly_feature_names = poly.get_feature_names_out(predictors.columns)

# Note: df_poly gets a fresh 0..n-1 index, not the index of `df`.
df_poly = pd.DataFrame(data=poly_features, columns=poly_feature_names)

In [44]:
# Combine the target with the polynomial features.
#
# `df_poly` already contains the original (degree-1) predictors under their
# original names, because PolynomialFeatures(degree=2, include_bias=False)
# emits them alongside the squares and interactions. Concatenating the whole
# of `df` would therefore duplicate every predictor column, so only the
# target column is taken from `df`.
# `reset_index(drop=True)` aligns the target with df_poly's fresh 0..n-1 index.
df_combined = pd.concat([df_poly, df[['MEDV']].reset_index(drop=True)], axis=1)

# Sanity checks: no duplicated column names and no rows lost or added.
assert df_combined.columns.is_unique, "duplicate column names in df_combined"
assert len(df_combined) == len(df), "row count changed while combining features"

In [45]:

# Separate the target (MEDV) from the predictors.
# NOTE(review): X is built from `df`, not `df_combined`, so the polynomial
# features created above are not seen by any model below - confirm intended.
y = df['MEDV']
X = df.drop(columns='MEDV')

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [46]:
# Standardize the predictors. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Without using GridSearchCV


In [47]:

# Baseline models with hand-picked regularisation strengths.
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)

# Fit each model on the scaled training split and predict the scaled test split.
lr_pred = lr.fit(X_train_scaled, y_train).predict(X_test_scaled)
ridge_pred = ridge.fit(X_train_scaled, y_train).predict(X_test_scaled)
lasso_pred = lasso.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score every model on the test split; row order is Linear, Ridge, Lasso.
baseline_preds = {'Linear': lr_pred, 'Ridge': ridge_pred, 'Lasso': lasso_pred}
mae = [mean_absolute_error(y_test, pred) for pred in baseline_preds.values()]
mse = [mean_squared_error(y_test, pred) for pred in baseline_preds.values()]
r2 = [r2_score(y_test, pred) for pred in baseline_preds.values()]

df_metrics = pd.DataFrame(
    {'MAE': mae, 'MSE': mse, 'R2': r2},
    index=list(baseline_preds),
)

In [48]:
# Test-set MAE / MSE / R2 for the fixed-alpha models (one row per model).
df_metrics

Unnamed: 0,MAE,MSE,R2
Linear,0.332631,0.165751,0.574287
Ridge,0.327392,0.160926,0.586681
Lasso,0.278374,0.137529,0.646774


## Using GridSearchCV

In [49]:
# Candidate regularisation strengths; the same grid is searched for both models.
param_grid_ridge = {'alpha': [0.1, 1, 10, 100]}
param_grid_lasso = {'alpha': [0.1, 1, 10, 100]}

# Fresh, unfitted estimators for the searches (these rebind `ridge` / `lasso`).
ridge = Ridge()
lasso = Lasso()

# 5-fold cross-validated search on the training split, scored by R2.
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=param_grid_ridge, cv=5, scoring='r2')
grid_search_lasso = GridSearchCV(estimator=lasso, param_grid=param_grid_lasso, cv=5, scoring='r2')
grid_search_ridge.fit(X_train_scaled, y_train)
grid_search_lasso.fit(X_train_scaled, y_train)

# Estimators carrying the best alpha found by each search.
best_ridge = grid_search_ridge.best_estimator_
best_lasso = grid_search_lasso.best_estimator_

# Test-split predictions. Linear regression has nothing to tune, so this
# reuses the `lr` model fitted in the earlier baseline cell.
lr_pred_grid = lr.predict(X_test_scaled)
ridge_pred_grid = best_ridge.predict(X_test_scaled)
lasso_pred_grid = best_lasso.predict(X_test_scaled)

# Score every model on the test split; row order is Linear, Ridge, Lasso.
grid_preds = {'Linear': lr_pred_grid, 'Ridge': ridge_pred_grid, 'Lasso': lasso_pred_grid}
mae_grid = [mean_absolute_error(y_test, pred) for pred in grid_preds.values()]
mse_grid = [mean_squared_error(y_test, pred) for pred in grid_preds.values()]
r2_grid = [r2_score(y_test, pred) for pred in grid_preds.values()]

# Collect the metrics in one table
df_metrics_grid = pd.DataFrame(
    {'MAE': mae_grid, 'MSE': mse_grid, 'R2': r2_grid},
    index=list(grid_preds),
)

In [50]:
# Test-set MAE / MSE / R2 after grid search (Linear row reuses the untuned model).
df_metrics_grid

Unnamed: 0,MAE,MSE,R2
Linear,0.332631,0.165751,0.574287
Ridge,0.294173,0.132516,0.659647
Lasso,0.278374,0.137529,0.646774
