Implements multiple linear regression with Ridge (L2) regularization to predict the logarithm of home sale prices (LOG_SALEPRICE) using multiple features from the housing dataset (located in the data file).

In [None]:
# add libraries for model computation
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# load home dataset; the ID column is dropped so it is not used as a feature
df = pd.read_csv("https://raw.githubusercontent.com/IBM/ml-learning-path-assets/master/data/predict_home_value.csv")
df = df.drop(columns=['ID'])

# log-transform the target variable (SALEPRICE)
# NOTE: np.log yields -inf for 0 and NaN for negative inputs, so this
# assumes every SALEPRICE is strictly positive -- TODO confirm for this dataset
df['LOG_SALEPRICE'] = np.log(df['SALEPRICE'])

# prepare features and target: both the raw and the log price are removed
# from X so the target cannot leak into the features
X = df.drop(columns=['SALEPRICE', 'LOG_SALEPRICE'])
y = df['LOG_SALEPRICE']



# Split the feature columns by dtype so each group gets its own transformer.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing:
#   'num' -> StandardScaler: rescales each numerical feature to mean 0 and
#            standard deviation 1
#   'cat' -> OneHotEncoder: expands each categorical feature into binary
#            indicator columns; handle_unknown='ignore' keeps transform from
#            failing on categories that were not seen during fit
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])



# End-to-end model: the preprocessing step (scaling + one-hot encoding)
# feeds a Ridge regressor. Ridge applies an L2 penalty to the coefficients,
# which helps reduce overfitting; 'alpha' sets the strength of that penalty.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])


# Hold out 20% of the rows for testing (80% for training); random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training split, then predict LOG_SALEPRICE for the
# held-out rows. Pipeline.fit returns the fitted pipeline, so the two calls
# can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# print the model's performance metrics on the held-out test set
# NOTE: y_test and y_pred are log prices, so MAE and MSE are expressed on the
# log scale, not in currency units.
print("Multiple Linear Regression with Ridge")
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
# \u00b2 is the superscript-two character; it is written as an escape so the
# label cannot be corrupted again by a wrong file encoding (it previously
# appeared as mojibake).
print(f"R\u00b2 Score: {r2:.2f}")
