In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Read the house-price dataset from CSV into a DataFrame.
df = pd.read_csv("HousePrice.csv")

# Optional: uncomment for a summary of column dtypes and non-null counts.
# print(df.info())

# Remove the Id column together with every column in which more than half
# (> 50%) of the values are missing.
missing_ratio = df.isna().mean()  # per-column fraction of NaN entries
too_sparse = missing_ratio.gt(0.5)
cols_to_drop = [*missing_ratio.loc[too_sparse].index, "Id"]
df_model = df.drop(columns=cols_to_drop)

# Separate feature names by dtype so each group gets its own preprocessing.
# The target column (SalePrice) is removed from the numeric feature names.
categorical_cols = df_model.select_dtypes(include=["object"]).columns
numeric_cols = (
    df_model.select_dtypes(include=["int64", "float64"])
    .columns
    .drop("SalePrice")
)

def _impute_frame(frame, strategy):
    """Fit a SimpleImputer on ``frame`` and return ``(imputer, filled_frame)``.

    ``filled_frame`` is a DataFrame built from the imputer's output array and
    carries the same column labels and row index as ``frame``.
    """
    imputer = SimpleImputer(strategy=strategy)
    filled = pd.DataFrame(
        imputer.fit_transform(frame),
        columns=frame.columns, index=frame.index
    )
    return imputer, filled


# NOTE(review): both imputers are fit on every row of df_model, i.e. before the
# train/test split performed further down, so test rows contribute to the
# imputation statistics.

# Numeric features: fill missing values with the column median
num_imputer, numeric_df = _impute_frame(df_model[numeric_cols], "median")

# Categorical features: fill missing values with the most frequent category
cat_imputer, categorical_df = _impute_frame(df_model[categorical_cols], "most_frequent")

# Numeric features are left unscaled at this point; they are concatenated
# into X as-is.

# One-hot encode the categorical features as 0/1 integers; drop_first=True
# removes one level per feature to avoid multicollinearity
dummies_categorical_df = pd.get_dummies(categorical_df, dtype=int, drop_first=True)

# Assemble the feature matrix (X) and the target vector (y)
y = df_model["SalePrice"]
X = pd.concat([numeric_df, dummies_categorical_df], axis=1)

print("Rows, Features after preprocessing:", X.shape)
print("Rows (should be >=1300):", X.shape[0])

# Fixed seed so the train/test split, and therefore every metric printed
# below, is reproducible from run to run.
RANDOM_STATE = 42

# Split data into train (70%) and test (30%) sets BEFORE scaling, so the
# scaler never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Scale ALL features for Ridge/Lasso: the mean and standard deviation are
# learned from the training rows only and then reused for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so that any
# code still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Train the linear regression model on the train dataset
linreg_model = linear_model.LinearRegression()
linreg_model.fit(X_train, y_train)

# Predict on train and test datasets
prediction_train = linreg_model.predict(X_train)
prediction_test = linreg_model.predict(X_test)

# Evaluate and display model accuracy with Mean Absolute Error for both train and test datasets
print(f"Mean absolute error in train dataset: {mean_absolute_error(y_train, prediction_train):.2f}")
print(f"Mean absolute error in test dataset: {mean_absolute_error(y_test, prediction_test):.2f}")

# Evaluate and display model accuracy with R2 Score for both train and test datasets
print(f"\nR2 score in train dataset: {r2_score(y_train, prediction_train):.2f}")
print(f"R2 score in test dataset: {r2_score(y_test, prediction_test):.2f}")

# Regularization strengths tried for both Ridge and Lasso.
# NOTE(review): each alpha is scored on the test split, so picking the best
# alpha from these printouts tunes on test data; consider a separate
# validation split or cross-validation for model selection.
alphas = [0.1, 1, 10, 50, 100, 200, 500, 1000]

# Ridge (L2): fit one model per alpha on the train split, report test metrics
print("\nRidge")
for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha, max_iter=10000, tol=1e-4)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    print(f"Alpha={alpha:<6} Test MAE={mean_absolute_error(y_test, y_pred):.2f} Test R2={r2_score(y_test, y_pred):.4f}")

# Lasso (L1): same sweep. selection="random" updates a randomly chosen
# coefficient on each iteration, so random_state is fixed to make the fitted
# coefficients (and the printed metrics) repeatable.
print("\nLasso")
for alpha in alphas:
    lasso = linear_model.Lasso(
        alpha=alpha, max_iter=200000, tol=1e-4,
        selection="random", random_state=42,
    )
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    print(f"Alpha={alpha:<6} Test MAE={mean_absolute_error(y_test, y_pred):.2f} Test R2={r2_score(y_test, y_pred):.4f}")

Rows, Features after preprocessing: (1460, 233)
Rows (should be >=1300): 1460
Mean absolute error in train dataset: 12780.52
Mean absolute error in test dataset: 20079.94

R2 score in train dataset: 0.94
R2 score in test dataset: 0.73

Ridge
Alpha=0.1    Test MAE=20068.00 Test R2=0.7262
Alpha=1      Test MAE=19993.74 Test R2=0.7276
Alpha=10     Test MAE=19743.93 Test R2=0.7329
Alpha=50     Test MAE=19444.39 Test R2=0.7439
Alpha=100    Test MAE=19277.81 Test R2=0.7517
Alpha=200    Test MAE=19156.39 Test R2=0.7613
Alpha=500    Test MAE=19199.27 Test R2=0.7736
Alpha=1000   Test MAE=19652.27 Test R2=0.7774

Lasso
Alpha=0.1    Test MAE=20082.66 Test R2=0.7261
Alpha=1      Test MAE=20064.97 Test R2=0.7264
Alpha=10     Test MAE=19904.48 Test R2=0.7286
Alpha=50     Test MAE=19590.34 Test R2=0.7340
Alpha=100    Test MAE=19304.80 Test R2=0.7375
Alpha=200    Test MAE=18787.18 Test R2=0.7421
Alpha=500    Test MAE=18617.89 Test R2=0.7481
Alpha=1000   Test MAE=18913.23 Test R2=0.7505
