In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load csv containing housing prices into Pandas DataFrame
df = pd.read_csv("HousePrice.csv")

# # Get basic information about data
# print(df.info())

# Drop columns which have too many missing values (more than 50%) and Id column as well
missing_ratio = df.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > 0.5].index
cols_to_drop = list(cols_to_drop) + ["Id"]
df_model = df.drop(columns=cols_to_drop)

# Group numeric and categorical features separately for preprocessing
numeric_cols = df_model.select_dtypes(include=["int64", "float64"]).columns
numeric_cols = numeric_cols.drop("SalePrice") # exclude the target column
categorical_cols = df_model.select_dtypes(include=["object"]).columns

# Decide the train (70%) / test (30%) row split BEFORE fitting any preprocessing step.
# Every imputer / encoder / scaler below learns its statistics from the training rows
# only and is then applied to all rows, so nothing about the test set leaks into the
# features the models are trained on.
# A fixed seed keeps the split (and every metric printed later) reproducible;
# set RANDOM_STATE = None to draw a different split on each run.
RANDOM_STATE = 42
train_rows, test_rows = train_test_split(
    np.arange(len(df_model)), test_size=0.3, random_state=RANDOM_STATE
)

# Impute the missing values with median strategy for numeric features
# (medians are computed on the training rows only)
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(df_model.iloc[train_rows][numeric_cols])
numeric_df = pd.DataFrame(
    num_imputer.transform(df_model[numeric_cols]),
    columns=numeric_cols, index=df_model.index
)

# Impute the missing values with most_frequent strategy for categorical features
# (modes are computed on the training rows only)
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(df_model.iloc[train_rows][categorical_cols])
categorical_df = pd.DataFrame(
    cat_imputer.transform(df_model[categorical_cols]),
    columns=categorical_cols, index=df_model.index
)

# Convert categorical features to dummy variables (drop_first=True to avoid multicollinearity).
# The set of dummy columns is defined by the categories seen in the training rows.
train_dummy_cols = pd.get_dummies(
    categorical_df.iloc[train_rows], dtype=int, drop_first=True
).columns
# Encode every row WITHOUT drop_first and then keep only the training columns:
# dropping "the first level" separately on another subset could drop a different
# level and silently mis-encode rows. A level that never occurs in the training
# rows ends up as all zeros, i.e. it is treated like the dropped baseline level.
dummies_categorical_df = pd.get_dummies(categorical_df, dtype=int).reindex(
    columns=train_dummy_cols, fill_value=0
)

# Prepare features (X) and target (y)
X = pd.concat([numeric_df, dummies_categorical_df], axis=1)
y = df_model["SalePrice"]

# Scale ALL features for Ridge/Lasso including dummy variables
# (means / standard deviations are estimated on the training rows only)
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

# Materialise the train (70%) and test (30%) sets from the row split chosen above
X_train, X_test = X_scaled[train_rows], X_scaled[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Ordinary least-squares baseline, trained on the training split only
linreg_model = linear_model.LinearRegression()
linreg_model.fit(X_train, y_train)

# In-sample and out-of-sample predictions
prediction_train = linreg_model.predict(X_train)
prediction_test = linreg_model.predict(X_test)

# (split label, true target, prediction) for each split, in reporting order
linreg_splits = (
    ("train", y_train, prediction_train),
    ("test", y_test, prediction_test),
)

# Report Mean Absolute Error for both splits
for split_label, y_true, y_hat in linreg_splits:
    print(f"Mean absolute error in {split_label} dataset with Linear Regression Model: {mean_absolute_error(y_true, y_hat):.2f}")

# Report R2 score for both splits; the first line carries a leading blank line
# to separate it visually from the MAE lines above
for lead, (split_label, y_true, y_hat) in zip(("\n", ""), linreg_splits):
    print(f"{lead}R2 score in {split_label} dataset with Linear Regression Model: {r2_score(y_true, y_hat):.2f}")

# Ridge Regression (L2): scan a log-spaced alpha grid in increasing order and
# stop at the first alpha whose test MAE falls below $20000
alphas = np.logspace(-2, 4, 20)

ridge_chosen = None
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, max_iter=20000, tol=1e-4).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Keep scanning until the target error is reached
    if not mae < 20000:
        continue

    ridge_chosen = (a, mae)
    print(f"\nRidge alpha = {a:.4f} (Test MAE={mae:,.0f})\n")
    break
else:
    # Loop finished without a break: no alpha in the grid met the target
    print("No Ridge alpha in this grid reached MAE < 20000. Try expanding the alpha range.")

# Lasso Regression (L1) to find alpha for MAE < $20000
# Scans the same alpha grid as the Ridge search, in increasing order, and keeps
# the first fitted model whose test MAE falls below the target.
lasso_chosen = None
lasso_model = None

for a in alphas:
    # selection="random" updates a randomly chosen coefficient on each
    # coordinate-descent step; random_state pins that order so the selected
    # alpha and the coefficients reported below are reproducible across runs.
    lasso = linear_model.Lasso(
        alpha=a, max_iter=200000, tol=1e-4, selection="random", random_state=42
    )
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    if mae < 20000:
        lasso_chosen = (a, mae)
        lasso_model = lasso
        print(f"Lasso alpha = {a:.4f} (Test MAE={mae:,.0f})\n")
        break

if lasso_chosen is None:
    print("No Lasso alpha in this grid reached MAE < 20000. Try expanding the alpha range.")
else:
    # Report 5 most significant continuous features
    # Continuous = original numeric columns (not one-hot encoded dummy variables)
    # Coefficients are paired with X.columns by position, so X must list the
    # features in the same order as the columns of X_train.
    feature_names = X.columns
    coef_df = pd.DataFrame({"feature": feature_names, "coef": lasso_model.coef_})
    coef_df["abs_coef"] = coef_df["coef"].abs()

    # Rank by coefficient magnitude, ignoring the sign
    top5_continuous = (
        coef_df[coef_df["feature"].isin(numeric_cols)]
        .sort_values("abs_coef", ascending=False)
        .head(5)
    )
    print("Top 5 most significant continuous (numeric) features by |coef| in Lasso:")
    print(top5_continuous[["feature", "coef"]].to_string(index=False))

Mean absolute error in train dataset with Linear Regression Model: 13468.74
Mean absolute error in test dataset with Linear Regression Model: 20988.68

R2 score in train dataset with Linear Regression Model: 0.93
R2 score in test dataset with Linear Regression Model: 0.39

Ridge alpha = 3.3598 (Test MAE=19,914)



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso alpha = 61.5848 (Test MAE=19,939)

Top 5 most significant continuous (numeric) features by |coef| in Lasso:
    feature         coef
  GrLivArea 24752.262811
 BsmtFinSF1 10595.256868
OverallQual 10531.710797
   2ndFlrSF 10098.762238
    LotArea  7210.901314
