In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Reference conversation: https://chatgpt.com/c/69855d33-8688-832d-afbc-f49214ecb1e1

# Load the CSV containing housing prices into a pandas DataFrame
df = pd.read_csv("HousePrice.csv")

# Get basic information about data
# print(df.info())

# Drop columns that have too many missing values (more than 40%) and the Id column
missing_ratio = df.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > 0.4].index
cols_to_drop = list(cols_to_drop) + ["Id"]
df_model = df.drop(columns=cols_to_drop)

target = "SalePrice"
# Group numeric and categorical features separately for preprocessing
numeric_cols = df_model.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df_model.select_dtypes(include=["object"]).columns
numeric_cols = numeric_cols.drop(target)  # exclude the target column

# Decide which rows belong to train (70%) and test (30%) BEFORE fitting any
# preprocessing step. Fitting the imputers/scaler on all rows would let test-set
# statistics (medians, modes, means, variances) leak into the training features
# and make the test metrics optimistic. random_state keeps the split reproducible.
train_rows, test_rows = train_test_split(df_model, test_size=0.3, random_state=42)
train_index, test_index = train_rows.index, test_rows.index

# Impute missing numeric values with the median learned from the training rows only,
# then apply that same median to every row
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(train_rows[numeric_cols])
numeric_df = pd.DataFrame(
    num_imputer.transform(df_model[numeric_cols]),
    columns=numeric_cols, index=df_model.index
)

# Impute missing categorical values with the most frequent training-row category
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(train_rows[categorical_cols])
categorical_df = pd.DataFrame(
    cat_imputer.transform(df_model[categorical_cols]),
    columns=categorical_cols, index=df_model.index
)

# Scale numeric feature columns using StandardScaler; the mean/std are learned from
# the training rows only (transform returns a NumPy array without column names)
scaler = StandardScaler()
scaler.fit(numeric_df.loc[train_index])
scaled_numeric = scaler.transform(numeric_df)  # returns NumPy array

# Convert scaled numeric data to a DataFrame and preserve column names and index
scaled_numeric_df = pd.DataFrame(scaled_numeric, columns=numeric_cols, index=df_model.index)

# Convert categorical features to dummy variables (drop first to avoid multicollinearity).
# Encoding all rows at once keeps the dummy columns identical for train and test.
dummies_categorical_df = pd.get_dummies(categorical_df, dtype=int, drop_first=True)

# Prepare features (X) and target (y)
X = pd.concat([scaled_numeric_df, dummies_categorical_df], axis=1)
y = df_model[target]

# Materialize the train/test sets from the row labels chosen above
# (label-based lookup; relies on the default unique index produced by read_csv)
X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so the call can be chained)
linreg_model = LinearRegression().fit(X_train, y_train)

# Generate predictions for both splits so train and test error can be compared
prediction_train = linreg_model.predict(X_train)
prediction_test = linreg_model.predict(X_test)

# Compute all metrics first, then report them
train_mae = mean_absolute_error(y_train, prediction_train)
test_mae = mean_absolute_error(y_test, prediction_test)
train_r2 = r2_score(y_train, prediction_train)
test_r2 = r2_score(y_test, prediction_test)

# Mean Absolute Error for train and test datasets
print(f"Mean absolute error in train dataset: {train_mae:.2f}")
print(f"Mean absolute error in test dataset: {test_mae:.2f}")

# R2 score for train and test datasets (leading newline separates the two groups)
print(f"\nR2 score in train dataset: {train_r2:.2f}")
print(f"R2 score in test dataset: {test_r2:.2f}")

from sklearn.linear_model import Ridge

# Regularization strengths to compare for Ridge regression
alphas = [0.1, 1, 10, 50, 100, 200, 500, 1000]

# Fit one Ridge model per alpha on the training split and report its test MAE.
# NOTE: these are test-set scores; picking alpha from them would tune the model
# on the test data, so treat the numbers as exploratory only.
for alpha in alphas:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_test_pred = ridge.predict(X_test)
    mae = mean_absolute_error(y_test, y_test_pred)
    print(f"Alpha={alpha:<6} Test MAE={mae:.2f}")


Mean absolute error in train dataset: 12987.64
Mean absolute error in test dataset: 19164.90

R2 score in train dataset: 0.93
R2 score in test dataset: 0.73
Alpha=0.1    Test MAE=19279.60
Alpha=1      Test MAE=19973.44
Alpha=10     Test MAE=19050.92
Alpha=50     Test MAE=18948.81
Alpha=100    Test MAE=19199.74
Alpha=200    Test MAE=19581.29
Alpha=500    Test MAE=20305.18
Alpha=1000   Test MAE=21168.57
