In [43]:
import pandas as pd

# Read the raw training table from the repository-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/train.csv')

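Let's clean our data: split off the target, fill missing numeric values with each column's median, and one-hot encode the categorical columns.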

In [44]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Load the raw training table. The read is repeated in this cell so the cell
# can be re-run on its own.
df = pd.read_csv('data/train.csv')

# Separate features and target ('SalePrice' is assumed to be the target column).
X = df.drop(columns='SalePrice')
y = df['SalePrice']

# Column groups are derived from the features only, never from the target.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Impute numeric gaps with each column's median in one vectorized call.
# Passing a Series to fillna fills column-by-column (matched on column name),
# so the categorical columns are left untouched; fillna returns a new frame,
# leaving X unmodified.
X_clean = X.fillna(X[numeric_cols].median())

# Categorical gaps are deliberately left as NaN: OneHotEncoder
# (scikit-learn >= 0.24) encodes a missing value as its own category, so no
# explicit placeholder string is needed.
#
# NOTE(review): the medians above and the encoder categories below are learned
# from the full data set. Any cross-validation run on `final_df` therefore sees
# statistics from its held-out folds; wrap imputation + encoding in a Pipeline
# and cross-validate that instead if an unbiased estimate is required.
# NOTE(review): remainder='passthrough' keeps every non-categorical column as a
# feature, including any row-identifier column the CSV may carry - confirm that
# this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough',  # keep other (non-categorical) columns
)

transformed = preprocessor.fit_transform(X_clean)
feature_names = preprocessor.get_feature_names_out()

# Carry the original row index over so `final_df` stays aligned with `y` even
# if the input frame does not use the default RangeIndex.
final_df = pd.DataFrame(transformed, columns=feature_names, index=X_clean.index)  # type: ignore


Let's start with a baseline model: a plain linear regression evaluated with 5-fold cross-validation.

In [45]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares, scored with 5-fold cross-validation over
# the full training set.
lr_model = LinearRegression()
cv_scores = cross_val_score(
    estimator=lr_model,
    X=final_df,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Each fold score is a negated MSE; flip the sign and take the root for RMSE.
cv_rmse = np.sqrt(-cv_scores)

# Summary statistics of the per-fold RMSE, plus the target mean used for scaling.
cv_rmse_mean, cv_rmse_std = cv_rmse.mean(), cv_rmse.std()
mean_target = y.mean()

# Express the RMSE and its +/- two-standard-deviation band as a percentage of
# the mean target value.
percentage_error = (cv_rmse_mean / mean_target) * 100
percentage_error_std = (cv_rmse_std * 2 / mean_target) * 100

print(f"Model Performance (CV): ${cv_rmse_mean:,.2f} (+/- ${cv_rmse_std * 2:,.2f})")
print(f"Percentage Error (CV): {percentage_error:.2f}% (+/- {percentage_error_std:.2f}%)")
print(f"Target variable mean: ${mean_target:,.2f}")

Model Performance (CV): $35,433.85 (+/- $16,530.80)
Percentage Error (CV): 19.59% (+/- 9.14%)
Target variable mean: $180,921.20
