In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- Load data ----------------------------------------------------------------
df = pd.read_csv("../data/housify-data.csv")

target = "price"

# Candidate predictors. Only the ones actually present in the CSV are kept, so
# a missing column shrinks the feature list instead of raising later on.
wanted_numeric = ["latitude", "longitude", "bedrooms", "bathrooms", "sqft"]
wanted_categorical = ["city"]
num_features = [col for col in wanted_numeric if col in df.columns]
cat_features = [col for col in wanted_categorical if col in df.columns]

# --- Basic dtype cleanup ------------------------------------------------------
# Numeric columns: values that cannot be parsed as numbers become NaN.
for col in num_features:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Categorical columns: force plain object dtype.
for col in cat_features:
    df[col] = df[col].astype("object")

# --- Target: keep only rows with a usable price -------------------------------
# Non-numeric prices are coerced to NaN; NaN and negative prices are dropped.
price = pd.to_numeric(df[target], errors="coerce")
mask = price.notna() & (price >= 0)

y = price.loc[mask]
X = df.loc[mask, num_features + cat_features]


# --- Preprocess: impute + encode ----------------------------------------------
# Numeric branch: fill missing values with the column median.
median_imputer = SimpleImputer(strategy="median")
numeric_transformer = Pipeline(steps=[("imputer", median_imputer)])

# Categorical branch: label missing values "Unknown", then one-hot encode.
# handle_unknown="ignore" lets transform() accept categories not seen in fit().
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; any other column is dropped.
column_routes = [
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
]
preprocess = ColumnTransformer(transformers=column_routes, remainder="drop")

# --- Model pipeline -----------------------------------------------------------
# Preprocessing and the regressor live in one estimator so they are fitted and
# persisted together.
model_steps = [
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
]
pipe = Pipeline(steps=model_steps)

# --- Train/test split --------------------------------------------------------
# Note: for regression there is no stratify; use random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Relabel test-set categories that never occur in the training split as
# "Unknown". The loop runs over cat_features rather than a hard-coded "city":
# that column is optional in the feature lists, and indexing it unconditionally
# raised KeyError whenever the CSV had no "city" column.
# NOTE(review): this relabelling happens outside `pipe`, so the persisted model
# does not repeat it at inference time — confirm that evaluating with it is
# intended.
X_test = X_test.copy()  # explicit copy so the split result is not mutated in place
known_categories = {}
for col in cat_features:
    known_categories[col] = X_train[col].unique().tolist()
    is_known = X_test[col].isin(known_categories[col])
    X_test[col] = X_test[col].where(is_known, "Unknown")

# Kept under its original name for any later cell that reads it.
known_cities = known_categories.get("city", [])

# --- Fit & evaluate ----------------------------------------------------------
pipe.fit(X_train, y_train)

# A linear model can predict negative prices; floor predictions at zero.
y_pred = np.clip(pipe.predict(X_test), 0, None)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("🔹 Linear Regression (Pipeline) Results")
print(f"MAE:  ${mae:,.0f}")
print(f"RMSE: ${rmse:,.0f}")
print(f"R²:   {r2:.3f}")

# --- Persist model -----------------------------------------------------------
# Create the target directory first so a fresh checkout does not fail with
# FileNotFoundError after the model has already been trained.
MODEL_PATH = "../backend/models/linear_regression_model.pkl"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, MODEL_PATH)


🔹 Linear Regression (Pipeline) Results
MAE:  $514,778
RMSE: $1,138,778
R²:   0.360


['../backend/models/linear_regression_model.pkl']

In [1]:
# Print the path of the Python interpreter running this kernel.
import sys

kernel_python = sys.executable
print(kernel_python)

c:\Users\chera\Documents\WEB DEV + ML\Housonify\venv\Scripts\python.exe
