In [12]:
# Load processed data from EDA (already imputed + one-hot encoded)
import pandas as pd
from pathlib import Path

# Directories holding the processed train / test CSV files.
TRAIN_DIR = Path("../data/train")
TEST_DIR  = Path("../data/test")


def _read_split(folder, features_file, labels_file):
    """Read one data split from `folder`.

    Returns a (features, labels) pair: features as read by `pd.read_csv`,
    labels squeezed along the column axis (a single-column file becomes a Series).
    """
    features = pd.read_csv(folder / features_file)
    labels = pd.read_csv(folder / labels_file).squeeze("columns")
    return features, labels


X_train, y_train = _read_split(TRAIN_DIR, "train_features_processed.csv", "train_labels.csv")
X_test, y_test = _read_split(TEST_DIR, "test_features_processed.csv", "test_labels.csv")

# Quick sanity check on what was loaded.
print("X_train:", X_train.shape, "X_test:", X_test.shape)


X_train: (16512, 12) X_test: (4128, 12)


In [13]:
import numpy as np
import pandas as pd

# 1) Show shapes quickly
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("y_train:", y_train.shape, "y_test:", y_test.shape)

# 2) Force all columns numeric (coerce non-numeric to NaN).
#    This must run BEFORE the inf replacement: text tokens such as "inf" are
#    parsed to +/-inf by pd.to_numeric, so replacing first would let them through.
X_train = X_train.apply(pd.to_numeric, errors="coerce")
X_test  = X_test.apply(pd.to_numeric,  errors="coerce")

# 3) Replace inf/-inf with NaN (can appear from divisions or parsed text)
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test  = X_test.replace([np.inf, -np.inf], np.nan)

# 4) Fill any remaining NaNs in FEATURES using TRAIN medians only,
#    so no statistic computed on the test split leaks into preprocessing.
#    Note: a column that is entirely NaN in train has a NaN median and stays
#    unfilled; the "NaNs left" line printed below would reveal that.
med = X_train.median(numeric_only=True)
X_train = X_train.fillna(med)
X_test  = X_test.fillna(med)  # use train medians

# 5) Ensure labels are numeric and valid
y_train = pd.to_numeric(y_train, errors="coerce")
y_test  = pd.to_numeric(y_test,  errors="coerce")

# Drop rows whose label is NaN or infinite (should be none, but be safe).
# The same rule is applied to BOTH splits: an invalid test label would
# otherwise survive until evaluation and make the metric computation fail.
good = y_train.notna() & np.isfinite(y_train.to_numpy())
good_test = y_test.notna() & np.isfinite(y_test.to_numpy())

n_bad_train = int((~good).sum())
n_bad_test = int((~good_test).sum())
if n_bad_train or n_bad_test:
    # Never drop rows silently; report how many were removed per split.
    print("Dropped rows with invalid labels → train:", n_bad_train, "test:", n_bad_test)

X_train = X_train.loc[good].reset_index(drop=True)
y_train = y_train.loc[good].reset_index(drop=True)
X_test  = X_test.loc[good_test].reset_index(drop=True)
y_test  = y_test.loc[good_test].reset_index(drop=True)

print("After cleaning → X_train:", X_train.shape, "y_train:", y_train.shape)
print("NaNs left (features):", int(X_train.isna().sum().sum()), int(X_test.isna().sum().sum()))


X_train: (16512, 12) X_test: (4128, 12)
y_train: (16512,) y_test: (4128,)
After cleaning → X_train: (16512, 12) y_train: (16512,)
NaNs left (features): 0 0


In [14]:
# Missing-value count per feature column, largest first (top 20 shown).
nan_counts = X_train.isna().sum()
top_nan_counts = nan_counts.sort_values(ascending=False).head(20)

# One print call; sep="\n" reproduces the line layout of four separate prints.
print(
    "Per-column NaNs (train):",
    top_nan_counts,
    "\nDtypes:",
    X_train.dtypes,
    sep="\n",
)


Per-column NaNs (train):
longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64

Dtypes:
longitude                     float64
latitude                      float64
housing_median_age            float64
total_rooms                   float64
total_bedrooms                float64
population                    float64
households                    float64
median_income                 float64
ocean_proximity_INLAND           bool
ocean_proximity_ISLAND           bool
ocean_proximity_NEAR BAY         bool
ocean_proximity_NEAR OCEAN       bool
dtype: object


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import joblib
import pandas as pd

# Ensure labels are 1-D numeric arrays.
# Flatten with np.asarray(...).ravel() BEFORE coercing so this works for a
# Series, a single-column DataFrame (pd.to_numeric rejects DataFrames), and an
# ndarray. The ndarray case makes the cell safe to re-run: after the first run
# y_train / y_test are already arrays, which have no `.to_numpy()` method.
y_train = np.asarray(pd.to_numeric(np.asarray(y_train).ravel(), errors="coerce"))
y_test  = np.asarray(pd.to_numeric(np.asarray(y_test).ravel(),  errors="coerce"))

# Standardize the features, then fit ordinary least squares.
# Keeping both steps in one Pipeline means the scaler is fitted on the
# training data only and is re-applied automatically at predict time.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

# RMSE is computed as sqrt(MSE) rather than via the `squared=` keyword of
# mean_squared_error, so the cell does not depend on that keyword being available.
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2   = r2_score(y_test, pred)
print("LR  Test RMSE:", rmse)
print("LR  Test R²  :", r2)

# Save the fitted pipeline and the per-row test predictions.
# Both files are written to the current working directory.
joblib.dump(pipe, "linear_regression.joblib")
pd.DataFrame({"y_true": y_test, "y_pred": pred}).to_csv("linear_regression_predictions.csv", index=False)


LR  Test RMSE: 71002.83776920449
LR  Test R²  : 0.6233813507638213
