# In [2]:  (notebook cell prompt)
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Optional: XGBoost
# The flag records whether the import succeeded; any failure while loading
# the package simply leaves it switched off.
try:
    from xgboost import XGBRegressor
except Exception:
    xgboost_available = False
else:
    xgboost_available = True

sns.set(style="whitegrid")

# ---------------------------
# 1) Locate CSV (robust)
# ---------------------------
# Expects a local 'data/' directory holding at least one *.csv file and stops
# the run with an explanatory message otherwise.
data_dir = "data"
if not os.path.isdir(data_dir):
    raise SystemExit("Put dataset CSV into 'data/' or run kaggle download as described in README.")

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# "the first one" the same file on every run and on every machine.
csv_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
if not csv_files:
    raise SystemExit("No CSV files found in data/. Make sure dataset is downloaded and unzipped.")
print("Found CSV(s):", csv_files)

# If multiple CSVs, pick the first one (or change to exact filename)
df = pd.read_csv(csv_files[0])
print("Data shape:", df.shape)
display(df.head())

# ---------------------------
# 2) Quick EDA
# ---------------------------
# Three read-only summaries of the loaded frame: dtypes, the 20 columns with
# the most missing values, and describe() statistics for every column.
print("\n--- Columns and dtypes ---")
print(df.dtypes)

missing_per_column = df.isnull().sum()
print("\n--- Missing value counts ---")
print(missing_per_column.sort_values(ascending=False).head(20))

print("\n--- Basic stats ---")
summary_stats = df.describe(include='all')
display(summary_stats.T)

# Identify target: try to detect common names
possible_targets = ["price", "Price", "SalePrice", "sale_price", "house_price", "House Price"]

# First pass: exact match against the known names, in the order listed above.
target_col = next((name for name in possible_targets if name in df.columns), None)

# Second pass: the first column whose name contains 'price', ignoring case.
if target_col is None:
    target_col = next((col for col in df.columns if 'price' in col.lower()), None)

# Nothing matched: print the numeric columns as a hint and stop the run.
if target_col is None:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print("Numeric columns:", numeric_cols)
    raise SystemExit("Cannot automatically find target/price column. Please set `target_col` manually.")

print("Target column detected:", target_col)

# ---------------------------
# 3) Basic cleaning / preprocessing
# ---------------------------
# Keep only rows that actually have a target value, renumbering the index.
has_target = df[target_col].notna()
df = df.loc[has_target].reset_index(drop=True)

# Target as float; every remaining column is a candidate feature.
y = df[target_col].astype(float)
X = df.drop(columns=[target_col])

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical features
# Column lists are derived from the training split; a column whose dtype falls
# in neither group is dropped by the ColumnTransformer (remainder="drop").
numeric_feats = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_feats = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
print("Numeric features:", numeric_feats)
print("Categorical features:", categorical_feats)

# Preprocessing pipelines
# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen during fitting.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder)
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_feats),
    ("cat", categorical_transformer, categorical_feats)
], remainder="drop")  # drop any other columns not listed

# ---------------------------
# 4) Models to try
# ---------------------------
# Settings shared by the ensemble regressors: 200 estimators, fixed seed,
# all available cores.
ensemble_params = dict(n_estimators=200, random_state=42, n_jobs=-1)

models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(**ensemble_params),
)
# XGBoost joins the line-up only when its import succeeded.
if xgboost_available:
    models.update(XGBoost=XGBRegressor(verbosity=0, **ensemble_params))

# Filled in per model name by the training loop.
results = dict()

# ---------------------------
# 5) Train, cross-validate, evaluate
# ---------------------------
# One splitter for all models: with a fixed integer seed it yields the same
# five shuffled folds each time it is used.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n=== Model: {name} ===")
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # The scorer reports negated MSE; flip the sign and take the square root
    # to get one RMSE per fold.
    neg_mse_scores = cross_val_score(
        pipe, X_train, y_train,
        scoring="neg_mean_squared_error", cv=cv, n_jobs=-1,
    )
    rmse_scores = np.sqrt(-neg_mse_scores)
    cv_rmse_mean = rmse_scores.mean()
    cv_rmse_std = rmse_scores.std()
    print(f"CV RMSE: mean={cv_rmse_mean:.3f}, std={cv_rmse_std:.3f}")

    # Refit on the whole training split, then score once on the held-out split.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    for label, value in (("MAE", mae), ("RMSE", rmse), ("R2", r2)):
        print(f"Test {label}: {value:.3f}")

    # Keep the fitted pipeline, its metrics and its test predictions together.
    results[name] = dict(
        pipeline=pipe,
        cv_rmse_mean=cv_rmse_mean,
        cv_rmse_std=cv_rmse_std,
        test_mae=mae,
        test_rmse=rmse,
        test_r2=r2,
        y_pred=y_pred,
    )

# ---------------------------
# 6) Visualizations for best model (choose by CV RMSE)
# ---------------------------
# Rank models on cross-validated RMSE, which is computed from the training
# split only. Ranking on test RMSE would use the held-out data for model
# selection and make the test metrics reported for the winner optimistic.
best_name = min(results, key=lambda k: results[k]["cv_rmse_mean"])
print("\nBest model by CV RMSE:", best_name)
print(f"Held-out test RMSE of selected model: {results[best_name]['test_rmse']:.3f}")
best_pipe = results[best_name]["pipeline"]
y_pred_best = results[best_name]["y_pred"]

# Predicted vs Actual
# Scatter of test targets against predictions, with a dashed red identity
# line spanning the observed target range.
scatter_fig, scatter_ax = plt.subplots(figsize=(7, 7))
scatter_ax.scatter(y_test, y_pred_best, alpha=0.6)
actual_lo, actual_hi = y_test.min(), y_test.max()
scatter_ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--')
scatter_ax.set_xlabel("Actual")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title(f"{best_name}: Predicted vs Actual")
scatter_fig.tight_layout()
plt.show()

# Residuals histogram
# Histogram (with KDE overlay) of actual minus predicted on the test split.
residuals = y_test - y_pred_best
resid_fig, resid_ax = plt.subplots(figsize=(7, 4))
sns.histplot(residuals, kde=True, ax=resid_ax)
resid_ax.set_title("Residuals distribution")
resid_ax.set_xlabel("Residual (Actual - Predicted)")
resid_fig.tight_layout()
plt.show()

# Feature importances (if tree-based)
# Plots the 20 largest importances of the selected model, labelled with the
# post-preprocessing feature names when those can be recovered.
fitted_model = best_pipe.named_steps["model"]
if hasattr(fitted_model, "feature_importances_"):
    importances = fitted_model.feature_importances_

    # Numeric columns keep their names; one-hot encoding expands each
    # categorical column into one name per category.
    num_names = list(numeric_feats)
    cat_names = []
    if categorical_feats:
        # Only look up the encoder when there were categorical columns to fit it on.
        ohe = best_pipe.named_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]
        try:
            cat_names = ohe.get_feature_names_out(categorical_feats).tolist()
        except Exception as exc:
            # Best effort: report the problem and let the length check below
            # fall back to generic labels instead of failing silently.
            print("Could not recover one-hot feature names:", exc)
    all_feature_names = num_names + cat_names

    # The name list and the importance vector must line up for the DataFrame
    # below; if they do not (names unavailable, or a column removed during
    # preprocessing), label features positionally instead of crashing.
    if len(all_feature_names) != len(importances):
        print("Feature names do not match the model's feature count; using generic names.")
        all_feature_names = [f"feature_{i}" for i in range(len(importances))]

    fi = pd.DataFrame({"feature": all_feature_names, "importance": importances})
    fi = fi.sort_values("importance", ascending=False).head(20)

    plt.figure(figsize=(8,6))
    sns.barplot(data=fi, x="importance", y="feature")
    plt.title(f"{best_name} feature importances (top 20)")
    plt.tight_layout()
    plt.show()

# Save the best pipeline
# Writes the fitted pipeline held in best_pipe to a joblib file in the
# working directory.
model_path = "best_model_pipeline.joblib"
joblib.dump(best_pipe, model_path)
print(f"Saved best pipeline to {model_path}")

# --- Captured notebook output from the run above (not code) ---
# SystemExit: Put dataset CSV into 'data/' or run kaggle download as described in README.
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
