<a href="https://colab.research.google.com/github/eshkrish2/forecasting-of-house-prices/blob/main/FORECASTING_OF_HOUSE_PRICES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset (You can replace this with your CSV)
def load_data(source=None):
    """Load the housing dataset into a DataFrame.

    Args:
        source: Anything ``pd.read_csv`` accepts (URL, local path, or an
            open file-like object). When omitted, the AmesHousing.csv URL
            below is used, so ``load_data()`` keeps its original behavior.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    if source is None:
        # Default dataset location; pass `source` to read your own CSV instead.
        source = "https://raw.githubusercontent.com/selva86/datasets/master/AmesHousing.csv"
    return pd.read_csv(source)

# Preprocessing
def preprocess_data(data):
    """Separate the target column and build an unfitted preprocessing step.

    Numeric columns are median-imputed and standardized; categorical columns
    are imputed with their most frequent value and one-hot encoded.

    Args:
        data: DataFrame holding a "SalePrice" column plus the feature columns.

    Returns:
        A ``(preprocessor, X, y)`` tuple: the unfitted ``ColumnTransformer``,
        the feature frame (``data`` without "SalePrice"), and the
        "SalePrice" series.

    Raises:
        KeyError: If ``data`` has no "SalePrice" column.
    """
    X = data.drop(columns="SalePrice")
    y = data["SalePrice"]

    # "number" matches every numeric width (int32, float32, ...), not only the
    # 64-bit dtypes; otherwise such columns would match neither selector and
    # be left out of the ColumnTransformer entirely.
    numeric_features = X.select_dtypes(include="number").columns
    # Treat pandas "category" columns as categorical alongside object columns.
    categorical_features = X.select_dtypes(include=["object", "category"]).columns

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Categories unseen during fit are ignored at transform time
        # instead of raising.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

    return preprocessor, X, y

# Train and evaluate models
def train_models(X_train, X_test, y_train, y_test, preprocessor):
    """Fit each candidate regressor behind the preprocessor and score it.

    Every model is wrapped in a two-step pipeline (preprocessor, regressor),
    fitted on the training split, and evaluated on the test split. One summary
    line per model is printed as it finishes.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        preprocessor: Transformer used as the first step of every pipeline.

    Returns:
        A list of ``(name, rmse, r2)`` tuples in the order the models ran.
    """
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Ridge", Ridge(alpha=1.0)),
        ("Lasso", Lasso(alpha=0.1)),
        ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
        (
            "XGBoost",
            XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        ),
    ]

    scores = []
    for label, estimator in candidates:
        fitted = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", estimator)])
        fitted.fit(X_train, y_train)
        predictions = fitted.predict(X_test)

        # RMSE is the square root of the mean squared error on the test split.
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, predictions)

        scores.append((label, rmse, r2))
        print(f"{label} -> RMSE: {rmse:.2f}, R2: {r2:.4f}")

    return scores

# Visualize model comparison
def plot_results(results):
    """Show a horizontal bar chart of RMSE per model, lowest RMSE first.

    Args:
        results: Iterable of ``(model_name, rmse, r2)`` rows.
    """
    ranked = pd.DataFrame(results, columns=["Model", "RMSE", "R2"]).sort_values("RMSE")

    plt.figure(figsize=(10, 5))
    # NOTE(review): newer seaborn releases may warn when `palette` is passed
    # without `hue` -- confirm against the installed version.
    sns.barplot(data=ranked, x="RMSE", y="Model", palette="viridis")
    plt.xlabel("Root Mean Squared Error")
    plt.title("Model Comparison - RMSE")
    plt.show()

# Main execution
def main():
    print("Loading data...")
    data = load_data()

    print("Preprocessing...")
    preprocessor, X, y = preprocess_data_
