# 🏠 Use Case: Regression – Predicting House Prices

## Import Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every draw below is reproducible. The pandas
# `.sample()` calls further down are given no random_state, so they draw from
# this same global RNG and are reproducible as well.
np.random.seed(42)
n_samples = 600

# Synthetic housing features. The order of the draws is part of the seeded
# sequence: reordering these columns would change the generated data.
data_reg = pd.DataFrame({
    "size_sqft": np.random.normal(1600, 500, n_samples),
    "num_rooms": np.random.randint(2, 8, n_samples),
    "age_years": np.random.randint(1, 50, n_samples),
    "location": np.random.choice(["urban", "suburban", "rural"], n_samples),
    "condition": np.random.choice(["poor", "average", "good"], n_samples)
})

# Target: price = base + linear feature effects + flat bonuses for an urban
# location and a good condition + Gaussian noise (std 20,000).
data_reg["price"] = (
    50000 + data_reg["size_sqft"]*120 + data_reg["num_rooms"]*5000 
    - data_reg["age_years"]*800 + np.where(data_reg["location"]=="urban", 30000, 0)
    + np.where(data_reg["condition"]=="good", 20000, 0)
    + np.random.normal(0, 20000, n_samples)
)

# Introduce missing values: blank out a random 10% of the rows in each column.
# This happens after the target is computed, so `price` is based on the true values.
for col in ["size_sqft", "num_rooms", "location", "condition"]:
    # An integer column cannot hold NaN. Cast it to float explicitly instead of
    # relying on silent upcasting during `.loc` assignment, which recent pandas
    # versions deprecate. The cast consumes no random numbers, so the data is unchanged.
    if pd.api.types.is_integer_dtype(data_reg[col]):
        data_reg[col] = data_reg[col].astype("float64")
    data_reg.loc[data_reg.sample(frac=0.1).index, col] = np.nan

# Feature matrix (everything except the target) and target vector.
X_reg = data_reg.drop("price", axis=1)
y_reg = data_reg["price"]


## Data Inspection

In [15]:
# General information about the dataset: per-column dtype and non-null count, plus memory usage
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   size_sqft  540 non-null    float64
 1   num_rooms  540 non-null    float64
 2   age_years  600 non-null    int64  
 3   location   540 non-null    object 
 4   condition  540 non-null    object 
 5   price      600 non-null    float64
dtypes: float64(3), int64(1), object(2)
memory usage: 28.3+ KB


In [14]:
# Count the missing entries in each column
data_reg.isna().sum()

size_sqft    60
num_rooms    60
age_years     0
location     60
condition    60
price         0
dtype: int64

## Step 2: Pipeline + Models

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# One seed shared by the split and every stochastic estimator, so results do not
# depend on the global RNG state or on how often a cell has been re-run.
RANDOM_STATE = 42

# Hold out 30% of the rows for the final test-set evaluation.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=RANDOM_STATE
)

# Numeric features: fill gaps with the median, then standardise.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Nominal feature (no natural order): fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
nominal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Ordinal feature: fill gaps with the mode, then map poor < average < good to 0, 1, 2.
ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(categories=[["poor", "average", "good"]])),
])

preprocessor_reg = ColumnTransformer([
    ("num", numeric_pipeline, ["size_sqft", "num_rooms", "age_years"]),
    ("nom", nominal_pipeline, ["location"]),
    ("ord", ordinal_pipeline, ["condition"]),
])

# Candidate regressors, all with default hyper-parameters. The tree-based models
# are randomised, so they receive an explicit seed.
models_reg = {
    "linreg": LinearRegression(),
    "ridge": Ridge(),
    "rf": RandomForestRegressor(random_state=RANDOM_STATE),
    "gb": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "svr": SVR(),
    "dt": DecisionTreeRegressor(random_state=RANDOM_STATE)
}


## Step 3: GridSearch + Evaluation

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate every candidate model: 5-fold cross-validated R² on the training set,
# then MSE and R² on the held-out test set.
results_reg = {}
for name, model in models_reg.items():
    # Bundle preprocessing with the estimator so the imputers, scaler and encoders
    # are re-fit on the training part of each CV fold.
    pipe = Pipeline([("preprocessor", preprocessor_reg), ("reg", model)])
    # An empty param_grid yields a single candidate (the default hyper-parameters),
    # so best_score_ is simply the mean cross-validated R² of that one pipeline.
    grid = GridSearchCV(pipe, param_grid={}, cv=5, scoring="r2")
    grid.fit(X_train_reg, y_train_reg)
    # predict() uses the best pipeline refit on the full training set.
    y_pred = grid.predict(X_test_reg)
    results_reg[name] = {
        # Cast the NumPy scalar to a plain float so all three metrics share one
        # type and print without the `np.float64(...)` wrapper.
        "best_score": float(grid.best_score_),
        "mse": mean_squared_error(y_test_reg, y_pred),
        "r2": r2_score(y_test_reg, y_pred)
    }

print(results_reg["rf"])  # Example: Random Forest results


{'best_score': np.float64(0.7685159453641782), 'mse': 911176455.8114637, 'r2': 0.7694278051057468}
