In [7]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
# Load the housing dataset from the repository's Data directory (path is
# relative to this notebook's working directory).
data = pd.read_csv("../Data/housing.csv")

### Extracting required data from the dataset

In [8]:
# One-hot encode the categorical column. The guard keeps this cell safe to
# re-run: after the first execution `data` no longer contains
# "ocean_proximity", and encoding it a second time would raise a KeyError.
if "ocean_proximity" in data.columns:
    data = pd.get_dummies(data, columns=["ocean_proximity"])

# Engineered feature: share of rooms that are bedrooms.
data["bedrooms_by_totalrooms"] = data["total_bedrooms"] / data["total_rooms"]

# Remove all capped data (median_house_value of 500001 and above). Rows whose
# target is NaN also fail this comparison, so they are removed here as well.
data = data[data["median_house_value"] < 500001]

# Refer to the EDA notebook for the reasoning behind choosing these features.
feature_columns = [
    "median_income",
    "ocean_proximity_<1H OCEAN",
    "ocean_proximity_INLAND",
    "bedrooms_by_totalrooms",
]

# Drop rows with a missing value in any column the model relies on.
# Missing values are dropped rather than mean-imputed here because means
# computed on the full dataset (before the train/test split) would leak
# information from the test rows into training, and the regression target
# should never be imputed.
data = data.dropna(subset=["total_bedrooms"] + feature_columns)

X = data[feature_columns]
y = data["median_house_value"]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation parameters are taken from the training split only and then
# reused for the test split, so no test-set statistics reach the model.
means, stds = X_train.mean(), X_train.std()

X_train_scaled = X_train.sub(means, axis="columns").div(stds, axis="columns")
X_test_scaled = X_test.sub(means, axis="columns").div(stds, axis="columns")
X_train_scaled.head()

Unnamed: 0,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,bedrooms_by_totalrooms
5148,-1.606564,1.137598,-0.704397,0.958457
1378,-0.013762,-0.878989,-0.704397,-0.416681
10567,0.282519,1.137598,-0.704397,-0.13461
16253,-0.709042,-0.878989,1.419563,-0.341646
3920,0.319714,1.137598,-0.704397,-0.272353


### Linear Regression Model

In [10]:
from sklearn.linear_model import LinearRegression

# Linear regression with scikit-learn's default settings.
model = LinearRegression()

# Time only the call to fit(); estimator construction is outside the window.
start = time.perf_counter()
model.fit(X=X_train_scaled, y=y_train)
stop = time.perf_counter()
fitting_time = stop - start

print(f"Time taken to fit is : {fitting_time}")

Time taken to fit is : 0.006048391000149422


In [11]:
# Predict on the held-out rows, scaled with the training-split statistics.
y_pred = model.predict(X=X_test_scaled)

In [12]:
# Report each evaluation metric on the test split, one per line, in the order
# RMSE, R², MAE.
for metric_label, metric_fn in (
    ("RMSE:", root_mean_squared_error),
    ("R² Score:", r2_score),
    ("MAE:", mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

RMSE: 63837.68725044186
R² Score: 0.5792233480491736
MAE: 46669.767377054966
