In [9]:
import time

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
# Load the raw housing table; the location is a relative path, so it is
# resolved against the kernel's current working directory.
housing_csv_path = '../Data/housing.csv'
data = pd.read_csv(housing_csv_path)

### Extracting required data from the dataset

In [10]:
# One-hot encode the categorical column. The guard keeps this cell re-runnable:
# after the first run `ocean_proximity` is no longer a column of `data`, and
# asking get_dummies to encode it a second time would raise a KeyError.
if "ocean_proximity" in data.columns:
    data = pd.get_dummies(data, columns=["ocean_proximity"])

# Share of rooms that are bedrooms; NaN wherever `total_bedrooms` is missing.
data["bedrooms_by_totalrooms"] = data["total_bedrooms"] / data["total_rooms"]

# Refer to the EDA notebook for the reasoning behind choosing these features.
feature_columns = [
    "median_income",
    "ocean_proximity_<1H OCEAN",
    "ocean_proximity_INLAND",
    "bedrooms_by_totalrooms",
]
target_column = "median_house_value"

# A row without a target carries no supervised signal. Drop such rows instead
# of filling the target with its mean: an imputed label is a fabricated
# observation that would distort both training and the evaluation metrics.
labelled = data.dropna(subset=[target_column])
X = labelled[feature_columns]
y = labelled[target_column]

# Fill missing feature values with the column mean so X contains no NaN.
# NOTE(review): these means are computed over every row, including rows that
# will later land in the test split.
X = X.fillna(X.mean())

In [11]:
# X was mean-imputed before this point using statistics from every row, which
# lets information from the future test rows leak into the training inputs.
# `data` still holds the un-imputed feature values, so recover them, split
# those, and impute from the training rows only.
X_raw = data.loc[X.index, X.columns]
assert X_raw.shape == X.shape, "raw features must line up with X row-for-row"

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Imputation values come from the training split and are reused for the test split.
train_fill = X_train_raw.mean()
X_train = X_train_raw.fillna(train_fill)
X_test = X_test_raw.fillna(train_fill)

# Standardisation statistics are likewise estimated on the training split only.
means = X_train.mean()
stds = X_train.std()
# A column that is constant in the training split has std 0; dividing by it
# would yield NaN/inf, so leave such a column centred but unscaled.
stds = stds.replace(0, 1.0)

X_train_scaled = (X_train - means) / stds
X_test_scaled = (X_test - means) / stds
X_train_scaled.head()

Unnamed: 0,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,bedrooms_by_totalrooms
14196,-0.326186,-0.894656,-0.680554,-0.211778
8267,-0.035842,-0.894656,-0.680554,0.342175
17445,0.144697,-0.894656,-0.680554,-0.661638
14265,-1.017834,-0.894656,-0.680554,0.783008
2271,-0.171483,-0.894656,1.469303,-0.550347


### Linear Regression Model

In [12]:
# Linear regression fitted on the standardised training features.
# LinearRegression is imported together with the other dependencies in the
# notebook's import cell rather than here.
model = LinearRegression()

# Time only the fit call itself; the difference of two perf_counter readings
# is the elapsed time in seconds.
start = time.perf_counter()
model.fit(X_train_scaled, y_train)
fitting_time = time.perf_counter() - start

print(f"Time taken to fit is : {fitting_time}")

Time taken to fit is : 0.0023535919999631005


In [13]:
# Predict the target for the held-out rows; X_test_scaled was standardised
# with the means and standard deviations computed from X_train.
y_pred = model.predict(X_test_scaled)

In [14]:
# Report each evaluation metric on the held-out split, one line per metric,
# in the order RMSE, R², MAE.
for label, metric in (
    ("RMSE:", root_mean_squared_error),
    ("R² Score:", r2_score),
    ("MAE:", mean_absolute_error),
):
    print(label, metric(y_test, y_pred))

RMSE: 73811.24835311374
R² Score: 0.5842442722913426
MAE: 52786.49968426041
