In [86]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

In [164]:
# Read the training set from disk.
data = pd.read_csv("../data/training_data.csv")

# Separate the target column from the feature columns.
y_train = data["median_house_value"]
X_train = data.drop(columns="median_house_value")

# Optional feature selection (currently disabled):
# X_train = X_train.drop(columns=["population", "average_occupancy"])

# Standardize the features. The scaler object is kept in `scalar`;
# X_train is rebound to the transformed output, so it presumably no longer
# carries column labels after this point (verify against sklearn output config).
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

# Flatten the target to one dimension.
y_train = np.ravel(y_train)

<br>

### Linear Regression

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [32]:
def get_performance(data: np.ndarray | float) -> None:
    rmse = np.sqrt(-data)
    print("Mean:", "{:.2f}".format(rmse.mean()))
    print("Standard Deviation:", "{:.2f}".format(rmse.std()))

In [165]:
# Create an unfitted LinearRegression estimator (no arguments, so library defaults apply).
linear_regression = LinearRegression()

In [210]:
# Fit the linear model on the full (scaled) training set.
linear_regression.fit(X=X_train, y=y_train)

In [167]:
# In-sample RMSE of the linear model, rounded to two decimals.
# The predictions are made on the same rows the model was fitted on,
# so this figure is an optimistic estimate of the error.
predictions = linear_regression.predict(X=X_train)
mse = mean_squared_error(y_true=y_train, y_pred=predictions)
round(np.sqrt(mse), 2)

0.72

In [168]:
# 10-fold cross-validation of the linear model. The scorer returns negated
# MSE values, which get_performance converts to RMSE before summarizing.
scores = cross_val_score(
    estimator=linear_regression,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
get_performance(scores)

Mean: 0.72
Standard Deviation: 0.03


In [209]:
# Pair each fitted coefficient with the name of the feature it belongs to.
# Only the feature columns are used: `data.columns` also contains the target
# ("median_house_value"), which would shift or mislabel the coefficients
# unless the target happened to be the last column.
# NOTE: if the feature-selection step in the loading cell is re-enabled,
# the same columns must be dropped here as well.
columns = data.columns.drop("median_house_value")
# strict=True raises if the number of names and coefficients ever disagree,
# instead of silently truncating to the shorter sequence.
list(zip(columns, linear_regression.coef_, strict=True))

[('median_income', 0.8262950855149477),
 ('house_age', 0.11748786771499863),
 ('average_rooms', -0.2664715054872695),
 ('average_bedrooms', 0.31099820160093344),
 ('population', -0.00716698862950769),
 ('average_occupancy', -0.03931045918319836),
 ('latitude', -0.9011455266724576),
 ('longitude', -0.8699127110461763)]

<br>

### Decision Tree Regressor

In [162]:
from sklearn.tree import DecisionTreeRegressor

In [199]:
# Depth-limited regression tree: max_depth=7 caps how deep the tree may grow
# (TODO: record how this value was chosen).
# random_state is pinned because scikit-learn permutes the features at every
# split, so equally good splits can otherwise be resolved differently from run
# to run, making the scores and feature importances below non-reproducible.
decision_tree = DecisionTreeRegressor(max_depth=7, random_state=42)

In [200]:
# Fit the tree on the full (scaled) training set.
decision_tree.fit(X=X_train, y=y_train)

In [201]:
# In-sample RMSE of the decision tree, rounded to two decimals.
# Evaluated on the rows used for fitting, so it is an optimistic estimate.
# Note: this rebinds `predictions` and `mse` from the linear-model cell.
predictions = decision_tree.predict(X=X_train)
mse = mean_squared_error(y_true=y_train, y_pred=predictions)
round(np.sqrt(mse), 2)

0.62

In [202]:
# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, which get_performance converts to RMSE before summarizing.
scores = cross_val_score(
    estimator=decision_tree,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
get_performance(scores)

Mean: 0.66
Standard Deviation: 0.01


In [205]:
# Pair each feature importance with its feature name.
# The names are derived here from the feature columns only (target excluded)
# rather than from a `columns` variable set in another cell, so the labels
# cannot be shifted by the target column or by cell execution order.
# strict=True raises if the number of names and importances ever disagree.
list(
    zip(
        data.columns.drop("median_house_value"),
        decision_tree.feature_importances_,
        strict=True,
    )
)

[('median_income', 0.7002729689500183),
 ('house_age', 0.04926214742380715),
 ('average_rooms', 0.028682662710963803),
 ('average_bedrooms', 0.0020735653544800736),
 ('population', 0.008922129335855997),
 ('average_occupancy', 0.13576930634380138),
 ('latitude', 0.04450462769184503),
 ('longitude', 0.03051259218922814)]