In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# loading/reading the .csv file into the notebook
data = pd.read_csv("housing.csv")

In [None]:
data

In [None]:
data.info()

In [None]:
# makes sure that there are no non-null values missing and saves it to data (making sure all the non-null values are equal)
data.dropna(inplace = True)

In [None]:
data.info()


In [None]:
data

In [None]:
# spliting the data into a testing group and an evalutaion group so that we can use one set of data to train the model and the other to see how well it has performed
from sklearn.model_selection import train_test_split

# before splitting the data, we have to define x and y, because 'train_test_split' takes x and y and turns it into x train, y train, x test, and y test.\

# X is the all the data without the target variable ('median_house_value'), and Y only being the 'median_house_value'
X = data.drop(['median_house_value'], axis = 1)
y = data['median_house_value']

In [None]:
# splitting the data (20% of the data will be split for evaluation after we have trained and are confident that the model is ready to use)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
# defining train data
train_data = X_train.join(y_train)

In [None]:
train_data

In [None]:
# creating a graph about all the data
train_data.hist(figsize = (15, 8))

In [None]:
# creating a heatmap of the data by using '.corr' which makes everything have a correlation to one another
plt.figure(figsize=(15, 8))
sns.heatmap(train_data.corr(), annot=True, cmap="YlGnBu")

In [None]:
# gives us a cleaner graph by messing with log
train_data['total_rooms'] = np.log(train_data['total_rooms'] + 1)
train_data['total_bedrooms'] = np.log(train_data['total_bedrooms'] + 1)
train_data['population'] = np.log(train_data['population'] + 1)
train_data['households'] = np.log(train_data['households'] + 1)

In [None]:
# updated graph
train_data.hist(figsize=(15, 8))

In [None]:
# assigns certain values to 'ocean_proximity' so that we can use it in graphs (whilst also removing ocean proximity)
train_data = train_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'], axis = 1)

In [None]:
# creating another heatmap including the 'ocean_proximity'
plt.figure(figsize=(15, 8))
sns.heatmap(train_data.corr(), annot=True, cmap="YlGnBu")

In [None]:
# creates a scatterplot that tells you the target variable ('median_house_value') based on the longitude and latitude coordinates.
plt.figure(figsize=(15, 8))
sns.scatterplot(x='latitude', y='longitude', data = train_data, hue = 'median_house_value', palette = 'coolwarm')

In [None]:
# using feature engineering to create new variables by combining other variables.
train_data['bedroom_ratio'] = train_data['total_bedrooms'] / train_data['total_rooms']
train_data['household_rooms'] = train_data['total_rooms'] / train_data['households']

In [168]:
# training the model by using linear regression and also scaling the model (the simplest way to train a model)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# splitting the data again into training and testing data because we have to add the new features that we added
X_train, y_train = train_data.drop(['median_house_value'], axis = 1), train_data['median_house_value']
X_train_s = scaler.fit_transform(X_train)

reg = LinearRegression()

reg.fit(X_train_s, y_train)

In [None]:
# doing everything we did with the train data except we are doing it to the test data (we are now confident with testing the model)
test_data = X_test.join(y_test)

test_data['total_rooms'] = np.log(test_data['total_rooms'] + 1)
test_data['total_bedrooms'] = np.log(test_data['total_bedrooms'] + 1)
test_data['population'] = np.log(test_data['population'] + 1)
test_data['households'] = np.log(test_data['households'] + 1)

test_data = test_data.join(pd.get_dummies(test_data.ocean_proximity)).drop(['ocean_proximity'], axis = 1)

test_data['bedroom_ratio'] = test_data['total_bedrooms'] / test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms'] / test_data['households']

In [161]:
X_test, y_test = test_data.drop(['median_house_value'], axis = 1), test_data['median_house_value']

In [169]:
X_test_s = scaler.transform(X_test)

In [170]:
reg.score(X_test_s, y_test)

-0.730743777413541

In [180]:
# another way to improve results
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()

forest.fit(X_train_s, y_train)

In [181]:
forest.score(X_test_s, y_test)

0.26645894591110764

In [184]:

from sklearn.model_selection import GridSearchCV

forest = RandomForestRegressor()

param_grid = {
    "n_estimators": [3, 10, 30],
    "max_features": [2, 4, 6, 8]
}

grid_search = GridSearchCV(forest, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)


grid_search.fit(X_train_s, y_train)

In [186]:
best_forest = grid_search.best_estimator_

In [187]:
best_forest.score(X_train_s, y_train)

0.9721227917932791