In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_csv('housing_cleaned.csv')
df.columns

Index(['Unnamed: 0', 'longitude', 'latitude', 'housing_median_age',
       'total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income', 'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
# We don't want latitude and longitude
df_model = df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income',
 'median_house_value', 'ocean_proximity']]

In [5]:
dummy = pd.get_dummies(df_model)

In [15]:
# We have to split our data into training data and test data
from sklearn.model_selection import train_test_split

X = dummy.drop('median_house_value', axis=1)
y = dummy.median_house_value.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, random_state=1)

In [8]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 3.4 MB/s eta 0:00:01
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 36.5 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.0
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


In [16]:
import statsmodels.api as sm

X_sm = X = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.634
Model:,OLS,Adj. R-squared:,0.634
Method:,Least Squares,F-statistic:,3570.0
Date:,"Sun, 06 Sep 2020",Prob (F-statistic):,0.0
Time:,17:22:06,Log-Likelihood:,-259500.0
No. Observations:,20640,AIC:,519000.0
Df Residuals:,20629,BIC:,519100.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.44e+04,5555.690,7.993,0.000,3.35e+04,5.53e+04
housing_median_age,1170.4825,44.193,26.486,0.000,1083.861,1257.104
total_rooms,-6.2773,0.779,-8.056,0.000,-7.805,-4.750
total_bedrooms,57.4326,5.985,9.597,0.000,45.702,69.163
population,-38.1254,1.075,-35.459,0.000,-40.233,-36.018
households,99.7270,6.698,14.889,0.000,86.598,112.856
median_income,4.003e+04,333.738,119.942,0.000,3.94e+04,4.07e+04
ocean_proximity_<1H OCEAN,-1.643e+04,5279.552,-3.113,0.002,-2.68e+04,-6084.549
ocean_proximity_INLAND,-8.46e+04,5271.505,-16.048,0.000,-9.49e+04,-7.43e+04

0,1,2,3
Omnibus:,4990.754,Durbin-Watson:,0.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17791.441
Skew:,1.191,Prob(JB):,0.0
Kurtosis:,6.875,Cond. No.,1.71e+19


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

linreg = LinearRegression()
linreg.fit(X_train,y_train)

np.mean(cross_val_score(linreg, X_train, y_train, scoring = 'neg_mean_absolute_error', cv=3))

-50608.21113768435

In [21]:
from sklearn.linear_model import Lasso
lass = Lasso(alpha=1.0)
lass.fit(X_train, y_train)
np.mean(cross_val_score(lass, X_train, y_train, scoring = 'neg_mean_absolute_error', cv=3))

  positive)
  positive)
  positive)
  positive)


-50609.514676399966

In [33]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
np.mean(cross_val_score(rf, X_train, y_train, scoring = 'neg_mean_absolute_error', cv=3))

-43575.25937317583

#### We see that the mean error has reduced considerably using Random forest

In [35]:
# predictions
pred_linreg = linreg.predict(X_test)
pred_lass = lass.predict(X_test)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

In [37]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, pred_linreg))
print(mean_absolute_error(y_test, pred_lass))
print(mean_absolute_error(y_test, pred_rf))

67323.631805632
67324.43688826
16902.45000000001


#### Random forest works the best for this dataset. Its mean absolute error is almost 6 times lower than the other two models, so it gives the best predictions.

In [39]:
# Best prediction
print("Best predicted house price is: $" + str(pred_rf[0]))

Best predicted house price is: $322269.1
