In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Load the already-cleaned modelling dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='model_dataset.csv')

In [3]:
# Remove the 'Unnamed: 0' column.
# NOTE(review): presumably a row index written out by an earlier to_csv call - confirm.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# Dummy variables creation.
# One indicator frame is built per categorical column; drop_first=True omits the
# first level of each column so that it acts as the baseline category.
#
# dtype is pinned to uint8 (the default before pandas 2.0). Since pandas 2.0
# get_dummies returns bool columns, and a frame that mixes bool and numeric
# columns can be cast to object dtype when handed to statsmodels, which makes
# sm.OLS raise a ValueError. Pinning the dtype keeps the notebook reproducible
# across pandas versions.
#
# NOTE(review): in the cells visible here only bath_room_dummies and
# housing_grade_dummies are later joined onto df - confirm the remaining frames
# are intentionally unused.
dummy_opts = dict(drop_first=True, dtype=np.uint8)
bed_room_dummies = pd.get_dummies(df['number_of_bedrooms'], prefix='bed_rm', **dummy_opts)
bath_room_dummies = pd.get_dummies(df['numberofbathrooms_per_house'], prefix='bath_rm', **dummy_opts)
floor_dummies = pd.get_dummies(df['floors'], prefix='flr', **dummy_opts)
house_condition_dummies = pd.get_dummies(df['house_condition'], prefix='cond', **dummy_opts)
housing_grade_dummies = pd.get_dummies(df['housing_grade'], prefix='grd', **dummy_opts)
yr_built_dummies = pd.get_dummies(df['yr_built'], prefix='yr_b', **dummy_opts)

In [5]:
# Attach the bathroom and grade indicator columns, then remove the six raw
# categorical columns from the frame.
# NOTE(review): bedrooms, floors, condition and yr_built are removed here without
# any indicator columns for them being attached - confirm that is intended.
encoded_frames = [df, bath_room_dummies, housing_grade_dummies]
raw_categoricals = ['numberofbathrooms_per_house', 'housing_grade', 'number_of_bedrooms',
                    'floors', 'house_condition', 'yr_built']
df = pd.concat(encoded_frames, axis=1).drop(columns=raw_categoricals)

In [6]:
# Separate target and predictors by position: the first column of df is the
# response, every remaining column is a predictor.
Y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [7]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
print("-----  Splitting the data in train and test ----")
split_parts = train_test_split(X, Y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

-----  Splitting the data in train and test ----


In [8]:
# Add the intercept ("const") column to both design matrices.
#
# has_constant='add' forces the column to be inserted unconditionally. With the
# default ('skip'), statsmodels returns a matrix unchanged when it already
# contains a constant-valued column, which can happen by chance in one split
# only. Train and test would then have different columns and the fitted model
# could not be applied to X_test.
X_train = sm.add_constant(X_train, has_constant='add')
X_test = sm.add_constant(X_test, has_constant='add')

In [9]:
# Fit ordinary least squares on the training split and keep the summary table
# so it can be printed later.
print("-----  Training the model ----")
ols_spec = sm.OLS(endog=y_train, exog=X_train)
model = ols_spec.fit()
print_model = model.summary()

-----  Training the model ----


In [10]:
# Predict on both splits and score each with the root mean squared error.
print("-----  Evaluating the model ----")
predictions = model.predict(X_train)
predictions_test = model.predict(X_test)
mse_train = mean_squared_error(y_train, predictions)
mse_test = mean_squared_error(y_test, predictions_test)
err_train, err_test = np.sqrt(mse_train), np.sqrt(mse_test)

-----  Evaluating the model ----


In [11]:
# Print the fitted-model summary, a separator, then the train and test RMSE,
# one item per line.
report_lines = (
    print_model,
    "-------------",
    f"RMSE on train data: {err_train}",
    f"RMSE on test data: {err_test}",
)
print(*report_lines, sep="\n")

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.661
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     707.3
Date:                Thu, 15 Oct 2020   Prob (F-statistic):               0.00
Time:                        09:55:51   Log-Likelihood:            -1.8942e+05
No. Observations:               14197   AIC:                         3.789e+05
Df Residuals:                   14157   BIC:                         3.792e+05
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                  -4.65