In [36]:
# importing the libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (installed as a scikit-learn
# dependency) and fall back to the vendored copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [37]:
# Read the housing data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="house_data.csv")

# Preview the first five rows to sanity-check the columns
df.head(n=5)

Unnamed: 0,sq_feet,num_bedrooms,num_bathrooms,sale_price
0,785,2,2,170461
1,1477,2,2,271651
2,712,1,1,139912
3,3233,3,3,603246
4,1581,2,1,278603


In [38]:
# Create the X (features) and y (target) arrays
X = df[["sq_feet", "num_bedrooms", "num_bathrooms"]]
y = df["sale_price"]

# Split the data set in a training set (75%) and a test set (25%).
# train_test_split shuffles the rows before splitting; pinning random_state
# makes that shuffle deterministic, so the split (and every metric computed
# from it) is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [39]:
# Build a linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
# (GradientBoostingRegressor() can be swapped in here for more complex problems.)
model = LinearRegression().fit(X_train, y_train)

# Persist the fitted model to disk so it can be reloaded later for predictions
joblib.dump(model, 'house_value_model.pkl')

['house_value_model.pkl']

In [40]:
# Evaluate the fitted model on both splits.
# NOTE: despite the `mse_` prefix, both values are mean absolute errors
# (they come from mean_absolute_error), expressed in the units of sale_price.
print("Model training results:")

# Error on the rows the model was trained on
mse_train = mean_absolute_error(y_true=y_train, y_pred=model.predict(X_train))
print(f" - Training Set Error: {mse_train}")

# Error on the held-out rows the model did not see during training
mse_test = mean_absolute_error(y_true=y_test, y_pred=model.predict(X_test))
print(f" - Test Set Error: {mse_test}")

Model training results:
 - Training Set Error: 9157.711097431024
 - Test Set Error: 8908.839384796061


In [30]:
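# The results above mean that, in the run shown, the model's predictions are off by about $9,158 on average
# (mean absolute error) for houses in the training data and about $8,909 for houses in the held-out test data.
# The two errors are similar, so the model generalizes to unseen houses and is not overfitting.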