In [352]:
# import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import joblib




### Attention
#### If the code above hit an error, uncomment the line below and run it.
#### After it is run successfully, you can rerun the notebook from the beginning.

In [353]:
#!pip install pandas numpy matplotlib seaborn sklearn joblib

In [354]:
# Load data into memory.

df = pd.read_csv('House_price.csv')

# First five column
df.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
0,2003,3,2,8450,Urban,208500
1,1976,3,2,9600,SubUrban,181500
2,2001,3,2,11250,Rural,223500
3,1915,3,1,9550,Urban,140000
4,2000,4,2,14260,SubUrban,250000


In [355]:
# checking dataset dataype and number of records
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   HouseAge   1460 non-null   int64 
 1   Bedroom    1460 non-null   int64 
 2   FullBath   1460 non-null   int64 
 3   LotArea    1460 non-null   int64 
 4   Location   1460 non-null   object
 5   SalePrice  1460 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 68.6+ KB


In [356]:
# current year
current_year  = datetime.now().year

# Replace the house age in years to number of years instead.
 # you can also do 2024 - df['HouseAge] instead.
 
df['HouseAge'] = current_year - df['HouseAge']
df.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
0,21,3,2,8450,Urban,208500
1,48,3,2,9600,SubUrban,181500
2,23,3,2,11250,Rural,223500
3,109,3,1,9550,Urban,140000
4,24,4,2,14260,SubUrban,250000


# Data Splitting

In [357]:
# Splitting data into features and target.

X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [358]:
# Split dataset into train and test set.
# Train data will be used for model training while the test data will be used for evaluation.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preprocessing
Preprocessing is carried out to convert the categorical data to numbers and to scale the numerical data.

In [359]:
# numerical columns sp we can can scale them.
numerical_column = X_train.select_dtypes('number').columns
numerical_column

Index(['HouseAge', 'Bedroom', 'FullBath', 'LotArea'], dtype='object')

In [360]:
# Scale the numerical feartures using normalisation technique.

scaler =  StandardScaler()
X_train[numerical_column]= scaler.fit_transform(X_train[numerical_column])
X_train.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
1023,-1.107889,-1.112669,0.772872,-0.68395,SubUrban
810,-0.094543,0.128036,-1.062909,-0.054883,Urban
1384,1.049557,-1.112669,-1.062909,-0.152524,SubUrban
626,0.363097,0.128036,-1.062909,0.144198,SubUrban
813,0.428474,1.368742,-1.062909,-0.090142,SubUrban


In [361]:
# Convert the categorical variable(Location) to numbers using label encoder.

encoder = LabelEncoder()
X_train['Location'] = encoder.fit_transform(X_train['Location'])
X_train.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
1023,-1.107889,-1.112669,0.772872,-0.68395,1
810,-0.094543,0.128036,-1.062909,-0.054883,2
1384,1.049557,-1.112669,-1.062909,-0.152524,1
626,0.363097,0.128036,-1.062909,0.144198,1
813,0.428474,1.368742,-1.062909,-0.090142,1


#### Transform the test dataset with the encoder and scaler 
# Warning!
- Only run the next two cell once to get right result.
- if you run the second time, you might hit error. You'd have to rerun the notebook from the begining. 


In [362]:
X_test['Location'] = encoder.transform(X_test['Location'])
X_test.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
892,61,3,1,8414,2
1105,30,3,2,12256,1
413,97,2,1,8960,2
522,77,3,2,5000,2
1036,17,2,2,12898,2


In [363]:
X_test[numerical_column] = scaler.transform(X_test[numerical_column])
X_test.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
892,0.265031,0.128036,-1.062909,-0.210929,2
1105,-0.748315,0.128036,0.772872,0.136423,1
413,1.44182,-1.112669,-1.062909,-0.161565,2
522,0.788049,0.128036,0.772872,-0.519586,2
1036,-1.173266,-1.112669,0.772872,0.194466,2


# Training the Model


In [364]:
# Building a linear regression model.

lr = LinearRegression()
lr.fit(X_train, y_train)

In [365]:
# Building a decision tree model.

tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

In [366]:
# Building a random forest model.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Make Prediction

In [369]:
# Make prediction with th random forest model.

prediction = rf.predict(X_test)
prediction[:5]

array([132375.16, 231606.  ,  97189.  , 141419.24, 347007.01])

# Model Evaluation
The model is evaluated with: 
- Mean absolute Error
- Root mean square error
- r2 score

In [371]:

# Find the mean absolute error
mean_absolute_error(prediction, y_test)

32321.226516416613

In [374]:
# Root mean square errror.

root_mean_squared_error(prediction, y_test)

47085.24235847942

In [375]:
# This is the accuracy of the model. It is measured in percentage.

r2_score(y_test, prediction)

0.6835228779497738

In [376]:
# Save model for deployment
joblib.dump(rf, 'model.joblib')

['model.joblib']

In [377]:
# Save the label encoder and scaler
# These will be used to transform the data in deploymetnt phase
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']