### How to train and test simple regression models

1. import libraries and modules
2. import datasets
3. pre-process dataset (cleaning missing data, encoding, train/test splitting)
4. train models
5. test models

In [2]:
import pandas as pd

In [4]:
house = pd.read_csv('HousePriceExample.csv')
house

Unnamed: 0,Size,Rooms,Price,Unnamed: 3
0,100,2,137.556462,
1,110,2,147.266947,
2,210,3,277.823882,
3,150,2,201.293308,
4,170,3,228.43209,
5,101,2,136.208504,
6,120,2,160.248469,
7,160,3,216.177111,
8,180,3,241.515656,
9,250,3,331.809213,


In [6]:
house.drop('Unnamed: 3', axis=1, inplace=True)
house

Unnamed: 0,Size,Rooms,Price
0,100,2,137.556462
1,110,2,147.266947
2,210,3,277.823882
3,150,2,201.293308
4,170,3,228.43209
5,101,2,136.208504
6,120,2,160.248469
7,160,3,216.177111
8,180,3,241.515656
9,250,3,331.809213


In [7]:
house.describe()

Unnamed: 0,Size,Rooms,Price
count,20.0,20.0,20.0
mean,159.25,2.35,213.261857
std,42.077466,0.587143,55.163494
min,95.0,1.0,128.432274
25%,129.75,2.0,175.073851
50%,160.0,2.0,213.957543
75%,185.0,3.0,247.285431
max,250.0,3.0,331.809213


### Import models and model selection module

In [8]:
# import 3 models to compare
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor #multi layer perceptron / deep learning
from sklearn.ensemble import RandomForestRegressor

# import training and testing module
from sklearn.model_selection import train_test_split

### Instantiate models

In [9]:
modelLR = LinearRegression() # default hyperparameters
modelNN = MLPRegressor()
modelRF = RandomForestRegressor()

### Train test split

In [12]:
# first need to split dataset into input (X) and output (y)
X = house.iloc[:, 0:2]
y = house.iloc[:, 2]

# then split the dataset into training and testing
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y, test_size=0.2, random_state=1)

### Train models

In [13]:
modelLR.fit(Xtrain, Ytrain)
modelNN.fit(Xtrain, Ytrain)
modelRF.fit(Xtrain, Ytrain)

In [14]:
# Rsquared-score
# measure the error in prediction and then compare models with each other based on their generalizability

print(modelLR.score(Xtest, Ytest))
print(modelNN.score(Xtest, Ytest))
print(modelRF.score(Xtest, Ytest))

0.9992331749057306
0.9982134534872121
0.9823012003912263


### Import sklearn metrics

In [15]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

LRpred = modelLR.predict(Xtest)
NNpred = modelLR.predict(Xtest)
RFpred = modelLR.predict(Xtest)

print(mean_squared_error(LRpred, Ytest))
print(mean_squared_error(NNpred, Ytest))
print(mean_squared_error(RFpred, Ytest))

1.0907409576754847
1.0907409576754847
1.0907409576754847


In [16]:
print(mean_absolute_error(LRpred, Ytest))
print(mean_absolute_error(NNpred, Ytest))
print(mean_absolute_error(RFpred, Ytest))

0.8757389110900746
0.8757389110900746
0.8757389110900746


In [17]:
print(r2_score(LRpred, Ytest))
print(r2_score(NNpred, Ytest))
print(r2_score(RFpred, Ytest))

0.9992238204864742
0.9992238204864742
0.9992238204864742
