<h2>Multiple linear regression implemented with the normal equation method</h2>

@author: Debidutta Dash

In [1]:
import csv
import random
import math
import numpy as np

In [2]:
def loadDataset(filename):
    """Read a CSV file and split it into its header row and its data rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    dataset : list of list of str
        Every row after the first one. Fields are returned exactly as the
        ``csv`` module parsed them (strings; no numeric conversion here).
    headers : list of str
        The first row of the file, or an empty list when the file has no rows.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Pull the header off first; the default keeps an empty file from
        # raising and makes the function return ([], []) instead.
        headers = next(reader, [])
        dataset = list(reader)
    return dataset, headers

In [3]:
# Load the raw CSV: `headers` is the first row, `dataset` holds the remaining
# rows with every field still a string.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook elsewhere.
dataset, headers = loadDataset('E:/temp/Real estate.csv')

# Preview the column names, the first five rows and the overall dimensions.
print("HEADERS", headers, sep="\n")
print()
print("DATASET")
display(dataset[:5])
print()
print("Dataset Size")
print(len(dataset), "X", len(dataset[0]))


HEADERS
['No', 'X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude', 'Y house price of unit area']

DATASET


[['1', '2012.917', '32', '84.87882', '10', '24.98298', '121.54024', '37.9'],
 ['2', '2012.917', '19.5', '306.5947', '9', '24.98034', '121.53951', '42.2'],
 ['3', '2013.583', '13.3', '561.9845', '5', '24.98746', '121.54391', '47.3'],
 ['4', '2013.500', '13.3', '561.9845', '5', '24.98746', '121.54391', '54.8'],
 ['5', '2012.833', '5', '390.5684', '5', '24.97937', '121.54245', '43.1']]


Dataset Size
414 X 8


In [4]:
# Turn the list of string rows into a float matrix in one chained expression.
dataset = np.array(dataset).astype(float)

# Features: columns 2..6 (column 0 'No' and column 1 'X1 transaction date' are left out).
X = dataset[:, 2:-1]
# Label: the last column, i.e. 'price per unit area'.
Y = dataset[:, -1]



In [5]:
# Preview the feature matrix: first five rows, then its dimensions.
print("X head")
display(X[:5])
print("Size of X", X.shape, sep="\n")
print("\n\n")

# Same preview for the label vector.
print("Y head")
display(Y[:5])
print("Size of Y", Y.shape, sep="\n")

X head


array([[ 32.     ,  84.87882,  10.     ,  24.98298, 121.54024],
       [ 19.5    , 306.5947 ,   9.     ,  24.98034, 121.53951],
       [ 13.3    , 561.9845 ,   5.     ,  24.98746, 121.54391],
       [ 13.3    , 561.9845 ,   5.     ,  24.98746, 121.54391],
       [  5.     , 390.5684 ,   5.     ,  24.97937, 121.54245]])

Size of X
(414, 5)



Y head


array([37.9, 42.2, 47.3, 54.8, 43.1])

Size of Y
(414,)


In [6]:
# Prepend a column of ones so the first coefficient acts as the intercept.
# NOTE: this rebinds X, so re-running the cell adds yet another column of ones.
one = np.ones((X.shape[0], 1))
X = np.concatenate((one, X), axis=1)

In [7]:
# Make Y an (n_samples, 1) column vector so it lines up with the rows of X.
Y = np.reshape(np.array(Y), (len(Y), 1))

In [8]:
# Preview X again now that it carries the leading column of ones.
print("X head")
display(X[:5])
print("Size of X", X.shape, sep="\n")
print("\n\n")

# Preview Y again now that it is a column vector.
print("Y head")
display(Y[:5])
print("Size of Y", Y.shape, sep="\n")

X head


array([[  1.     ,  32.     ,  84.87882,  10.     ,  24.98298, 121.54024],
       [  1.     ,  19.5    , 306.5947 ,   9.     ,  24.98034, 121.53951],
       [  1.     ,  13.3    , 561.9845 ,   5.     ,  24.98746, 121.54391],
       [  1.     ,  13.3    , 561.9845 ,   5.     ,  24.98746, 121.54391],
       [  1.     ,   5.     , 390.5684 ,   5.     ,  24.97937, 121.54245]])

Size of X
(414, 6)



Y head


array([[37.9],
       [42.2],
       [47.3],
       [54.8],
       [43.1]])

Size of Y
(414, 1)


In [9]:
def train_test_split(X, Y, split, seed=None):
    """Randomly partition paired samples into a training set and a test set.

    Note the return order is ``X_train, Y_train, X_test, Y_test``, which is
    NOT the order used by scikit-learn's function of the same name.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Feature rows.
    Y : array-like, shape (n_samples,) or (n_samples, ...)
        Labels; row ``i`` of ``Y`` belongs to row ``i`` of ``X``.
    split : float
        Fraction of rows, between 0 and 1 inclusive, placed in the training
        set. The training size is ``round(split * n_samples)``.
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        partition is reproducible. When omitted, the module-level ``random``
        generator is used (so a prior ``random.seed(...)`` still applies).

    Returns
    -------
    X_train, Y_train, X_test, Y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` have different lengths or ``split`` is outside
        the range [0, 1].
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    n_samples = len(X)
    if len(Y) != n_samples:
        raise ValueError(
            "X and Y must have the same number of rows, got %d and %d"
            % (n_samples, len(Y))
        )
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))

    train_size = round(split * n_samples)

    # Shuffle plain Python ints, then convert once for numpy fancy indexing.
    indices = list(range(n_samples))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(indices)
    indices = np.array(indices, dtype=int)

    train_indices = indices[:train_size]
    test_indices = indices[train_size:]

    # Row-only indexing works for 1-D labels as well as 2-D arrays.
    X_train = X[train_indices]
    X_test = X[test_indices]
    Y_train = Y[train_indices]
    Y_test = Y[test_indices]

    return X_train, Y_train, X_test, Y_test

In [10]:
# Seed Python's `random` module, which train_test_split uses for shuffling,
# so that Restart & Run All reproduces the same partition and metrics.
SEED = 42
random.seed(SEED)

# Fraction of rows assigned to the training set; the remainder is the test set.
split = 0.7
X_train, Y_train, X_test, Y_test = train_test_split(X, Y, split)

In [11]:
# Report the dimensions of each partition produced by the split.
print("TRAINING SET")
for label, part in (("X_train.shape: ", X_train), ("Y_train.shape: ", Y_train)):
    print(label, part.shape)

print()
print("TESTING SET")
for label, part in (("X_test.shape: ", X_test), ("Y_test.shape: ", Y_test)):
    print(label, part.shape)

TRAINING SET
X_train.shape:  (290, 6)
Y_train.shape:  (290, 1)

TESTING SET
X_test.shape:  (124, 6)
Y_test.shape:  (124, 1)


In [12]:
def cost_function(X, Y):
    """Fit ordinary-least-squares coefficients for the linear model ``X @ beta ~ Y``.

    Despite its name, this function does not evaluate a cost: it returns the
    coefficients that minimise the squared error, i.e. the solution of the
    normal equations ``(X.T X) beta = X.T Y``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Design matrix (include a column of ones if an intercept is wanted).
    Y : numpy.ndarray, shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    beta : numpy.ndarray, shape (n_features,) or (n_features, n_targets)
        Least-squares coefficients.
    """
    # Solve the least-squares problem directly rather than inverting X.T X:
    # the explicit inverse is numerically fragile and breaks down when the
    # features are collinear, whereas lstsq then returns the minimum-norm
    # solution.
    beta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, Y, rcond=None)

    return beta

In [13]:
def predict(X_test, beta):
    """Return the linear-model output for every row of ``X_test``.

    Parameters
    ----------
    X_test : array-like
        Feature rows to score.
    beta : array-like
        Coefficients to apply to those rows.

    Returns
    -------
    The dot product of ``X_test`` and ``beta``.
    """
    predictions = np.dot(X_test, beta)
    return predictions

In [14]:
# Estimate the coefficients from the training rows, then apply them to the
# held-out test rows.
beta = cost_function(X_train, Y_train)
predictions = predict(X_test, beta)

# One prediction per test row is expected.
print(np.shape(predictions))

(124, 1)


In [15]:
def metrix(predictions, Y_test):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    predictions : array-like
        Predicted values, one per sample; shape (n,) or (n, 1).
    Y_test : array-like
        True values, one per sample; shape (n,) or (n, 1).

    Returns
    -------
    MAE : float
        Mean absolute error.
    RMSE : float
        Root mean square error.
    r_square : float
        Coefficient of determination, ``1 - RSS / SST``. NaN when ``Y_test``
        is constant, because SST is zero and the ratio is undefined.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Flatten both inputs so an (n, 1) column and an (n,) vector are compared
    # element by element instead of broadcasting into an (n, n) matrix.
    predicted = np.asarray(predictions, dtype=float).ravel()
    actual = np.asarray(Y_test, dtype=float).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            "predictions and Y_test must have the same number of values, "
            "got %d and %d" % (predicted.size, actual.size)
        )

    residuals = actual - predicted

    # calculating mean absolute error
    MAE = np.mean(np.abs(residuals))

    # calculating root mean square error
    RMSE = math.sqrt(np.mean(np.square(residuals)))

    # calculating r_square
    rss = np.sum(np.square(residuals))
    sst = np.sum(np.square(actual - np.mean(actual)))
    r_square = 1 - (rss / sst) if sst != 0 else float("nan")

    return MAE, RMSE, r_square

In [16]:
# Score the test-set predictions and print each metric on its own line.
mae, rmse, r_square = metrix(predictions, Y_test)
for label, value in (
    ("Mean Absolute Error: ", mae),
    ("Root Mean Square Error: ", rmse),
    ("R square: ", r_square),
):
    print(label, value)

Mean Absolute Error:  5.484897442452742
Root Mean Square Error:  7.038888303432659
R square:  0.6927055239131656


<h3>References:</h3>

Theory behind the algorithm: http://cs229.stanford.edu/notes2020spring/cs229-notes1.pdf
<br>
For implementing R square: https://www.geeksforgeeks.org/python-coefficient-of-determination-r2-score/