**Multiple Linear Regression - Scratch**

**Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math
import time
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, mean_absolute_error, explained_variance_score

**Load Dataset**

In [3]:
data = pd.read_csv('/content/drive/MyDrive/dataset/50_Startups.csv')
print(data.shape)
data.head(5)

(50, 5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

In [5]:
print(X[:5])
print(y[:5])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]
[192261.83 191792.06 191050.39 182901.99 166187.94]


In [6]:
# Encoding the categorical data
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
print(X[:5])
# Apply OneHotEncoder and avoid the dummy variable trap by skipping the first column of the encoded data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [3])], remainder='passthrough')
X = ct.fit_transform(X).astype(float)
print(X[:5])

[[165349.2 136897.8 471784.1 2]
 [162597.7 151377.59 443898.53 0]
 [153441.51 101145.55 407934.54 1]
 [144372.41 118671.85 383199.62 2]
 [142107.34 91391.77 366168.42 1]]
[[0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05 4.7178410e+05]
 [0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05 4.4389853e+05]
 [1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05]
 [0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05 3.8319962e+05]
 [1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04 3.6616842e+05]]


In [7]:
# Avoiding the Dummy Variable Trap
X = X[:, 1:]
print(X[:5])

[[1.0000000e+00 1.6534920e+05 1.3689780e+05 4.7178410e+05]
 [0.0000000e+00 1.6259770e+05 1.5137759e+05 4.4389853e+05]
 [0.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05]
 [1.0000000e+00 1.4437241e+05 1.1867185e+05 3.8319962e+05]
 [0.0000000e+00 1.4210734e+05 9.1391770e+04 3.6616842e+05]]


##**Using Sklearn's Linear Regression**

In [8]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40, 4), (10, 4), (40,), (10,))

In [9]:
# Fitting Multiple Linear Regression to the Training set
reg = LinearRegression()
reg.fit(X_train, y_train)

In [10]:
# Predicting the Test set results
y_pred = reg.predict(X_test)
print(y_pred)

[103615.70496732 132245.69745432 133070.23906339  72592.46097845
 179075.96157176 116014.3380813   67853.79186105  98837.47482921
 114480.26282341 168492.58649243]


In [11]:
r2_score(y_test,y_pred)

0.9367033175940502

In [12]:
reg.coef_

array([1.05221988e+03, 7.74682581e-01, 3.18348683e-02, 3.57342438e-02])

In [13]:
reg.intercept_

42410.30908342115

##**Making our own Multiple Linear Regression**

In [14]:
class MultipleLinearRegressionScratch:

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self,X_train,y_train):
        X_train = np.insert(X_train,0,1,axis=1)

        # calcuate the coeffs
        betas = np.linalg.inv(np.dot(X_train.T,X_train)).dot(X_train.T).dot(y_train)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]

    def predict(self,X_test):
        y_pred = np.dot(X_test,self.coef_) + self.intercept_
        return y_pred

In [15]:
lr = MultipleLinearRegressionScratch()
lr.fit(X_train,y_train)

In [16]:
X_train.shape

(40, 4)

In [17]:
np.insert(X_train,0,1,axis=1).shape

(40, 5)

In [18]:
y_pred = lr.predict(X_test)
y_pred

array([103615.70496732, 132245.69745432, 133070.23906339,  72592.46097845,
       179075.96157176, 116014.3380813 ,  67853.79186105,  98837.47482921,
       114480.26282341, 168492.58649243])

In [19]:
r2_score(y_test,y_pred)

0.9367033175940517

In [20]:
lr.coef_

array([1.05221988e+03, 7.74682581e-01, 3.18348683e-02, 3.57342438e-02])

In [21]:
lr.intercept_

42410.309083420965