### Multiple Linear Regression Practice

In [20]:
# importing libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [21]:
# importing dataset
# The CSV is read from a path relative to the notebook's working directory.

dataset = pd.read_csv('50_Startups.csv')

In [22]:
# checking dataset head
# Preview the first rows to confirm the columns loaded as expected.

dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [23]:
# checking shape
# Reported as (rows, columns); the last column is used as the target below.

dataset.shape

(50, 5)

In [24]:
# creating X and y
# Every column except the last is a feature; the last column (Profit) is the target.

feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [25]:
# Encoding categorical variable

# One-hot encode the State column (position 3 among the feature columns) and
# pass the numeric columns through unchanged. The encoded columns are placed
# first in the result, followed by the passthrough columns.
# The transformer is fed the raw feature columns taken straight from `dataset`
# rather than the current value of X, so re-running this cell does not try to
# encode an already-encoded matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

In [26]:
# Inspect the encoded feature matrix: the three one-hot State columns come
# first, followed by R&D Spend, Administration and Marketing Spend.
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [27]:
# create train and test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)

In [31]:
# training our model
# Fit a linear regression on the training rows only; the test rows stay unseen.

multiple_regressor = LinearRegression()
multiple_regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [40]:
# predicting on test dataset

# Show floats with two decimals in arrays printed from here on. This is a
# process-wide NumPy display setting; the underlying values are not rounded.
np.set_printoptions(precision=2)
y_pred = multiple_regressor.predict(X=X_test)

In [51]:
# Put actual profit (left column) next to predicted profit (right column)
# so the two can be compared row by row.
np.column_stack((y_test, y_pred))

array([[146121.95, 133749.92],
       [134307.35, 126771.56],
       [ 99937.59,  97712.5 ],
       [ 49490.75,  58138.83],
       [124266.9 , 128196.54],
       [192261.83, 192274.04],
       [ 78239.91,  75126.75],
       [141585.52, 127984.52],
       [101004.64, 101453.66],
       [152211.77, 151532.51]])

In [37]:
# calculating R-squared on the held-out test rows

metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.9667998486973787

In [34]:
# checking coefficients
# One coefficient per column of the encoded X, in the same order: the three
# one-hot State columns first, then R&D Spend, Administration, Marketing Spend.

multiple_regressor.coef_

array([-1.45457402e+02, -4.15262719e+02,  5.60720121e+02,  7.75267940e-01,
       -1.64465805e-02,  3.62733426e-02])

In [35]:
# checking intercepts
# The constant term added to the weighted sum of the features.

multiple_regressor.intercept_

48661.69989652702

In [53]:
# making single prediction
# Feature order matches the encoded X: [California, Florida, New York,
# R&D Spend, Administration, Marketing Spend] -- here a California startup.
# predict expects a 2-D input, hence the nested list for a single row.

multiple_regressor.predict([[1,0,0,160000,130000,300000]])

array([181303.06])

In [57]:
# Calculating prediction manually

# Linear model: y = intercept + sum(coef_i * x_i). The intercept and
# coefficients are read from the fitted model instead of being pasted in from
# an earlier cell's output, so the check stays valid when the model is
# re-trained and is not affected by the rounded array display.
# Feature order: [California, Florida, New York, R&D Spend, Administration,
# Marketing Spend].
sample = np.array([1, 0, 0, 160000, 130000, 300000])
y_manual = float(multiple_regressor.intercept_ + np.dot(multiple_regressor.coef_, sample))

In [58]:
# Display the manual result; it should agree with the model's single
# prediction for the same input.
y_manual

181303.06020952703