# Multiple Linear Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df=pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, 4].values

In [4]:
X[:5]



array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

In [5]:
Y[:5]

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94])

In [6]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder #FOR HANDLING CATEGORICAL VARIABLES
le_X = LabelEncoder()
X[:,3]=le_X.fit_transform(X[:,3])
X=X.astype(int)
ohe=OneHotEncoder(categorical_features=[3])
X=ohe.fit_transform(X).toarray()
X=X.astype(int)



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
#Avoiding the dummy variable trap
X=X[:, 1:]
X[:5]

array([[     0,      1, 165349, 136897, 471784],
       [     0,      0, 162597, 151377, 443898],
       [     1,      0, 153441, 101145, 407934],
       [     0,      1, 144372, 118671, 383199],
       [     1,      0, 142107,  91391, 366168]])

In [8]:
#Splitting the data sets into traning set and test set
from sklearn.model_selection import train_test_split
X_train , X_test , Y_train ,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(37, 5)
(13, 5)
(37,)
(13,)


In [10]:
from sklearn.linear_model import LinearRegression
mr = LinearRegression()

In [11]:
mr.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
y_pred = mr.predict(X_test)
y_pred[:5]

array([104440.65610878, 132253.47468144, 132872.34077361,  71707.35647909,
       178678.95215752])

In [13]:
Y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94,  96778.92,  96479.51,
       105733.54])

In [14]:
#conda install -c conda-forge statsmodels

In [15]:
#BUILDING AN OPTIMAL MODEL USING BACKWARD ELIMINATION
import statsmodels.api as sm
X=np.append(arr=np.ones((50,1)).astype(int),values=X,axis=1)
X[:5]

array([[     1,      0,      1, 165349, 136897, 471784],
       [     1,      0,      0, 162597, 151377, 443898],
       [     1,      1,      0, 153441, 101145, 407934],
       [     1,      0,      1, 144372, 118671, 383199],
       [     1,      1,      0, 142107,  91391, 366168]])

In [28]:
X_opt = X[:,[0,3]]
#OrdinaryLeastSquares
regressor_OLS = sm.OLS(endog=Y, exog=X_opt).fit()

In [29]:
X_opt = X[:,[0,3]]
#OrdinaryLeastSquares
regressor_OLS = sm.OLS(endog=Y, exog=X_opt).fit()

array([[     1, 165349],
       [     1, 162597],
       [     1, 153441],
       [     1, 144372],
       [     1, 142107]])

In [30]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Fri, 13 Mar 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,03:46:45,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.900,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.538
Skew:,-0.911,Prob(JB):,9.43e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [41]:
import statsmodels.api as sm
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, X).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
 
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)