In [1]:
# Multiple Linear Regression

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

In [5]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
# Avoiding the Dummy Variable Trap
X = X[:, 1:]

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [8]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [10]:
# Import the sklearn libraries
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)    # Coefficient of determination

-4.391067777212705

In [14]:
# Building a optimal model 
import statsmodels.api as sm
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)

In [15]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.187
Model:,OLS,Adj. R-squared:,0.115
Method:,Least Squares,F-statistic:,2.584
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.0496
Time:,18:27:26,Log-Likelihood:,-595.48
No. Observations:,50,AIC:,1201.0
Df Residuals:,45,BIC:,1211.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.849e+04,2796.080,20.917,0.000,5.29e+04,6.41e+04
x1,5.849e+04,2796.080,20.917,0.000,5.29e+04,6.41e+04
x2,-8.13e+04,3.83e+04,-2.121,0.039,-1.59e+05,-4083.603
x3,-5.205e+04,3.83e+04,-1.358,0.181,-1.29e+05,2.52e+04
x4,-6.748e+04,3.83e+04,-1.760,0.085,-1.45e+05,9733.737
x5,-4.721e+04,3.83e+04,-1.232,0.225,-1.24e+05,3e+04

0,1,2,3
Omnibus:,1.208,Durbin-Watson:,0.46
Prob(Omnibus):,0.547,Jarque-Bera (JB):,0.481
Skew:,-0.012,Prob(JB):,0.786
Kurtosis:,3.48,Cond. No.,5.18e+16


In [16]:
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.047
Method:,Least Squares,F-statistic:,1.809
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.159
Time:,18:27:45,Log-Likelihood:,-597.87
No. Observations:,50,AIC:,1204.0
Df Residuals:,46,BIC:,1211.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.762e+04,2869.394,20.082,0.000,5.18e+04,6.34e+04
x1,5.762e+04,2869.394,20.082,0.000,5.18e+04,6.34e+04
x2,-5.032e+04,3.98e+04,-1.266,0.212,-1.3e+05,2.97e+04
x3,-6.575e+04,3.98e+04,-1.654,0.105,-1.46e+05,1.43e+04
x4,-4.548e+04,3.98e+04,-1.144,0.259,-1.26e+05,3.45e+04

0,1,2,3
Omnibus:,0.765,Durbin-Watson:,0.19
Prob(Omnibus):,0.682,Jarque-Bera (JB):,0.209
Skew:,-0.073,Prob(JB):,0.901
Kurtosis:,3.281,Cond. No.,4.72e+16


In [17]:
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.047
Method:,Least Squares,F-statistic:,1.809
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.159
Time:,18:27:58,Log-Likelihood:,-597.87
No. Observations:,50,AIC:,1204.0
Df Residuals:,46,BIC:,1211.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.152e+05,5738.788,20.082,0.000,1.04e+05,1.27e+05
x1,-5.032e+04,3.98e+04,-1.266,0.212,-1.3e+05,2.97e+04
x2,-6.575e+04,3.98e+04,-1.654,0.105,-1.46e+05,1.43e+04
x3,-4.548e+04,3.98e+04,-1.144,0.259,-1.26e+05,3.45e+04

0,1,2,3
Omnibus:,0.765,Durbin-Watson:,0.19
Prob(Omnibus):,0.682,Jarque-Bera (JB):,0.209
Skew:,-0.073,Prob(JB):,0.901
Kurtosis:,3.281,Cond. No.,7.3


In [18]:
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.052
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,1.299
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.283
Time:,18:28:12,Log-Likelihood:,-599.31
No. Observations:,50,AIC:,1205.0
Df Residuals:,47,BIC:,1210.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.139e+05,5782.556,19.693,0.000,1.02e+05,1.26e+05
x1,-4.895e+04,4.05e+04,-1.209,0.233,-1.3e+05,3.25e+04
x2,-4.411e+04,4.05e+04,-1.090,0.281,-1.26e+05,3.73e+04

0,1,2,3
Omnibus:,0.323,Durbin-Watson:,0.159
Prob(Omnibus):,0.851,Jarque-Bera (JB):,0.039
Skew:,-0.058,Prob(JB):,0.981
Kurtosis:,3.072,Cond. No.,7.22


In [19]:
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,1.404
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.242
Time:,18:28:22,Log-Likelihood:,-599.93
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.13e+05,5734.430,19.701,0.000,1.01e+05,1.25e+05
x1,-4.805e+04,4.05e+04,-1.185,0.242,-1.3e+05,3.35e+04

0,1,2,3
Omnibus:,0.115,Durbin-Watson:,0.099
Prob(Omnibus):,0.944,Jarque-Bera (JB):,0.007
Skew:,-0.015,Prob(JB):,0.996
Kurtosis:,2.949,Cond. No.,7.15


In [None]:
# Our model is ready for prediction