In [1]:
# Here we have to predict the profit (the dependent variable) from the 4 independent variables (the state and the 3 spend columns)

In [21]:
# In this case the regression line will be y = a0 + a1x1 + a2x2 + a3x3, each 'x' being one of the independent 'spend' variables
# But what about the 'state' variable? 'state' is categorical, so it can't be added to the regression equation as-is; it first has to be encoded as dummy variables
# Refer to the written notes for details on the 'steps of building a regression model' and the 'backward elimination process'

In [22]:
# Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')

# Separating into independent and dependent variables.
# Both slices are anchored to the LAST column (the profit we want to predict), so 'x' and 'y'
# cannot drift apart if the number of feature columns in the file ever changes.
x = dataset.iloc[:, :-1].values   # every column except the last: the 3 spend columns + the state
y = dataset.iloc[:, -1].values    # the last column: the dependent variable

In [23]:
# Dummy Encoding
'''
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x = LabelEncoder()
x[:, 3] = labelencoder_x.fit_transform(x[:, 3])
'''

# Using OneHotEncoder (in the sklearn version used here) requires first converting the categories into numbers with LabelEncoder
# Using pandas doesn't require that
# OneHotEncoder also converts the 'x' variable from 'object' to 'float64', which the pandas route doesn't do. Remember this.
# During backward elimination, an 'object' array causes a type error, whereas a 'float64' array does not.
# We can either use OHE or just convert the ndarray into 'float64'

'\nfrom sklearn.preprocessing import LabelEncoder, OneHotEncoder\nlabelencoder_x = LabelEncoder()\nx[:, 3] = labelencoder_x.fit_transform(x[:, 3])\n'

In [24]:
# Dummy-encode the categorical column (index 3 of 'x') with pandas rather than OneHotEncoder,
# then rebuild 'x' as [dummy columns..., R&D, Administration, Marketing].
dummy = pd.get_dummies(pd.Series(x[:, 3]))

# The three numeric columns keep their original order (indices 0, 1, 2 of 'x').
spend_labels = ['R&D Spent', 'Administration', 'Marketing Spent']
data = pd.concat(
    [dummy, pd.DataFrame({label: x[:, idx] for idx, label in enumerate(spend_labels)})],
    axis=1,
)

# pandas leaves the values as 'object'; cast to float so the later numeric routines accept the array.
x = data.iloc[:, :].values.astype(float)

In [25]:
'''onehotencoder = OneHotEncoder(categorical_features = [3])
x = onehotencoder.fit_transform(x).toarray()
'''

'onehotencoder = OneHotEncoder(categorical_features = [3])\nx = onehotencoder.fit_transform(x).toarray()\n'

In [26]:
x.dtype        # Sanity check after the cast above:
               # the array should now be 'float64' rather than 'object'

dtype('float64')

In [27]:
# Avoiding the Dummy Variable Trap
# One dummy column has to be removed, because the dummies together are perfectly collinear with the intercept

x = x[:, 1:]
# This drops the first column, i.e. the 'California' dummy
# Many libraries generally cope with this trap themselves, so the manual step is usually not required
# It is done here just to be sure; prefer not to do it by hand like this in future
# NOTE: this cell is not idempotent - running it a second time drops one more column from 'x'

In [28]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Data preprocessing is complete at this point

In [29]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [30]:
# Fitting the Multiple Linear Regression to the training set
# Same workflow as simple linear regression: import the class, create the estimator, fit it

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)     # Learns one coefficient per column of x_train (plus an intercept) from y_train


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [31]:
# Predicting the test set with the model fitted on the training rows
y_pred = regressor.predict(x_test)

In [32]:
y_pred

array([103015.20159796, 132582.27760816, 132447.73845175,  71976.09851259,
       178537.48221054, 116161.24230163,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

In [33]:
'''Backward elimination'''

'Backward elimination'

In [34]:
# Thus there is a multiple linear dependence. But Move forward to see how we can make our algorithm predict better

In [35]:
# We could actually build a better model by removing the variables (features) that are not statistically significant,
# so that the predictions rely only on what matters. To do that, among the 5 methods of building a model, we will use 'backward elimination',
# which is technically the best

# In other words, we look for the team of independent variables that best predicts the dependent variable

In [36]:
# Building the optimal model using backward elimination

In [37]:
# Preparation of the matrix
# NOTE: the OLS class has to come from 'statsmodels.api'. Newer statsmodels releases expose only the
# lowercase formula interface (e.g. 'ols') in 'statsmodels.formula.api', so 'sm.OLS' fails there.
import statsmodels.api as sm

# The regression equation y = a0 + a1x1 + ... + anxn really has 'a0' multiplied by x0 = 1.
# sklearn's LinearRegression adds that intercept on its own, but statsmodels' OLS does not,
# so a column of 1's has to be placed in the feature matrix 'x' explicitly.
#
# add_constant() prepends that column for however many rows 'x' has (no hard-coded row count).
# With has_constant='skip' it returns 'x' unchanged when a constant column is already present,
# so re-running this cell does not stack a second column of 1's.
x = sm.add_constant(x, prepend=True, has_constant='skip')

x        # The first column now contains only ones.
         # The values display in scientific notation because the array is float; the data itself is unchanged

array([[1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [50]:
# Now backward elimination begins:

# Step 1: select a significance level (the later steps compare p-values against 0.05).

# Step 2: fit the full model with all the independent variables.
# 'x_opt' will end up holding only the features with the highest significance for the dependent variable.
# The column indices are spelled out one by one so that later steps can remove them one at a time.
all_columns = [0, 1, 2, 3, 4, 5]
x_opt = x[:, all_columns]

# A new regressor from 'statsmodels' is needed here: Ordinary Least Squares (OLS).
# It takes the dependent variable first and the matrix of features second.
# OLS does not include an intercept by itself, which is why 'x' carries a column of 1's,
# and it works on 'float64' data rather than 'object', which is why 'x' was converted earlier.
ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()

In [51]:
# 3. Consider the predictor with the highest p-value
# summary() returns a table of fit statistics and per-coefficient estimates
# The lower the p-value of a variable, the more statistically significant it is for the model
regressor_OLS.summary()

# In the table, 'const' is x0 (the column of 1's)
# x1, x2 -> dummy variables for the state
# x3 -> R&D spend
# x4 -> Administration spend
# x5 -> Marketing spend
# Each row lists the corresponding coefficient, p-value, etc.
# The significance level (0.05, the most commonly used value) is our own choice, not something the library decides;
# the table merely reports a 95% confidence interval per coefficient (the [0.025, 0.975] columns)

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,1.34e-27
Time:,13:04:14,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [52]:
# 4. Remove the predictor with the highest p-value if that p-value exceeds the significance level (SL = 0.05).
#    x2 (the second state dummy) has p-value 0.990 > 0.05, so it is removed and the model is refitted.
x_opt = x[:, [0, 1, 3, 4, 5]]        # Column 2 of 'x' (x2 in the summary above) is left out
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,8.49e-29
Time:,13:07:59,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [53]:
# The summary labels (x1, x2, ...) are renumbered by position in the current 'x_opt', not by column of 'x'.
x_opt = x[:, [0, 3, 4, 5]]        # Removed x1 of the previous 'x_opt' (column 1 of 'x', the remaining dummy): highest p-value = 0.94 > 0.05
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,4.53e-30
Time:,13:10:35,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [54]:
x_opt = x[:, [0, 3, 5]]        # Removed x2 of the previous 'x_opt' (column 4 of 'x', Administration): highest p-value = 0.602 > 0.05
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

# A p-value can never be exactly 0. It is displayed as 0.000 because the value is smaller than the printed precision

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,2.1600000000000003e-31
Time:,13:11:01,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [55]:
# In the summary above, 'x1' (R&D spend) has a very high significance for predicting the dependent variable

# 'x2' above (Marketing spend) has p-value 0.060 > 0.05, so a strict application of the rule removes it as well
# NOTE: this is a borderline call - the adjusted R-squared drops from 0.948 to 0.945 once it is removed
x_opt = x[:, [0, 3]]        # Column 5 of 'x' removed; only the constant and R&D spend remain
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,13:16:56,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [56]:
# Only one strong predictor is left, with a very low p-value, i.e. a very high significance
# So the optimal team consists of just 1 variable for this dataset.
# Now we make predictions based on that optimal set, i.e. the 'x_opt' array
# NOTE: 'x_opt' still carries the column of 1's that was added for statsmodels. LinearRegression fits
#       its own intercept, so that column is redundant here (presumably harmless - verify if it matters)
# NOTE(review): backward elimination above used all rows, including those that land in this test set,
#       so the test predictions are not a fully independent check of the selected features
# The same test_size and random_state as before are used, and this cell overwrites the earlier train/test arrays

x_train, x_test, y_train, y_test = train_test_split(x_opt, y, test_size = 0.2, random_state = 0)
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [57]:
y_pred
# These are the predictions made from the optimal features only, i.e. those with a really high significance for the dependent variable 'y'

array([104667.27805998, 134150.83410578, 135207.80019517,  72170.54428856,
       179090.58602508, 109824.77386586,  65644.27773757, 100481.43277139,
       111431.75202432, 169438.14843539])

In [58]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])