In [35]:
# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd


In [36]:
# Load the startups dataset from a relative path.
data_set = pd.read_csv('../../dataset/LinearRegression/50_Startups.csv')

# Features are every column except the last; the target is the last column.
# The target is indexed with -1 (instead of a hard-coded 4) so it stays in
# lockstep with the `:-1` feature slice: the two can never overlap or skip a
# column if the file's column count changes.
x = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

# Preview the first rows as the cell output.
data_set.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [37]:
# Encode the categorical column (index 3) of the feature matrix x.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: replace the labels in column 3 with integer codes, in place.
labelencoder_x = LabelEncoder()
encoded_col = labelencoder_x.fit_transform(x[:, 3])
x[:, 3] = encoded_col

# Step 2: one-hot encode column 3 and pass every other column through.
# The one-hot columns come first in the result, followed by the passthrough ones.
onehot_step = ('onthot', OneHotEncoder(), [3])
ct = ColumnTransformer([onehot_step], remainder='passthrough')
# NOTE(review): x is overwritten here, so re-running this cell without first
# re-running the loading cell would encode whatever now sits in column 3.
x = ct.fit_transform(x)

# Print plain decimals instead of scientific notation, then preview five rows.
nm.set_printoptions(suppress=True)
x[:5]

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [38]:
# Remove multicollinearity: drop the first column of x (one of the one-hot
# dummy columns) and keep everything from column 1 onwards.
# NOTE(review): x is reassigned, so each re-run of this cell drops one more column.
kept_cols = slice(1, None)
x = x[:, kept_cols]
x[:5]

array([[0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

# Preview the held-out features (printed) and targets (cell output).
print(x_test[:5])
y_test[:5]

[[1.0 0.0 66051.52 182645.56 118148.2]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 27892.92 84710.77 164470.71]
 [1.0 0.0 153441.51 101145.55 407934.54]]


array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39])

In [40]:
# Fit the multiple linear regression model on the training split and evaluate it.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)

# NOTE: despite its name, x_pred holds the predictions for the *training* rows;
# y_pred holds the predictions for the test rows.
x_pred = regressor.predict(x_train)
y_pred = regressor.predict(x_test)

# Score the fitted model on each split.
train_score = regressor.score(x_train, y_train)
test_score = regressor.score(x_test, y_test)

# For each split: first five actual values, first five predictions, then the score.
print(y_train[:5])
print(x_pred[:5])
print('Train Score: ', train_score)
print(y_test[:5])
print(y_pred[:5])
print('Test Score: ', test_score)


[ 96778.92  96479.51 105733.54  96712.8  124266.9 ]
[ 95764.42320496  89164.62849777 109506.72329086  91366.21889409
 127111.48666646]
Train Score:  0.9501847627493607
[103282.38 144259.4  146121.95  77798.83 191050.39]
[103015.20159796 132582.27760816 132447.73845175  71976.09851259
 178537.48221054]
Test Score:  0.9347068473282966


In [None]:
############ Backward Elimination

In [13]:
import statsmodels.api as sm

In [41]:
# Prepend a column of ones to x to act as the intercept term for the OLS fits.
# The row count is read from x instead of being hard-coded as 50, so the cell
# keeps working if the dataset size changes.
# NOTE(review): x is reassigned, so re-running this cell prepends another
# column of ones.
x = nm.append(arr=nm.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
x[:5]

array([[1, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [1, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [45]:
# Backward elimination, step 1: fit the full model
# (column 0 is the column of ones, columns 1-5 are the predictors).
kept_columns = [0, 1, 2, 3, 4, 5]
x_opt = x[:, kept_columns].astype(float)  # cast the selected columns to float

# Build the OLS model, fit it, and show the summary table as the cell output.
ols_model = sm.OLS(endog=y, exog=x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 03 Nov 2024",Prob (F-statistic):,1.34e-27
Time:,04:49:33,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [46]:
# Backward elimination, step 2: refit without column 1 of x.
# NOTE(review): in the previous summary the highest p-value belongs to x2
# (column 2, 0.990) rather than x1 (column 1, 0.953); strict backward
# elimination would drop column 2 here. The next step removes column 2 anyway.
kept_columns = [0, 2, 3, 4, 5]
x_opt = x[:, kept_columns].astype(float)  # cast the selected columns to float

# Build the OLS model, fit it, and show the summary table as the cell output.
ols_model = sm.OLS(endog=y, exog=x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sun, 03 Nov 2024",Prob (F-statistic):,8.5e-29
Time:,04:49:59,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.018e+04,6747.623,7.437,0.000,3.66e+04,6.38e+04
x1,-136.5042,2801.719,-0.049,0.961,-5779.456,5506.447
x2,0.8059,0.046,17.571,0.000,0.714,0.898
x3,-0.0269,0.052,-0.521,0.605,-0.131,0.077
x4,0.0271,0.017,1.625,0.111,-0.007,0.061

0,1,2,3
Omnibus:,14.892,Durbin-Watson:,1.284
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.665
Skew:,-0.949,Prob(JB):,1.97e-05
Kurtosis:,5.608,Cond. No.,1430000.0


In [47]:
# Backward elimination, step 3: refit with columns 1 and 2 of x both removed,
# leaving the column of ones plus columns 3, 4 and 5.
kept_columns = [0, 3, 4, 5]
x_opt = x[:, kept_columns].astype(float)  # cast the selected columns to float

# Build the OLS model, fit it, and show the summary table as the cell output.
ols_model = sm.OLS(endog=y, exog=x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 03 Nov 2024",Prob (F-statistic):,4.53e-30
Time:,04:50:13,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [48]:
# Backward elimination, step 4: additionally drop column 4 of x,
# leaving the column of ones plus columns 3 and 5.
kept_columns = [0, 3, 5]
x_opt = x[:, kept_columns].astype(float)  # cast the selected columns to float

# Build the OLS model, fit it, and show the summary table as the cell output.
ols_model = sm.OLS(endog=y, exog=x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sun, 03 Nov 2024",Prob (F-statistic):,2.1600000000000003e-31
Time:,04:50:26,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [49]:
# Backward elimination, step 5: additionally drop column 5 of x, leaving only
# the column of ones and column 3.
# NOTE(review): presumably column 5 is dropped because its p-value in the
# previous fit is above a 0.05 significance level; confirm the intended threshold.
kept_columns = [0, 3]
x_opt = x[:, kept_columns].astype(float)  # cast the selected columns to float

# Build the OLS model, fit it, and show the summary table as the cell output.
ols_model = sm.OLS(endog=y, exog=x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 03 Nov 2024",Prob (F-statistic):,3.5000000000000004e-32
Time:,04:50:37,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [64]:
# Refit a linear regression on a single feature and compare its scores.
x_BE = data_set.iloc[:, 0].values  # column 0 only: `R&D spend`
y_BE = data_set.iloc[:, 4].values  # column 4 only: Profit

# 80/20 split with a fixed random_state.
be_split = train_test_split(x_BE, y_BE, test_size=0.2, random_state=0)
x_BE_train, x_BE_test, y_BE_train, y_BE_test = be_split

# The single feature is a 1-D array; reshape it into one column once and reuse
# the result for fitting, predicting and scoring.
x_BE_train_2d = nm.array(x_BE_train).reshape(-1, 1)
x_BE_test_2d = nm.array(x_BE_test).reshape(-1, 1)

# NOTE: this refits the existing `regressor` object, replacing its earlier fit.
regressor.fit(x_BE_train_2d, y_BE_train)

# x_BE_pred holds predictions for the training rows, y_BE_pred for the test rows.
x_BE_pred = regressor.predict(x_BE_train_2d)
y_BE_pred = regressor.predict(x_BE_test_2d)

# For each split: first five actual values, first five predictions, then the score.
print(y_BE_train[:5])
print(x_BE_pred[:5])
print('Backward Elimination Train Score: ', regressor.score(x_BE_train_2d, y_BE_train))
print(y_BE_test[:5])
print(y_BE_pred[:5])
print('Backward Elimination Test Score: ', regressor.score(x_BE_test_2d, y_BE_test))


[ 96778.92  96479.51 105733.54  96712.8  124266.9 ]
[ 95676.21073667  87602.88620742 112568.08084243  87953.79738211
 126551.97418703]
Backward Elimination Train Score:  0.9449589778363044
[103282.38 144259.4  146121.95  77798.83 191050.39]
[104667.27805998 134150.83410578 135207.80019517  72170.54428856
 179090.58602508]
Backward Elimination Test Score:  0.9464587607787219
