In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [None]:
# load the data from '50_Startups.csv' (relative path, resolved against the
# kernel's working directory) into a pandas DataFrame
dataset = pd.read_csv('50_Startups.csv')

In [None]:
# split the frame into the feature matrix and the target vector:
# the last column is the target, every column before it is a feature
# (indexing the target with -1 keeps it in step with the `:-1` feature slice)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# sanity check 1: preview the first rows of the loaded frame
dataset.head()

In [None]:
# sanity check 2: (rows, columns) of the loaded frame
dataset.shape

In [None]:
# # previously, we converted the State column into a number using the LabelEncoder
# labelencoder_X = LabelEncoder()
# X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

# # then we used the one hot encoder to encode the numbers from the previous step
# onehotencoder = OneHotEncoder(categorical_features=[3])
# X = onehotencoder.fit_transform(X).toarray()

In [None]:
# one-hot encode the categorical State column (index 3 of X) and keep the
# encoded columns in a variable so they can be joined with the other features;
# sparse_threshold=0 makes the transformer always return a dense ndarray,
# which is what the later np.column_stack call expects
preprocess = make_column_transformer((OneHotEncoder(), [3]), sparse_threshold=0)
onehotencoded = preprocess.fit_transform(X)

# drop the original 'State' categorical column now that it has been encoded
X = np.delete(X, 3, axis=1)

In [None]:
# join the one-hot State columns (placed first) with the remaining feature
# columns, then drop the leading one-hot column
# (presumably to avoid the dummy variable trap)
X = np.column_stack((onehotencoded, X))[:, 1:]

In [None]:
# (rows, columns) of the assembled feature matrix
X.shape

In [None]:
# hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# shapes of the training features and the training targets
X_train.shape, y_train.shape

In [None]:
# shapes of the held-out features and the held-out targets
X_test.shape, y_test.shape

In [None]:
# fit a linear regression on the training rows (fit returns the estimator
# itself), then predict the targets of the held-out rows
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [None]:
# held-out feature rows that were passed to regressor.predict
X_test

In [None]:
# true target values of the held-out rows
y_test

In [None]:
# predicted target values of the held-out rows, to compare against y_test
y_pred

In [None]:
# building the model using Backward Elimination:
# statsmodels' OLS does not add an intercept on its own, so prepend a column
# of ones; the row count is read from X instead of being hard-coded to 50
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

In [None]:
# step 1: fit OLS on every column of X (indices 0 through 5)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(exog=X_opt.tolist(), endog=y.tolist()).fit()
regressor_OLS.summary()

In [None]:
# the second feature (column index 2 of X) has a p-value above the
# significance level (SL) of 0.05, so we drop it and refit
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(exog=X_opt.tolist(), endog=y.tolist()).fit()
regressor_OLS.summary()

In [None]:
# the first feature (column index 1 of X) has a p-value above the
# significance level (SL) of 0.05, so we drop it and refit
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(exog=X_opt.tolist(), endog=y.tolist()).fit()
regressor_OLS.summary()

In [None]:
# the fourth feature (column index 4 of X) has a p-value above the
# significance level (SL) of 0.05, so we drop it and refit
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(exog=X_opt.tolist(), endog=y.tolist()).fit()
regressor_OLS.summary()

In [None]:
# the fifth feature (column index 5 of X) has a p-value above the
# significance level (SL) of 0.05, so we drop it and refit
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(exog=X_opt.tolist(), endog=y.tolist()).fit()
regressor_OLS.summary()

In [None]:
# automatic backward elimination with p-values only
def backwardElimination(x, sl):
    """Remove predictors one at a time until every p-value is at most ``sl``.

    On each pass an OLS model of the notebook-level target ``y`` is fitted on
    the current columns of ``x``. If the largest p-value exceeds ``sl`` the
    corresponding column is dropped and the model is refitted; otherwise the
    search stops. The summary of the final fitted model is printed.

    Parameters
    ----------
    x : numpy.ndarray
        2-D design matrix, one column per predictor. No intercept is added
        here, so any constant column must already be part of ``x``.
    sl : float
        Significance level; a column is removed only while its p-value is
        strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the surviving columns, in their original order.
        The array passed in is not modified.
    """
    x = np.asarray(x)
    final_fit = None
    while x.shape[1] > 0:
        fit = sm.OLS(y.tolist(), x.tolist()).fit()
        pvalues = np.asarray(fit.pvalues, dtype=float)
        # argmax selects exactly one column, so tied p-values can never
        # trigger several deletions against indices that have already shifted
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            # every remaining predictor is significant: this fit is final
            final_fit = fit
            break
        x = np.delete(x, worst, 1)
    if final_fit is not None:
        # a bare .summary() inside a function displays nothing, so print it
        print(final_fit.summary())
    return x
 
# run the automatic elimination on all six columns of X at a 5% significance level
SL = 0.05
X_opt = X[:, list(range(6))]
X_Modeled = backwardElimination(X_opt, SL)

In [None]:
def backwardElimination(x, SL):
    """Backward elimination driven by p-values and adjusted R-squared.

    NOTE: this reuses the name of the p-value-only ``backwardElimination``
    defined in an earlier cell and therefore replaces it once this cell runs.

    An OLS model of the notebook-level target ``y`` is fitted on the columns
    of ``x``. While the largest p-value exceeds ``SL``, the corresponding
    column is tentatively removed; the removal is kept only if it strictly
    increases adjusted R-squared. Otherwise the matrix from before the
    removal is returned unchanged. The summary of the model that matches the
    returned matrix is printed.

    Parameters
    ----------
    x : numpy.ndarray
        2-D design matrix, one column per predictor. No intercept is added
        here, so any constant column must already be part of ``x``.
    SL : float
        Significance level; a column is a removal candidate only while its
        p-value is strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the surviving columns, in their original order
        and with their original values. The array passed in is not modified.
    """
    x = np.asarray(x)
    if x.shape[1] == 0:
        return x

    current_fit = sm.OLS(y.tolist(), x.tolist()).fit()
    while True:
        pvalues = np.asarray(current_fit.pvalues, dtype=float)
        # argmax selects exactly one column even when p-values are tied
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > SL:
            break
        reduced = np.delete(x, worst, 1)
        if reduced.shape[1] == 0:
            # no predictors would remain to fit a comparison model on
            break
        reduced_fit = sm.OLS(y.tolist(), reduced.tolist()).fit()
        if float(current_fit.rsquared_adj) >= float(reduced_fit.rsquared_adj):
            # removing the column did not improve adjusted R-squared:
            # "roll back" simply by keeping the untouched pre-removal matrix
            break
        # accept the removal; the comparison fit is the next iteration's model
        x, current_fit = reduced, reduced_fit

    print(current_fit.summary())
    return x
 
# run the adjusted-R-squared-aware elimination on all six columns of X
# at a 5% significance level
SL = 0.05
X_opt = X[:, list(range(6))]
X_Modeled = backwardElimination(X_opt, SL)