In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw table. Every column except the last is a feature; the last
# column is the target. Both slices are anchored to the end of the table so
# they stay consistent with each other.
dataset = pd.read_csv('50_Startups.csv')

x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [2]:
# One-hot encode the categorical feature held in column 3 of x, then place the
# indicator columns in front of the three numeric features.
dummy = pd.get_dummies(pd.Series(x[:, 3]))
data = pd.concat(
    [
        dummy,
        pd.DataFrame({
            'R&D Spent': x[:, 0],
            'Administration': x[:, 1],
            'Marketing Spent': x[:, 2],
        }),
    ],
    axis=1,
)
# Back to a plain float matrix (indicator columns become 0.0 / 1.0).
x = data.values.astype(float)

In [3]:
# Dummy variable trap: drop the first column of x (the first indicator column)
# so the remaining indicators are not perfectly collinear with the intercept.
x = np.delete(x, 0, axis=1)

In [4]:
# The eliminations do not have to be done manually: the function below takes the ndarray and the SL and does them in a loop
import statsmodels.api as sm                                     # Array-based OLS is exposed by statsmodels.api (formula.api is the formula interface)
x = np.append(np.ones((x.shape[0], 1)), x, axis = 1)             # Adding the column of 1's (one per row of x) for the intercept
def backwardElimination(x, sl):
    """Drop predictors by backward elimination until every p-value is <= sl.

    The model is refitted after each removal and exactly one column (the one
    with the largest p-value) is removed per fit, so a column index always
    refers to the design matrix that produced the p-values.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per predictor.
    sl : float
        Significance level; a column is removed while its p-value exceeds it.

    Returns
    -------
    numpy.ndarray
        The columns of `x` that survived the elimination (possibly none).

    Notes
    -----
    The response vector is read from the notebook-level variable `y` and the
    OLS implementation from the notebook-level name `sm`.
    """
    x = np.asarray(x)
    while x.shape[1] > 0:                                        # Nothing left to fit once every column is gone
        regressor_OLS = sm.OLS(y, x).fit()                       # Refit on the columns that are still in the model
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float) # One p-value per current column of x
        if np.all(np.isnan(pvalues)):                            # No usable p-value to rank the columns by
            break
        worst = int(np.nanargmax(pvalues))                       # Index of the least significant column
        if pvalues[worst] <= sl:                                 # Every remaining column is significant: done
            break
        x = np.delete(x, worst, axis = 1)                        # Remove that single column along axis = 1, then refit
    return x

# regressor_OLS.pvalues is an array of p-values, one per column of x, so each one can be accessed by its column index
# The function is just a loop that performs all the elimination steps by itself, so we don't have to do them manually

In [5]:
# Significance level: a column is dropped while its p-value is above this threshold.
SL = 0.05
x_opt = backwardElimination(x, sl = SL)

In [6]:
x_opt           # Consists only of the column of 1's and the most significant variable, i.e. R&D Spent

array([[1.0000000e+00, 1.6534920e+05],
       [1.0000000e+00, 1.6259770e+05],
       [1.0000000e+00, 1.5344151e+05],
       [1.0000000e+00, 1.4437241e+05],
       [1.0000000e+00, 1.4210734e+05],
       [1.0000000e+00, 1.3187690e+05],
       [1.0000000e+00, 1.3461546e+05],
       [1.0000000e+00, 1.3029813e+05],
       [1.0000000e+00, 1.2054252e+05],
       [1.0000000e+00, 1.2333488e+05],
       [1.0000000e+00, 1.0191308e+05],
       [1.0000000e+00, 1.0067196e+05],
       [1.0000000e+00, 9.3863750e+04],
       [1.0000000e+00, 9.1992390e+04],
       [1.0000000e+00, 1.1994324e+05],
       [1.0000000e+00, 1.1452361e+05],
       [1.0000000e+00, 7.8013110e+04],
       [1.0000000e+00, 9.4657160e+04],
       [1.0000000e+00, 9.1749160e+04],
       [1.0000000e+00, 8.6419700e+04],
       [1.0000000e+00, 7.6253860e+04],
       [1.0000000e+00, 7.8389470e+04],
       [1.0000000e+00, 7.3994560e+04],
       [1.0000000e+00, 6.7532530e+04],
       [1.0000000e+00, 7.7044010e+04],
       [1.0000000e+00, 6.