In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# load the data
dataset = pd.read_csv('50_Startups.csv')

In [3]:
dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:,4].values

In [4]:
# sanity check 1
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# sanity check 2
dataset.shape

(50, 5)

In [6]:
# # before we converted the state into a number using the LabelEncoder
# labelencoder_X = LabelEncoder()
# X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

# # then we used the one hot encoder to encode the numbers from the previous step
# onehotencoder = OneHotEncoder(categorical_features=[3])
# X = onehotencoder.fit_transform(X).toarray()

In [7]:
# instantiate the one hot column encoder so that we process the State column
# save the encoded features in a variable to be appended to the list of features
preprocess = make_column_transformer((OneHotEncoder(), [3]))
onehotencoded = preprocess.fit_transform(X)

# drop the 'State' categorical variable
X = np.delete(X, 3, axis=1)

In [8]:
# combine them together
X = np.column_stack((onehotencoded, X))
X[:5,:]

X= X[:,1:]

In [9]:
X.shape

(50, 5)

In [10]:
# split the dataset into training, testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
X_train.shape, y_train.shape

((40, 5), (40,))

In [12]:
X_test.shape, y_test.shape

((10, 5), (10,))

In [13]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [14]:
X_test

array([[1.0, 0.0, 66051.52, 182645.56, 118148.2],
       [0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 27892.92, 84710.77, 164470.71],
       [1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 1.0, 72107.6, 127864.55, 353183.81],
       [0.0, 1.0, 20229.59, 65947.93, 185265.1],
       [0.0, 1.0, 61136.38, 152701.92, 88218.23],
       [1.0, 0.0, 73994.56, 122782.75, 303319.26],
       [1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [15]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [16]:
y_pred

array([103015.20159796, 132582.27760816, 132447.73845175,  71976.09851259,
       178537.48221054, 116161.24230163,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

In [17]:
# building the model using Backward Elimination
X = np.append(arr=np.ones((50,1)).astype(int), values=X, axis=1)

In [24]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y.tolist(), exog=X_opt.tolist()).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Fri, 18 Oct 2019",Prob (F-statistic):,1.34e-27
Time:,12:19:08,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0
