In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Before fitting any linear regression model, check that its assumptions hold:
# Linearity, Homoscedasticity, Multivariate Normality, Independence of Errors,
# and Lack of Multicollinearity.

# Case study: a venture capital (VC) fund has hired you to build a model that tells
# them which kinds of companies are interesting to invest in; the target of interest is profit.
# They are not simply looking for the single company with the highest profit:
# they want to understand, for example, which companies perform better in each state,
# or, holding the state fixed, whether a company performs better with more
# marketing spend,
# and which of R&D spend and marketing spend yields the better results.
# NOTE(review): the path is absolute (filesystem root) — confirm where the CSV really lives.
dataset = pd.read_csv('/50_Startups.csv')


# Categorical features have to be turned into dummy (0/1) variables before
# they can enter the regression. Do not feed every dummy to the model: for a
# category with only two outcomes (e.g. New York and California) one dummy is
# enough — 1 means New York, 0 means California.
#
# With d1 = New York dummy and d2 = California dummy the full model would be
#   y = b0 + b1*x1 + b2*x2 + b3*x3 + b4*d1 + b5*d2
# Dummy variable trap: d2 = 1 - d1 always holds, so one dummy is perfectly
# predicted by the other. One regressor predicting another is called
# multicollinearity.
# Remedy: always omit one dummy per categorical feature (with two categorical
# features, omit one dummy from each of them).

# Every column except the last is a feature; the last column is the target.
last_col = dataset.shape[1] - 1
x = dataset.iloc[:, :last_col].values
y = dataset.iloc[:, last_col].values

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace the categorical column (index 3) with integer codes. Current
# OneHotEncoder versions also accept strings directly; the step is kept so
# that `labelencoder_x` and the integer-coded column stay available as before.
labelencoder_x = LabelEncoder()
x[:, 3] = labelencoder_x.fit_transform(x[:, 3])

# One-hot encode column 3.
# `OneHotEncoder(categorical_features=[3])` was removed in scikit-learn 0.22,
# so the column is selected with a ColumnTransformer instead (needs >= 0.20).
# The resulting layout is the same as before: the dummy columns come first and
# the untouched (passthrough) columns are stacked to their right.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('onehot', onehotencoder, [3])],
    remainder='passthrough',  # keep the remaining columns instead of dropping them
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)
# The passthrough columns come from an object array, so cast the result to float.
x = np.asarray(column_transformer.fit_transform(x), dtype=float)

# Avoid the dummy variable trap by removing the first (dummy) column.
# NOTE(review): the original note says this step is optional because the
# regression library copes with the dummy trap on its own — confirm before
# relying on that.
x = np.delete(x, 0, axis=1)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed at 0.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [2]:
# x is already a matrix, so only the target is reshaped into a single column.
y_train = np.reshape(y_train, (-1, 1))

In [3]:
# Fit a multiple linear regression model on the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(x_train, y_train)

# Predict the target for the test set
y_pred = regressor.predict(x_test)


In [4]:
# Building the optimal model using backward elimination.
# The constant b0 in the formula multiplies an implicit x0 term, which is a
# column of ones. Unlike sklearn's LinearRegression, statsmodels' OLS does not
# add that column for us, so it is added to the feature matrix explicitly.
#
# OLS is imported from `statsmodels.api`: recent statsmodels releases no longer
# expose `OLS` through `statsmodels.formula.api` (only the lowercase formula
# interface such as `ols`), so `sm.OLS` fails with the formula module.
import statsmodels.api as sm

# np.append(arr, values, axis=1) places `values` after `arr`. Passing the ones
# column as `arr` and x as `values` therefore makes the ones the FIRST column.
# The number of rows is taken from x instead of being hard-coded to 50, and the
# ones are left as floats because x is a float matrix anyway.
# NOTE: re-running this cell prepends another ones column — rebuild x first.
x = np.append(arr=np.ones((x.shape[0], 1)), values=x, axis=1)



In [6]:
# Manual backward elimination. x_opt is the "optimal" feature matrix: start
# with every column, fit OLS, look at the p-values, remove the feature with the
# highest p-value and fit again.
# endog is the dependent variable and exog the independent variables.
# Only the summary of the last fit is the cell output; the earlier summaries
# are evaluated but not displayed.
elimination_steps = [
    [0, 1, 2, 3, 4, 5],  # all columns
    # Column 2 had the highest p-value and is removed.
    # NOTE(review): the original comment called it "administration spend", but
    # with the dummy columns placed first, index 2 looks like a state dummy and
    # Administration looks like index 4 — confirm against the data.
    [0, 1, 3, 4, 5],
    [0, 3, 4, 5],        # keep repeating: column 1 removed
]
for kept_columns in elimination_steps:
    x_opt = x[:, kept_columns]
    regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
    regressor_OLS.summary()

# Last step: column 4 removed as well
x_opt = x[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()
# The p-value of x1 is printed as 0, but it is not exactly zero — it is just
# very small in magnitude.
# Column 5 ends up with a p-value of 0.06; it is not removed here, the model
# will be evaluated later.

# BUT WE CANNOT CONCLUDE THAT R&D SPEND IS MORE SIGNIFICANT THAN MARKETING SPEND.

# We should evaluate Adj. R-squared rather than R-squared: R-squared never decreases when a
# variable is added, so it is biased when comparing models with different numbers of variables.
# Whether or not to remove the column 5 feature depends on whether the adjusted R-squared
# decreases or increases after removing it. We pick the model with the bigger adjusted R-squared.


0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 03 Jul 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,13:23:54,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [42]:
# NOW THE AUTOMATED VERSION
def backwardElimination(x, sl, target=None):
    """Drop features one at a time until no OLS p-value exceeds `sl`.

    On every pass an OLS model is fitted on the current columns. If the
    largest p-value is above `sl`, that single column is removed and the
    model is refitted; otherwise elimination stops immediately.

    Compared with the earlier version this
    * removes exactly one column per fit (the previous inner loop could delete
      several columns using indices from a stale fit when p-values tied),
    * stops as soon as nothing exceeds `sl` instead of refitting the same
      model for the remaining iterations,
    * no longer builds a summary whose result was discarded.

    Parameters
    ----------
    x : 2-D array-like
        Feature matrix, one column per candidate feature (add the intercept
        column yourself if one is wanted).
    sl : float
        Significance level; a feature is removed while its p-value is > `sl`.
    target : array-like, optional
        Dependent variable. Defaults to the notebook-level `y`, which is what
        the function always used before this parameter existed.

    Returns
    -------
    numpy.ndarray
        `x` restricted to the surviving columns (possibly zero columns).
    """
    if target is None:
        # Backward-compatible default: fall back to the global `y`.
        target = y
    x = np.asarray(x)
    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(target, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Same removal rule as before: only a p-value strictly above sl is dropped.
        if not (pvalues[worst] > sl):
            break
        x = np.delete(x, worst, 1)
    return x
 
# Significance level handed to backwardElimination.
SL = 0.07
# Start from the first six columns of x (indices 0..5).
x_opt = x[:, list(range(6))]
x_modeled = backwardElimination(x_opt, SL)

In [43]:
# Display the feature matrix returned by backwardElimination.
x_modeled

array([[  1.00000000e+00,   1.65349200e+05,   4.71784100e+05],
       [  1.00000000e+00,   1.62597700e+05,   4.43898530e+05],
       [  1.00000000e+00,   1.53441510e+05,   4.07934540e+05],
       [  1.00000000e+00,   1.44372410e+05,   3.83199620e+05],
       [  1.00000000e+00,   1.42107340e+05,   3.66168420e+05],
       [  1.00000000e+00,   1.31876900e+05,   3.62861360e+05],
       [  1.00000000e+00,   1.34615460e+05,   1.27716820e+05],
       [  1.00000000e+00,   1.30298130e+05,   3.23876680e+05],
       [  1.00000000e+00,   1.20542520e+05,   3.11613290e+05],
       [  1.00000000e+00,   1.23334880e+05,   3.04981620e+05],
       [  1.00000000e+00,   1.01913080e+05,   2.29160950e+05],
       [  1.00000000e+00,   1.00671960e+05,   2.49744550e+05],
       [  1.00000000e+00,   9.38637500e+04,   2.49839440e+05],
       [  1.00000000e+00,   9.19923900e+04,   2.52664930e+05],
       [  1.00000000e+00,   1.19943240e+05,   2.56512920e+05],
       [  1.00000000e+00,   1.14523610e+05,   2.6177623