### Importing libraries

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing the dataset

In [56]:
# Read the raw startups table from disk
startups_data = pd.read_csv("50_startups.csv")

# Predictors are every column except the last one; the target is the last column
X = startups_data.iloc[:, :-1].to_numpy()
y = startups_data.iloc[:, -1].to_numpy()

# Preview the first ten rows
startups_data.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


### Assumptions of linear regression

In [57]:
# 1. Linearity
# 2. Homoscedasticity - variance of residuals is the same for any value of X
# 3. Multivariate normality 
# 4. Independence of errors
# 5. Lack of multicollinearity - independent variables are not linear combinations of each other

### Model Building Techniques

In [58]:
# 1. All-in: 
# Use all variables (only done if there is certainty)

# 2. Forward Selection:
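# Select a significance level (SL) to enter the model
# Fit every single-predictor model and keep the predictor with the lowest p-val
# Fit all models that add one more of the remaining predictors to the current model
# Consider the added predictor with the lowest p-val. If p < SL, keep it and repeat the previous step
# Otherwise, the model is complete (keep the model from before the last addition)
# Stop when no remaining predictor has p < SL or there are no more predictors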


# 3. Backward Elimination: 
# Select a significance level
# Fit the full model with all possible predictors
# Consider predictor with highest p-value; if p > SL, remove it and refit the model without it
# Repeat until all remaining p-values are below SL or all predictors have been removed



# 4. Step-wise/bidirectional elimination:
# Select significance level to enter (SL_enter) & to stay (SL_stay) in the model
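# Fit the minimal model with the predictor that has the lowest p-val
# Forward step: add the remaining predictor with the lowest p-val, if p < SL_enter
# Backward step: perform all steps of backward elimination using SL_stay
# Repeat the forward and backward steps until no variable can enter or leave the model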


# 5. Score Comparison
# Select goodness of fit criterion (e.g. AIC)
# Construct all possible regression models: 2^n - 1 combinations for n predictors
# Select the model with the best criterion

### Encoding categorical data

In [59]:
# One-hot encoding a categorical column with n categories produces n dummy columns.
# Exactly one dummy is 1 in every row, so the n dummies always sum to 1:
# any one of them is fully determined by the others (and by the intercept).
# Including all n therefore creates perfect multicollinearity (the "dummy variable trap").
# In general, given n dummy variables, the model should include only
# (n-1) of them.


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical "State" column (position 3 of the predictors)
# and pass the numeric columns through unchanged. The dummy columns come first
# in the output, followed by the passthrough columns.
# sparse_threshold=0 forces a dense result: np.array() on a SciPy sparse matrix
# would silently produce a 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [3])],
    remainder="passthrough",
    sparse_threshold=0,
)

# Encode from the raw predictors rather than from X itself, so that re-running
# this cell does not try to encode an already-encoded X (where column 3 would be
# R&D Spend, not State).
X = np.array(ct.fit_transform(startups_data.iloc[:, :-1].values))
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

### Feature Scaling

In [60]:
# No need for feature scaling in multiple linear regression
# this is because the coefficients will scale the features
# by design

### Splitting dataset into training and testing set

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Training the multiple linear regression model on the training set

In [62]:
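# Including all three state dummies makes the design matrix collinear with the
# intercept (the "dummy variable trap"). LinearRegression does not drop a dummy
# for us; its least-squares solver still returns a valid fit, so predictions are
# unaffected, but the individual dummy coefficients are not uniquely determined.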

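# Note: LinearRegression does NOT perform any model selection.
# It fits ordinary least squares on every column of X_train and reports no p-values,
# so this is the "All-in" technique; backward elimination etc. would need p-values
# from elsewhere (e.g. statsmodels).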

from sklearn.linear_model import LinearRegression

# Ordinary least squares on all six encoded feature columns
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Predicting the testing set results

In [63]:
# Predict profits for the held-out startups
y_pred = regressor.predict(X_test)

# Show all numerical values to 2 decimal points
np.set_printoptions(precision=2)

# Place predictions (left column) next to the actual profits (right column)
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


### Using the model to make a single prediction

In [64]:
# Hypothetical startup to score:
#   State                = "California" -> dummy vector (1, 0, 0)
#   R&D Spend            = 160000
#   Administration Spend = 130000
#   Marketing Spend      = 300000
# The features follow the encoded column order: state dummies first, then the spends.
new_startup = [[1, 0, 0, 160000, 130000, 300000]]

# Estimate the startup's profit with the fitted model
estimated_profit = regressor.predict(new_startup)
print(estimated_profit)

# The estimated profit is $181566.92

[181566.92]


### Extracting the model coefficients

In [67]:
# Slopes, one per column of the encoded feature matrix, in column order:
# the three State dummies (California, Florida, New York), then R&D Spend,
# Administration and Marketing Spend.
print(regressor.coef_) # All our predictor slopes

[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]


In [68]:
# Constant term of the fitted model: the predicted profit when every feature is 0
print(regressor.intercept_) # The model intercept

42467.529248581035


In [69]:
# Our model can be described by:
# Profit=86.6×Dummy State 1 − 873×Dummy State 2
# + 786×Dummy State 3 + 0.773×R&D Spend + 0.0329×Administration
# + 0.0366×Marketing Spend + 42467.53