# Multiple Linear Regression

# Importing the libraries

In [59]:
# Working with array
import numpy as np
# Plotting chart, graph
import matplotlib.pyplot as plt
# Import dataset. Create the matrix of features and the dependent variable vector.
# Preprocess dataset
import pandas as pd
# Process missing data
from sklearn.impute import SimpleImputer
# One hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# Splitting
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Print NumPy floating-point arrays with one digit after the decimal point.
# This only changes how values are displayed, not the stored values; object-dtype
# arrays (such as the mixed numeric/string feature matrix below) are not affected.
np.set_printoptions(precision=1)

# Importing the dataset
- Create Data Frame
- Create Matrix of features & Dependent variable vector
    - Matrix of features: the independent variables, i.e. the columns that contain the information used to predict the dependent variable.
    - Dependent variable vector: The last column of dataset.

In [60]:
# Load the CSV file into a pandas DataFrame
data_set = pd.read_csv('50_Startups.csv')

# Matrix of features: every row (':') and every column except the last one (':-1').
# `iloc` selects by integer position; `to_numpy()` converts the selection to a NumPy array.
X = data_set.iloc[:, :-1].to_numpy()

# Dependent variable vector: every row of the last column (position -1), as a NumPy array.
y = data_set.iloc[:, -1].to_numpy()

In [61]:
# Show the feature matrix and the target vector, separated by a divider line.
# A single print call with a newline separator emits the same text as three calls.
print(
    f'X:\n{X}',
    '-----------------------------------------------------------------',
    f'Y:\n{y}',
    sep='\n',
)

X:
[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67

# Encoding categorical data (One hot encoding)
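- We must turn (`encode`) the `string` categories into `number`s, because the model can only work with numerical input.
- One hot encoding: creates one binary column (only 0 and 1) per category, which avoids implying a numerical order between the categories.
- The more categories there are, the more columns are created.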

## Encoding the Independent Variable

In [62]:
# Build a ColumnTransformer that one-hot encodes the categorical column and keeps the rest.
# - transformers: list of (name, transformer, column indexes); here column 3 is encoded
# - remainder='passthrough': columns that are not transformed are kept unchanged
# - sparse_threshold=0: always return a dense array. With the default threshold the
#   transformer may return a SciPy sparse matrix when the result is mostly zeros, and
#   wrapping a sparse matrix in np.array() does not produce a usable 2-D array.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the encoder and replace the matrix of features with its one-hot encoded version.
# np.array() makes sure the result is a plain NumPy array.
# NOTE: this overwrites X, so run this cell only once after loading the dataset;
# running it again would try to encode column 3 of the already-encoded matrix.
X = np.array(column_transformer.fit_transform(X=X))

In [63]:
# Inspect the encoded matrix of features: the one-hot columns come first,
# followed by the columns that were passed through unchanged.
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

# Splitting (random) the dataset into the Training set and Test set
- Training set: Train ML model on existing observations. -> More data than test set (80%) -> Give the model more chance to understand and learn the correlations in the dataset.
- Test set: Evaluate the performance of the model on new observations (future data).
- Four parts:
    - X_train, X_test: Matrix of features
    - y_train, y_test: Dependent variable
- Why? The workflow needs all 04 parts:
    - Training: X_train, y_train -> `fit` method
    - Prediction|Inference: X_test -> `predict` method; the predictions are then compared with y_test to evaluate the model

In [64]:
# Randomly hold out 20% of the observations as the test set.
# A fixed random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [65]:
# Print the four parts of the split, with a divider line between consecutive parts
divider = '----------------------------------------------'

print(f'X_train\n{X_train}')
print(divider)
print(f'X_test\n{X_test}')
print(divider)
print(f'y_train\n{y_train}')
print(divider)
print(f'y_test\n{y_test}')

X_train
[[1.0 0.0 0.0 63408.86 129219.61 46085.25]
 [1.0 0.0 0.0 38558.51 82982.09 174999.3]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 28663.76 127056.21 201126.82]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 23640.93 96189.63 148001.11]
 [1.0 0.0 0.0 0.0 116983.8 45173.06]
 [0.0 1.0 0.0 75328.87 144135.98 134050.07]
 [0.0 1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 27892.92 84710.77 164470.71]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 0.0 1.0 1000.23 124153.04 1903.93]
 [0.0 0.0 1.0 77044.01 99281.34 140574.81]
 [0.0 1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 0.0 55493.95 103057.49 214634.81]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 0.0 61994.48 115641.28 91131.24]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 1.0 0.0 66051.52 182645.56 118148.2]
 [1.0 0.0 0.0 22177.74 154806.14 28334.72]
 [1.0 0.0 0.0 46426.07 157693.92 210797.67]
 [0.

# Training the Multiple Linear Regression model on the Training set
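- We do not need to drop one of the one-hot columns manually: scikit-learn's `LinearRegression` solves the least-squares problem in a way that tolerates the perfectly collinear dummy columns, so the `Dummy variable trap` does not affect the predictions.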

In [66]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares model with scikit-learn's default settings
regressor = LinearRegression()

# Fit the model on the training features and training targets,
# printing a message before and after so progress is visible.
print('Training model ...')
regressor.fit(X_train, y_train)
print('Training completed!')

Training model ...
Training completed!


# Predicting the Test set results

In [67]:
# Vector of predictions: the model's predicted value for each row of the test features
y_predicted = regressor.predict(X_test)

# Show the real test targets, then the predictions, each followed by a divider line
print(f'y_test\n{y_test}\n')
print('-----------------------------------------------------------------')
print(f'y_predicted\n{y_predicted}\n')
print('-----------------------------------------------------------------')

# Display the predicted values and the real values side by side.
# column_stack turns each 1-D vector into a column and joins them horizontally,
# so column 0 holds the predictions and column 1 holds the real values.
print(np.column_stack((y_predicted, y_test)))

y_test
[105008.3  96479.5  78239.9  81229.1 191050.4 182902.   35673.4 101004.6
  49490.8  97483.6]

-----------------------------------------------------------------
y_predicted
[114664.4  90593.2  75692.8  70221.9 179790.3 171576.9  49753.6 102276.7
  58649.4  98272. ]

-----------------------------------------------------------------
[[114664.4 105008.3]
 [ 90593.2  96479.5]
 [ 75692.8  78239.9]
 [ 70221.9  81229.1]
 [179790.3 191050.4]
 [171576.9 182902. ]
 [ 49753.6  35673.4]
 [102276.7 101004.6]
 [ 58649.4  49490.8]
 [ 98272.   97483.6]]
