# Importing the libraries

In [1]:
# Working with array
import numpy as np
# Plotting chart, graph
import matplotlib.pyplot as plt
# Import dataset. Create the matrix of features and the dependent variable vector.
# Preprocess dataset
import pandas as pd
# Process missing data
from sklearn.impute import SimpleImputer
# One hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# Splitting
from sklearn.model_selection import train_test_split
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Importing the dataset
- Create Data Frame
- Create Matrix of features & Dependent variable vector
    - Matrix of features: Independent variable. The variables containing some informations with which you can predict what you want to predict. The columns with which you're going to predict the dependent variable.
    - Dependent variable vector: The last column of dataset.

In [3]:
# Create Data frame
data_set = pd.read_csv('Data.csv')

# Matrix of Features, all the columns of the dataset except the last one
# iloc: Locate indexes, take the indexes of the column we want to extract from the dataset, we can get all the rows
# Select all rows, take all the columns except the last one
# ':': Taking every in the range, this case all the rows
# ':-1': Take the indexes from 0 to -1 (Excluding the last index)
# `value`: Taking the values
X = data_set.iloc[:, :-1].values

# Dependent variable vector (The last column of the dataset)
# `-1`: Get the last column
y = data_set.iloc[:, -1].values

In [4]:
print(f'{X}')
print('-----------------------------------------------------------------')
print(f'{y}')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
-----------------------------------------------------------------
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Taking care of missing data
- Replace the missing value by the average of all values in the column in which the data is missing.
- Using instance of SimpleImputer to handle missing data.

In [6]:
# Create the object
# `np.nan`: Empty value
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Connect the object to the matrix of features
# Column of X with NUMERICAL value
# Look at all the rows, column indexed 1, 2
imputer.fit(X=X[:, 1:3])

# Do the replacement of the missing values
# Note: The method only returns the columns specified, not all the matrix of features
X[:, 1:3] = imputer.transform(X=X[:, 1:3])

In [7]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding categorical data (One hot encoding
- We must turn (`encode`) the `string` categories into `number`
- One hot encoding: Creating Binary vector (Only 0 and 1), avoid numerical order
- The more categories, the more columns

## Encoding the Independent Variable

In [9]:
# Create an object of the Column Transformer class
# Arguments: Kind of transformation, indexes of the column we want to transform, the columns we want to keep
# `passthrough`: Keep the columns that won't be applied transformation
column_transformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

# Return the new matrix of features that one hot encoded
# We update the current matrix of features
# The method doesn't return a numpy array -> Force the output of this method to be numpy array
X = np.array(column_transformer.fit_transform(X=X))

In [10]:
print(X)

[[0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


## Encoding the Dependent Variable

In [13]:
# Create object
label_encoder = LabelEncoder()

y = label_encoder.fit_transform(y=y)

In [14]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting (random) the dataset into the Training set and Test set
- Training set: Train ML model on existing observations. -> More data than test set (80%) -> Give the model more chance to understand and learn the correlations in the dataset.
- Test set: Evaluate the performance of the model on new observations (future data).
- Four parts:
    - X_train, X_test: Matrix of features
    - y_train, y_test: Dependent variable
- Why? The ML model expecting all of 04 parts as input
    - Training: X_train, y_train -> fit method
    - Prediction|Inference: X_test, y_test -> predict method

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [17]:
print(X_train)
print('----------------------------------------------')
print(X_test)
print('----------------------------------------------')
print(y_train)
print('----------------------------------------------')
print(y_test)

[[1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]
----------------------------------------------
[[1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]
----------------------------------------------
[0 1 0 0 1 1 0 1]
----------------------------------------------
[0 1]


# Feature Scaling
- Scale all the features to make sure they all take values in the same scale/range.
- Prevent one feature to dominate the other, avoid some features to be dominated by other features in such a way that they're not even considered by the ML model.
- Get the mean and standard deviation of the feature
- Why apply Feature Scaling after Splitting? Test set: A brand new set. Not relate to training set. Prevent infomation leakage on the test set.
- Do not apply to Binary-valued variables

In [18]:
# Create object of Feature Scaling class
scaler = StandardScaler()
# `fit`: For each feature of X_train, compute the mean & the standard deviation of the feature
# `transform`: Apply Standardisation, transform each of the value of each feature into the result of Standardisation formula.
X_train[:, 3:] = scaler.fit_transform(X_train[:, 3:])
X_test[:, 3:] = scaler.transform(X_test[:, 3:])

In [19]:
print(X_train)
print('----------------------------------------------')
print(X_test)

[[1.0 0.0 0.0 1.2909944487358056 -0.19159184384578545 -1.0781259408412425]
 [1.0 0.0 1.0 -0.7745966692414834 -0.014117293757057777
  -0.07013167641635372]
 [0.0 1.0 0.0 -0.7745966692414834 0.566708506533324 0.633562432710455]
 [1.0 0.0 0.0 1.2909944487358056 -0.30453019390224867
  -0.30786617274297867]
 [1.0 0.0 0.0 1.2909944487358056 -1.9018011447007988 -1.420463615551582]
 [0.0 1.0 0.0 -0.7745966692414834 1.1475343068237058 1.232653363453549]
 [1.0 0.0 1.0 -0.7745966692414834 1.4379472069688968 1.5749910381638885]
 [0.0 1.0 0.0 -0.7745966692414834 -0.7401495441200351 -0.5646194287757332]]
----------------------------------------------
[[1.0 0.0 1.0 -0.7745966692414834 -1.4661817944830124 -0.9069571034860727]
 [0.0 1.0 0.0 -0.7745966692414834 -0.44973664397484414 0.2056403393225306]]
