Import libraries and dataset

In [4]:
import pandas as pd
import numpy as np

# Load the raw table; the path is relative, so Data.csv is read from the
# current working directory.
dataset = pd.read_csv("Data.csv")

# Split the frame into the feature matrix (every column except the last) and
# the target vector (the last column, `Purchased` in the table shown below).
# Both slices are expressed relative to the last column so they stay in
# agreement if the number of feature columns ever changes; a hard-coded
# positional index for the target would silently pick a feature instead.
data_matrix = dataset.iloc[:, :-1].values
independent_terms = dataset.iloc[:, -1].values

dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Treatment of NAs

In [5]:
from sklearn.impute import SimpleImputer 

# Mean imputation: each NaN in a numeric column is replaced by that column's mean.
# NOTE(review): the imputer is fitted on the whole data matrix, i.e. before the
# train/test split done later in this notebook -- confirm that this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1 and 2 (Age, Salary) are the numeric ones; column 0 holds the
# country names, for which a mean is undefined.
numeric_block = data_matrix[:, 1:3]

# Learn the column means, then write the filled-in values back into the matrix.
data_matrix[:, 1:3] = imputer.fit(numeric_block).transform(numeric_block)

# Rebuild a labelled frame purely to inspect the result of the imputation.
result_data_treatment_nan = pd.DataFrame(
    dict(
        Country=data_matrix[:, 0],
        Age=data_matrix[:, 1],
        Salary=data_matrix[:, 2],
        Purchased=independent_terms,
    )
)
result_data_treatment_nan

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Encode categorical data

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Step 1: replace the country names in column 0 with integer codes.
# NOTE(review): OneHotEncoder may be able to consume the string column
# directly, which would make this step unnecessary -- verify against the
# scikit-learn documentation before removing it.
labelencoder_data = LabelEncoder()
country_codes = labelencoder_data.fit_transform(data_matrix[:, 0])
data_matrix[:, 0] = country_codes

# ! Integer codes alone would suggest an ordering between countries that does
# not exist. One-hot encoding avoids this by giving every category its own
# binary column (a "dummy variable").

# Step 2: one-hot encode column 0 and pass every other column through unchanged.
# The frame built below reads the dummy columns from positions 0-2 and
# Age / Salary from positions 3-4 of the transformed matrix.
one_hot_step = ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
column_transform = ColumnTransformer([one_hot_step], remainder='passthrough')
data_matrix = column_transform.fit_transform(data_matrix)

# Step 3: the target column gets its own label encoder (the output below
# shows No -> 0 and Yes -> 1).
labelencoder_ind_terms = LabelEncoder()
independent_terms = labelencoder_ind_terms.fit_transform(independent_terms)

# Labelled view of the encoded data, for inspection only.
encoded_column_names = ['France', 'German', 'Spain', 'Age', 'Salary']
encoded_columns = {
    label: data_matrix[:, position]
    for position, label in enumerate(encoded_column_names)
}
encoded_columns['Purchased'] = independent_terms
result_data_encode = pd.DataFrame(encoded_columns)
result_data_encode

Unnamed: 0,France,German,Spain,Age,Salary,Purchased
0,1.0,0.0,0.0,44.0,72000.0,0
1,0.0,0.0,1.0,27.0,48000.0,1
2,0.0,1.0,0.0,30.0,54000.0,0
3,0.0,0.0,1.0,38.0,61000.0,0
4,0.0,1.0,0.0,40.0,63777.777778,1
5,1.0,0.0,0.0,35.0,58000.0,1
6,0.0,0.0,1.0,38.777778,52000.0,0
7,1.0,0.0,0.0,48.0,79000.0,1
8,0.0,1.0,0.0,50.0,83000.0,0
9,1.0,0.0,0.0,37.0,67000.0,1


Divide the dataset into training and testing sets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state is fixed (0 here) so that
# the same split is produced every time the cell is run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_matrix,
    independent_terms,
    test_size=0.2,
    random_state=0,
)

# Labelled views of both splits, for inspection only. Column positions follow
# the layout of the encoded matrix: three dummy columns, then Age and Salary.
split_column_names = ['France', 'German', 'Spain', 'Age', 'Salary']

train_columns = {
    label: X_train[:, position]
    for position, label in enumerate(split_column_names)
}
train_columns['Purchased'] = Y_train
result_data_train = pd.DataFrame(train_columns)

test_columns = {
    label: X_test[:, position]
    for position, label in enumerate(split_column_names)
}
test_columns['Purchased'] = Y_test
result_data_test = pd.DataFrame(test_columns)

result_data_train

Unnamed: 0,France,German,Spain,Age,Salary,Purchased
0,0.0,1.0,0.0,40.0,63777.777778,1
1,1.0,0.0,0.0,37.0,67000.0,1
2,0.0,0.0,1.0,27.0,48000.0,1
3,0.0,0.0,1.0,38.777778,52000.0,0
4,1.0,0.0,0.0,48.0,79000.0,1
5,0.0,0.0,1.0,38.0,61000.0,0
6,1.0,0.0,0.0,44.0,72000.0,0
7,1.0,0.0,0.0,35.0,58000.0,1


In [8]:
# Show the held-out test rows assembled in the previous cell.
result_data_test

Unnamed: 0,France,German,Spain,Age,Salary,Purchased
0,0.0,1.0,0.0,30.0,54000.0,0
1,0.0,1.0,0.0,50.0,83000.0,0


Variable scaling

$$X_{stan} = \frac{x - \text{mean}(x)}{\text{standard\_deviation}(x)}$$

$$X_{norm} = \frac{x - \min(x)}{\max(x) - \min(x)}$$

In [9]:
from sklearn.preprocessing import StandardScaler

# ! Features have to be standardized (or normalized) so that they all live in
# a comparable range of values.
# The dummy (one-hot) columns 0-2 are left untouched; only Age and Salary
# (columns 3 and 4) are scaled.
numeric_columns = slice(3, 5)

scaler = StandardScaler()

# Learn the scaling parameters from the training set only, then scale it.
scaler.fit(X_train[:, numeric_columns])
X_train[:, numeric_columns] = scaler.transform(X_train[:, numeric_columns])

# Scale the test set with the transformation learned from the training set.
X_test[:, numeric_columns] = scaler.transform(X_test[:, numeric_columns])


# Labelled views of the scaled splits, for inspection only.
processed_column_names = ['France', 'German', 'Spain', 'Age', 'Salary']

train_processed = {
    label: X_train[:, position]
    for position, label in enumerate(processed_column_names)
}
train_processed['Purchased'] = Y_train
result_train_processing = pd.DataFrame(train_processed)

test_processed = {
    label: X_test[:, position]
    for position, label in enumerate(processed_column_names)
}
test_processed['Purchased'] = Y_test
result_test_processing = pd.DataFrame(test_processed)

result_train_processing

Unnamed: 0,France,German,Spain,Age,Salary,Purchased
0,0.0,1.0,0.0,0.263068,0.123815,1
1,1.0,0.0,0.0,-0.253501,0.461756,1
2,0.0,0.0,1.0,-1.975398,-1.530933,1
3,0.0,0.0,1.0,0.052614,-1.11142,0
4,1.0,0.0,0.0,1.640585,1.720297,1
5,0.0,0.0,1.0,-0.081312,-0.167514,0
6,1.0,0.0,0.0,0.951826,0.986148,0
7,1.0,0.0,0.0,-0.597881,-0.482149,1


In [10]:
# Show the test split after scaling with the scaler fitted on the training data.
result_test_processing

Unnamed: 0,France,German,Spain,Age,Salary,Purchased
0,0.0,1.0,0.0,-1.458829,-0.901663,0
1,0.0,1.0,0.0,1.984964,2.139811,0
