In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [2]:
dataset = pd.read_csv("data.csv")

In [3]:
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

Select all rows: ```:```  
Select all columns except last: ```:-1```

In [4]:
dataset.iloc[:, :-1]

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


Select all rows  
Select the 4th column (the classes)

In [7]:
dataset.iloc[:, 3]

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

Take all the rows, but start from the second column till the last one.  
This is done to ensure the mean isn't applied on strings as well (first column)

In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean", verbose=0)
imputer = imputer.fit(x[:, 1:-1])
x[:, 1:-1] = imputer.transform(x[:, 1:-1])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]




In [25]:
# Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# We could do this, see below cell for reason why we don't
# label_encode_x = LabelEncoder()
# x[:, 0] = label_encode_x.fit_transform(x[:, 0])

column_transformer = ColumnTransformer([("Country", OneHotEncoder(), [0])], remainder="passthrough")
x = column_transformer.fit_transform(x)

label_encode_y = LabelEncoder()
y = label_encode_y.fit_transform(y)

Data for countries is encoded into separate columns.  

We use ColumnTransformer with OneHotEncoder to make that happen. The reason we do not use LabelEncoder (we could) is because the model can interpret albeit numerically labeled data, with intepretations where there could be correlation (when there is not).

In [26]:
print(x)

[[1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


In [15]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [28]:
# Splitting the data
from sklearn.model_selection import train_test_split

# Pass in test_size=0.2 for 20% of data used for training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [29]:
# Feature scaling (standardising the data)
from sklearn.preprocessing import StandardScaler
standard_scaler_x = StandardScaler()
x_train = standard_scaler_x.fit_transform(x_train)
x_test = standard_scaler_x.fit_transform(x_test)

In [33]:
print(x)
print(x_train)

[[1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]
[[-1.          1.         -1.          2.64575131 -0.77459667  0.26306757
   0.12381479]
 [ 1.         -1.          1.         -0.37796447 -0.77459667 -0.25350148
   0.46175632]
 [-1.          1.         -1.         -0.37796447  1.29099445 -1.97539832
  -1.53093341]
 [-1.          1.         -1.         -0.37796447  1.29099445  0.05261351
  -1.11141978]
 [ 1.         -1.          1.         -0.37796447 -0.77459667  1.64058505
   1.7202972 ]
 [-1.          1.         -1.         -0.37796447  1.29099445 -0.0813118
  -0.16751412]
 [ 1.         -1.          1.         -0.37796447 -0.77459667  0.95182631
   0.9861