import all the necessary libraries

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


Loading the dataset and splitting the dependent and independent variables

In [38]:
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1].values

In [39]:
dataset.head

<bound method NDFrame.head of    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes>

In [40]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [41]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Working with missing values:here some values are missing(nan).to fix this:

In [42]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan,strategy = "mean")
imputer = imputer.fit(X[:,1:3])


Look below all the nan values are filled with the mean value of the whole row

In [43]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Now in case of feeding a data into a machine learning model all the features should be in a numerical format.In the above case the first row contry names are not in numerical format and also the dependent variable Y is also not in numerical format .So we are going to encode the label into a numerical format.

In [44]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X[:,0] = labelencoder.fit_transform(X[:,0])
Y = labelencoder.fit_transform(Y)

In [45]:
X


array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, nan],
       [0, 35.0, 58000.0],
       [2, nan, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

But here we have to apply this encoding only for Y variable.Because if we apply to X then,
the model will consider those values representing contry as a separate features.
(i.e) it will consider Germany is greater or higher than France as
the value of Germany is encoded as 1 and for France it is 0. 
Similarly for Spain and Germany(Spain > Germany).
This will completely produce a different output.In order to fix this we have to encode this by creating a dummy variables

In [46]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

all the Y labels are now encoded into a numerical format
the country names are represented as 
France = 0
Germany = 1
Spain =2

in the above steps i have applied the lable encoder to X also.So I am reassigning the value of X from the beginning 

In [47]:
X = dataset.iloc[:,:-1].values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan,strategy = "mean")
imputer = imputer.fit(X[:,1:3])


In [48]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [50]:
from sklearn.compose import make_column_transformer
encoder_onehot = ColumnTransformer([('my_ohe', OneHotEncoder(), [0])], remainder='passthrough')
X = encoder_onehot.fit_transform(X) 

In [51]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, nan],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, nan, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

All the features are encoded and ready for training.Now we have to split the dataset for training purpose

In [53]:
from sklearn.model_selection import train_test_split
x_train,x_valid,y_train,y_valid = train_test_split(X,Y,test_size = 0.2,random_state=0)

In [54]:
x_train

array([[0.0, 1.0, 0.0, 40.0, nan],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, nan, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [55]:
x_valid

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

In [56]:
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [57]:
y_valid

array([0, 0])

Feature scaling.During the model training all the features should be in the same scale.As most of the machine learning model are
based on euclidian distance while calculating the euclidian distance the features may differ in larger values.In that case the 
lowest values will not be considered

In [65]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_valid = sc_x.transform(x_valid)

In [66]:
x_train


array([[-1.        ,  2.64575131, -0.77459667,  0.25315802,         nan],
       [ 1.        , -0.37796447, -0.77459667, -0.23014365,  0.44897083],
       [-1.        , -0.37796447,  1.29099445, -1.84114924, -1.41706417],
       [-1.        , -0.37796447,  1.29099445,         nan, -1.0242147 ],
       [ 1.        , -0.37796447, -0.77459667,  1.54196248,  1.62751925],
       [-1.        , -0.37796447,  1.29099445, -0.0690431 , -0.14030338],
       [ 1.        , -0.37796447, -0.77459667,  0.89756025,  0.94003267],
       [ 1.        , -0.37796447, -0.77459667, -0.55234477, -0.43494049]])