In [0]:
from sklearn.preprocessing import Imputer
#used for data preprocessing

##Missing Values

In [2]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis=0)
# use mean median or mode according to the need.
#axis = 0 means that we take the mean of the column.
#axis = 1 means that we are going to take the mean of the row.



In [0]:
#get data
import pandas as pd
dataset = pd.read_csv('Data.csv')

In [0]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:,3].values

In [5]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
X[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [0]:
imputer = imputer.fit(X[:, 1:3])
#index in python start at 0, so we are going to take the second column and the third column. 
#We have to give +1 for the end column so that it takes it till 2.

In [0]:
X[:, 1:3] = imputer.transform(X[:, 1:3])
#transform is used to apply the changes.

In [9]:
X[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.77777777777778, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

##Categorical Data

We would only want numbers in our dataset. Let's see how we can do that with categorical data.

In [0]:
from sklearn.preprocessing import LabelEncoder
# this is used for data transformation of categorical data
labelEncoder_X = LabelEncoder()

In [11]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Let's encode the Countries column which is Spain, France and Germany and assign a number to each value.

In [0]:
X[:, 0] =  labelEncoder_X.fit_transform(X[:, 0])

In [13]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

However, there is a problem with this. Since 1 is greater than 0 and 2 is greater than 1 and 2, the algorithm might tthink that Spain nis greater than France and Germany. So this is not exactly the right way. 

Id it would had been small, large and medium, it still could had made sense since large is greater than small and medium and so on. But here it does not make any sense. 

## Dummy Encoding / One Hot Encoding

Instead of having one column here, we are going to have 3(number of categories) columns representing the value. 

In [0]:
from sklearn.preprocessing import OneHotEncoder
# this is used for data transformation of categorical data


In [0]:
#help(OneHotEncoder())
onehotencoder = OneHotEncoder(categorical_features = [0])

In [0]:
#help(OneHotEncoder())

In [17]:
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [18]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


Let's take care of the Purchased column now. So we will be working with y.
However, we do not need to use one hot encoder here, we can simplyt do this with labelEncoder as there are just two categories. 

In [0]:
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)

In [20]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Since the above code has depreciated, you can use the below code as well.

Here, you do not need to use the labelEncoder and can directly work with the one hot encoder. 

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
coltransf_X = ColumnTransformer([("one_hot_encoder", OneHotEncoder(), [0])], remainder='passthrough')
X = coltransf_X.fit_transform(X)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [22]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5
0,0.0,1.0,0.0,0.0,44.0,72000.0
1,1.0,0.0,0.0,1.0,27.0,48000.0
2,1.0,0.0,1.0,0.0,30.0,54000.0
3,1.0,0.0,0.0,1.0,38.0,61000.0
4,1.0,0.0,1.0,0.0,40.0,63777.777778
5,0.0,1.0,0.0,0.0,35.0,58000.0
6,1.0,0.0,0.0,1.0,38.777778,52000.0
7,0.0,1.0,0.0,0.0,48.0,79000.0
8,1.0,0.0,1.0,0.0,50.0,83000.0
9,0.0,1.0,0.0,0.0,37.0,67000.0


In [0]:
?ColumnTransformer

##Data Splitting

Split data into test and training set

In [0]:
#Splitting the data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

##Feature Scaling

Convert the data into the same scale

In [0]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

In [0]:
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# we have already fit the data with the train set so we do not need to fit it again
#so only transform is run on the test set

In [33]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5
0,1.0,-1.0,2.645751,-0.774597,0.263068,0.123815
1,-1.0,1.0,-0.377964,-0.774597,-0.253501,0.461756
2,1.0,-1.0,-0.377964,1.290994,-1.975398,-1.530933
3,1.0,-1.0,-0.377964,1.290994,0.052614,-1.11142
4,-1.0,1.0,-0.377964,-0.774597,1.640585,1.720297
5,1.0,-1.0,-0.377964,1.290994,-0.081312,-0.167514
6,-1.0,1.0,-0.377964,-0.774597,0.951826,0.986148
7,-1.0,1.0,-0.377964,-0.774597,-0.597881,-0.482149


In [34]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5
0,1.0,-1.0,2.645751,-0.774597,-1.458829,-0.901663
1,1.0,-1.0,2.645751,-0.774597,1.984964,2.139811


In [0]:
#we do not need to transform the