### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing dataset

In [2]:
## load the raw dataset; the relative path is resolved against the notebook's working directory
data = pd.read_csv(filepath_or_buffer='Data.csv')

In [5]:
## show the loaded frame (10 rows; row 4 has no Salary and row 6 has no Age)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


###### separate dependent and independent variables

In [16]:
## feature matrix (independent variables): every column except the last one
X = (
    data
    .iloc[:, :-1]   ### ":-1" stops just before the final column
    .values
)

In [17]:
## inspect the feature matrix: Country, Age, Salary held in one object-dtype array
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [45]:
## vector of the dependent variable (the target)
y = data.iloc[:, -1].values   ### -1 selects the last column (Purchased), matching the ":-1" slice used for X

In [46]:
## inspect the raw 'Yes'/'No' target labels
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handling missing data

In [22]:
from sklearn.preprocessing import Imputer   ### imputer (impute = assign) class will allow us to take care of missing data

#### now that we imported the imputer class, we need to create an object of the class
###### http://lijiancheng0614.github.io/scikit-learn/modules/generated/sklearn.preprocessing.Imputer.html

In [23]:
## IN OUR DATASET MISSING VALUES ARE CALLED NaN
## for the strategey we want to replace nan by the mean  (other startegies: median, most_frequent)
## for axis=0 impute along columns, 1 impute along rows
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

###### now we fit our imputer object to the feature matrix X (in fact not the whole matrix, only the columns that contain missing data: Age and Salary)

In [26]:
## learn the column means of Age and Salary (columns 1 and 2; the upper bound 3 is excluded)
## and write the imputed values back into those same columns in one step
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

### Encoding Categorical Data

###### why the encoding: our dataset has two categorical variables, Country and Purchased. Machine learning models operate on numbers, so we have to encode the categorical variables as numbers

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [36]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  ### class labelEncoder

In [30]:
## encoder object that will turn the country names in column 0 of X into integer codes
labelencoder_X = LabelEncoder()

In [33]:
## learn the distinct country names, then overwrite column 0 with their integer codes
X[:, 0] = labelencoder_X.fit(X[:, 0]).transform(X[:, 0])

In [35]:
## Country is now an integer code, and the former NaN cells hold their column means
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

##### since ML models are based on mathematics, we are good because we transformed our categorical variable Country into a numerical one. However, with France=0, Germany=1, Spain=2 (LabelEncoder assigns the codes in alphabetical order), an ML model may think that Germany is greater than France and that Spain is greater than both of them, and that isn't the case. We need to figure out how to prevent the model from making such a misunderstanding

#### we will use the DUMMY ENCODING: with the OneHotEncoder, check below
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html


In [37]:
from sklearn.compose import ColumnTransformer  ### needed because OneHotEncoder no longer accepts categorical_features

## OneHotEncoder(categorical_features=[0]) was removed in scikit-learn 0.22, so the column
## selection is now done by wrapping the encoder in a ColumnTransformer:
##   - the encoder is applied to column 0 (Country) only
##   - remainder='passthrough' keeps Age and Salary, placed after the dummy columns as before
##   - sparse_threshold=1.0 keeps the output sparse, so calling .toarray() on it still works
onehotencoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [38]:
## no column argument here: the column to encode was already chosen when onehotencoder was built
X = (
    onehotencoder
    .fit_transform(X)
    .toarray()   ### convert the encoder's output to a regular dense array
)

In [43]:
## five columns now: three 0/1 dummy columns for the country, followed by Age and Salary
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

#### so we can see that the first column (countries) was replaced by three columns, one for each country; now our model can distinguish each country without assuming any order between them

#### for the predicted variable Y we only need the labelencoder class

In [44]:
## dedicated LabelEncoder instance for the target vector y
labelencoder_Y = LabelEncoder()

In [47]:
## learn the distinct target labels, then replace each label by its integer code
y = labelencoder_Y.fit(y).transform(y)

In [48]:
## encoded target: 'No' became 0 and 'Yes' became 1
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

### Splitting the dataset to train and test sets
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [53]:
## older tutorials use `from sklearn.cross_validation import train_test_split`;
## that module was deprecated and later removed, so train_test_split is imported from model_selection instead
from sklearn.model_selection import train_test_split

In [54]:
## hold out 20% of the rows for testing; random_state=0 makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Feature Scaling

###### why: it is done so that the features are on comparable scales. If a feature’s variance is orders of magnitude larger than the variance of the other features, that particular feature might dominate the others in the dataset, which is not something we want happening in our model
###### The aim here is to give every feature zero mean and unit variance (this recentres and rescales the data; it does not make it Gaussian). There are many ways of doing this; the two most popular are standardisation and normalisation.

https://medium.com/@contactsunny/why-do-we-need-feature-scaling-in-machine-learning-and-how-to-do-it-using-scikit-learn-d8314206fe73

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
## the scaler learns its mean and standard deviation from the training rows only ...
sc_X = StandardScaler().fit(X_train)
## ... and the same fitted statistics are then applied to both sets (the test set is never fitted on)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [61]:
## the two held-out rows, scaled with the statistics learned from the training set
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

In [62]:
## the eight training rows after standardisation
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])