In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [4]:
# Load the raw data and preview the first rows.
csv_path = "./data.csv"
data_set = pd.read_csv(csv_path)
data_set.head()

Unnamed: 0,State,Age,Salary,Purchased
0,California,42.0,60000.0,No
1,Florida,29.0,55000.0,Yes
2,Texas,27.0,72000.0,No
3,Florida,26.0,48000.0,No
4,Texas,39.0,65111.11111,Yes


In [5]:
# Feature matrix: every column except the last one, as a NumPy array.
feature_columns = data_set.iloc[:, :-1]
X = feature_columns.values
X

array([['California', 42.0, 60000.0],
       ['Florida', 29.0, 55000.0],
       ['Texas', 27.0, 72000.0],
       ['Florida', 26.0, 48000.0],
       ['Texas', 39.0, 65111.111110000005],
       ['California', 34.0, 54000.0],
       ['Florida', 36.88888889, 57000.0],
       ['California', 46.0, 80000.0],
       ['Texas', 51.0, 85000.0],
       ['California', 38.0, 75000.0]], dtype=object)

In [6]:
# Target vector: the last column of the frame (Purchased).
# Selecting it with -1 instead of a hard-coded 3 mirrors the ':-1' slice used
# for the feature matrix, so features and target stay in sync if the CSV
# gains or loses a feature column.
Y = data_set.iloc[:, -1].values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_set.describe()

Unnamed: 0,Age,Salary
count,10.0,10.0
mean,36.888889,65111.111111
std,8.171012,12350.838428
min,26.0,48000.0
25%,30.25,55500.0
50%,37.444444,62555.555555
75%,41.25,74250.0
max,51.0,85000.0


In [9]:
# Imputer that replaces NaN entries with the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [12]:
# Indexing is X[rows, cols].
# The lower bound of a slice is included,
# the upper bound is excluded: 1:3 selects columns 1 and 2 (Age, Salary).
# fit() only learns the per-column statistics; X is not modified by this call.
imputer = imputer.fit(X[:,1:3])

In [13]:
# Display X: fitting the imputer does not change the array.
X

array([['California', 42.0, 60000.0],
       ['Florida', 29.0, 55000.0],
       ['Texas', 27.0, 72000.0],
       ['Florida', 26.0, 48000.0],
       ['Texas', 39.0, 65111.111110000005],
       ['California', 34.0, 54000.0],
       ['Florida', 36.88888889, 57000.0],
       ['California', 46.0, 80000.0],
       ['Texas', 51.0, 85000.0],
       ['California', 38.0, 75000.0]], dtype=object)

In [14]:
# Overwrite the numeric columns (1 and 2) with their imputed values.
numeric_columns = slice(1, 3)
X[:, numeric_columns] = imputer.transform(X[:, numeric_columns])
X

array([['California', 42.0, 60000.0],
       ['Florida', 29.0, 55000.0],
       ['Texas', 27.0, 72000.0],
       ['Florida', 26.0, 48000.0],
       ['Texas', 39.0, 65111.111110000005],
       ['California', 34.0, 54000.0],
       ['Florida', 36.88888889, 57000.0],
       ['California', 46.0, 80000.0],
       ['Texas', 51.0, 85000.0],
       ['California', 38.0, 75000.0]], dtype=object)

In [18]:
# Shift + Tab to see info
# One-hot encode column 0; every other column is passed through untouched.
one_hot_step = ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
ct = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

In [21]:
# One-hot encode column 0 of X; the remaining columns pass through unchanged
# and end up to the right of the generated dummy columns.
# NOTE: this cell overwrites X, so re-running it without first re-running the
# cells that build X would encode column 0 of the already-encoded matrix again.
encoded = ct.fit_transform(X)
# ColumnTransformer can return a SciPy sparse matrix when the stacked result is
# mostly zeros (see its sparse_threshold parameter); densify before casting.
if hasattr(encoded, "toarray"):
    encoded = encoded.toarray()
# np.float was only an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so use the builtin directly.
X = np.array(encoded, dtype=float)
X

array([[1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.20000000e+01, 6.00000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 2.90000000e+01, 5.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.70000000e+01, 7.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 2.60000000e+01, 4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 3.90000000e+01, 6.51111111e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.40000000e+01, 5.40000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 3.68888889e+01, 5.70000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4

In [23]:
# Encoder used to turn string class labels into integer codes.
le = LabelEncoder()

In [24]:
# Encode the target labels as integers; with this data 'No' becomes 0 and 'Yes' becomes 1.
Y = le.fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [27]:
# Split the data set into a training set and a test set:
# 20% of the rows are held out for testing, with a fixed seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)
X_train, X_test, Y_train, Y_test

(array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         1.00000000e+00, 3.90000000e+01, 6.51111111e+04],
        [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 3.80000000e+01, 7.50000000e+04],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         0.00000000e+00, 2.90000000e+01, 5.50000000e+04],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         0.00000000e+00, 3.68888889e+01, 5.70000000e+04],
        [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 4.60000000e+01, 8.00000000e+04],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         0.00000000e+00, 2.60000000e+01, 4.80000000e+04],
        [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 4.20000000e+01, 6.00000000e+04],
        [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         0

In [30]:
# Feature Scaling
# The scaler's mean and standard deviation are learned from the training split
# only and then reused for the test split.
# NOTE(review): every column is standardised, including the one-hot dummy
# columns - confirm that this is intended.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
X_train, X_test

(array([[-1.        ,  1.        , -1.        , -0.77459667,  2.64575131,
          0.42967955,  0.32661014],
        [ 1.        , -1.        ,  1.        , -0.77459667, -0.37796447,
          0.26685361,  1.29153305],
        [-1.        ,  1.        , -1.        ,  1.29099445, -0.37796447,
         -1.19857978, -0.65999643],
        [-1.        ,  1.        , -1.        ,  1.29099445, -0.37796447,
          0.08593591, -0.46484348],
        [ 1.        , -1.        ,  1.        , -0.77459667, -0.37796447,
          1.56946108,  1.77941542],
        [-1.        ,  1.        , -1.        ,  1.29099445, -0.37796447,
         -1.68705758, -1.34303174],
        [ 1.        , -1.        ,  1.        , -0.77459667, -0.37796447,
          0.91815734, -0.17211406],
        [ 1.        , -1.        ,  1.        , -0.77459667, -0.37796447,
         -0.38445012, -0.7575729 ]]),
 array([[-1.        ,  1.        , -1.        , -0.77459667,  2.64575131,
         -1.52423165,  0.99880363],
        