## Load libraries

In [29]:
import numpy as np
import matplotlib.pyplot as plt  # plt must alias the pyplot submodule; the top-level matplotlib package has no plot()/show()
import pandas as pd

## Load data

In [60]:
# Read the raw CSV into a DataFrame
dataset = pd.read_csv('Data.csv')

# Split into features and target using the same rule for both: the LAST column is the
# target and everything before it is a feature. (Previously the target was picked by the
# hard-coded index 3, which only agrees with ":-1" when the file has exactly 4 columns.)
X = dataset.iloc[:, :-1].values      # Values of every row, all columns except the last
y = dataset.iloc[:, -1].values       # Values of every row, last column only

## Fill in missing values

In [61]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is its replacement
from sklearn.impute import SimpleImputer

# Create an imputer object. Strategy can be changed to 'median' or 'most_frequent'.
# Missing entries are identified by np.nan (the old 'NaN' string sentinel is no longer accepted).
# SimpleImputer always computes the statistic per column, which is what axis = 0 used to request.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

# Fit the imputer to our data set. Python slices exclude the upper bound, so 1:3 means
# all rows of the columns at index 1 and 2.
imputer = imputer.fit(X[:, 1:3])

# Use the fitted imputer to replace the missing values with the chosen "strategy" statistic,
# then write the completed columns back into the original X array.
X[:, 1:3] = imputer.transform(X[:, 1:3])


## Note, can do above steps with fit_transform

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encode categorical data

In [62]:
# Bring in the two encoder classes from scikit-learn's preprocessing module
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Instantiate a LabelEncoder; the _X suffix marks it as the encoder for the categorical X data
labelencoder_X = LabelEncoder()

# First learn the distinct labels found in column 0, then overwrite that column with their integer codes
labelencoder_X.fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

print(X)

# The country names France, Germany and Spain are now the integers 0, 1 and 2.
# A machine learning model could misread those codes as an ordering (e.g. Spain > Germany)
# even though no such order exists, so dummy variables are introduced next.

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


In [63]:
# Use OneHotEncoder to create dummy variables for column 0.
# The categorical_features argument was removed from OneHotEncoder in scikit-learn 0.22;
# selecting which columns to encode is now done with a ColumnTransformer.
from sklearn.compose import ColumnTransformer

onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers = [('onehot', onehotencoder, [0])],  # one-hot encode column 0 only
    remainder = 'passthrough',                        # keep the other columns, appended after the dummies
    sparse_threshold = 0,                             # always return a dense array
)
# ColumnTransformer fits a clone of the encoder; the fitted one is available as
# column_transformer.named_transformers_['onehot'].
# Cast to float so the result is a plain numeric array, as the old encoder produced.
X = np.asarray(column_transformer.fit_transform(X), dtype = float)

display(pd.DataFrame(data=X))

# Notice that three dummy columns now exist for the country (columns 0-2 of the table).
# If France is the country there is a 1 in column 0 and 0's in the other two dummy columns.
# This is how to encode categorical data without giving false order to the values,
# when no order exists.

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [65]:
# The y values are encoded with a plain LabelEncoder; no dummy variables are created here

labelencoder_y = LabelEncoder()
# Learn the distinct labels in y, then replace y with their integer codes
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

display(pd.DataFrame(y))

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1
