# Data Preprocessing Tools

## Importing the libraries

## Importing the dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the dataset into a pandas DataFrame.
dataset = pd.read_csv('Data.csv')

# Every model works with two pieces: a matrix of features (X) and a
# dependent variable vector (y).
# Features are the columns the prediction is based on -- the independent
# variables (here Country, Age, Salary).
# The dependent variable is 'Purchased': the company wants to predict
# whether future customers will purchase, based on the other columns.

# iloc[rows, columns]:
#   rows    ":"   -> no lower/upper bound, so every row is selected
#   columns ":-1" -> from the first column up to, but excluding, the last
#                    (a slice includes its lower bound and excludes its upper)
X = dataset.iloc[:, :-1].values
# Index -1 without a range selects only the last column: the dependent
# variable vector.
y = dataset.iloc[:, -1].values


In [3]:
# Re-read only the 'Purchased' column (at most the first 10 rows) to look at
# the dependent variable on its own.
depVector = pd.read_csv('Data.csv', nrows=10, usecols = ['Purchased'])

In [4]:
depVector

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


In [5]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [7]:
# missing data can impact accuracy of the model
# can delete the row with missing info or
# take the average of all rows in that column 
# to fill in the missing data


In [8]:
# Fill each missing age/salary with the mean of the values present in
# that column, using scikit-learn's SimpleImputer.
from sklearn.impute import SimpleImputer

# missing_values=np.nan -> which entries count as missing
# strategy='mean'       -> replace them with the column average
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer only accepts numerical columns (no text / categories), so it
# is given X[:, 1:3]: all rows, columns 1 and 2 (age and salary) -- the
# upper bound 3 is excluded.  Passing every numerical column is a catch-all:
# any of them that has gaps gets its own column mean.
# fit_transform = fit (learn the column means) + transform (fill the gaps).
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [9]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [10]:
# Replace categorical variables (strings) with numbers.
# The Country feature is one-hot encoded: one 0/1 column per category.
# The dependent variable (Purchased: No/Yes) is label-encoded to 0/1.

### Encoding the Independent Variable

In [11]:
# using column transformer class from compose module of sklearn
# one-hot encoder from preprocessing module of sklearn

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ColumnTransformer takes:
#   transformers -> a list of (name, transformer, column indices) tuples;
#                   here: one-hot encode column 0 (Country)
#   remainder    -> 'passthrough' keeps the columns that are not transformed
#                   (Age, Salary) instead of dropping them
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Fit on X and encode the country column in one step.  The result is wrapped
# in np.array because the later steps expect X as a NumPy array.
X = np.array(ct.fit_transform(X))

In [None]:
# NOTE(review): unexecuted duplicate of the encoding cell above. Running it
# after that cell would one-hot encode column 0 of the already-encoded X a
# second time -- run only one of the two.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])]  , remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [13]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [14]:
from sklearn.preprocessing import LabelEncoder

# Encode the dependent variable: each distinct label ('No', 'Yes') becomes
# an integer.  fit_transform already returns a NumPy array, so no np.array
# wrapper is needed for y.
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [16]:
# Do we apply feature scaling before or after splitting the dataset into
# training and test sets?
# Does scaling refer to the weight, impact, importance
# of a feature in making predictions?
# Answer: apply feature scaling AFTER splitting into train/test data.
# If applied before, the scaler would compute the mean and standard
# deviation over all the values, including the ones in the test set.
# The test set stands for future data that you are not supposed to have
# during training and that the model is expected to predict accurately.
# Scaling before the split therefore causes 'information leakage' from the
# test set.

In [17]:
# Will use Model Selection library to train, test, split
# we'll create a pair of matrix features, dep variable for training set
# and another pair of matrix features, dep variable for the test set
# X-train(matrix feats of training set)
# X-test(matrix feats of test set)
# Y-train(dep variable of the training set)
# Y-test(dep variable of the test set)
# Models will expect X train, Y train as input in a method called
# The FIT method
# For predictions (inference) these models will predict X test

In [18]:
from sklearn.model_selection import train_test_split

# train_test_split(matrix of features, dependent variable, test share, seed)
# returns, in this order:
#   X_train -> features of the training set (one-hot country, age, salary)
#   X_test  -> features of the test set
#   y_train -> dependent variable (purchased) of the training set
#   y_test  -> dependent variable (purchased) of the test set
# test_size=0.2 gives an 80/20 split; random_state=1 makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
# NOTE(review): unexecuted duplicate of the split cell above, but with
# random_state=0 instead of 1. Running it would overwrite X_train, X_test,
# y_train and y_test, presumably with a different split -- run only one.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [19]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [20]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [21]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [22]:
print(y_test)

[0 1]


## Feature Scaling

In [23]:
# Feature scaling keeps some features from dominating the others so much
# that the dominated ones are barely considered by the model.
# It is not required by all models.
# Example: in multiple linear regression every variable is multiplied by a
# coefficient. If some variables take much higher values than others, the
# learned coefficients compensate by taking small values for those
# variables, so scaling is not needed there.

In [24]:
# How feat scaling is applied: Standardisation and Normalisation

In [25]:
# Standardisation: subtract from each value of a feature the mean of
# all values of that feature, then divide by the standard deviation
# (the square root of the variance): x_stand = (x - mean(x)) / std(x)
# This puts the values of the feature roughly between -3 and +3.
# What does it mean that after standardisation, all features take a value
# between -3 and +3?

In [26]:
# Normalisation: subtract the minimum value of the feature from each
# value, then divide by the difference between the feature's max and min:
# x_norm = (x - min(x)) / (max(x) - min(x))
# The numerator is >= 0 and never larger than the (positive) denominator,
# so all values of the feature end up between 0 and 1.

In [27]:
# Use cases -
# Normalisation - recommended when most of the features follow a normal
# distribution
# Standardisation - works well all the time, always improves the
# training process
# We won't apply feature scaling on the whole matrix of features X,
# but on X_train and X_test separately.
# The scaler is fitted on X_train only; X_test is then transformed
# with that already-fitted scaler.
# X_test is something we're not supposed to have during training,
# only afterwards, like when going into production,
# so we're not allowed to fit our feature scaling on the test set.
# We get the mean and standard deviation of X_train and apply the formula
# to transform all the values in X_train,
# and THEN apply the same mean and standard deviation to the values in X_test.

In [28]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Do we have to apply feature scaling to the dummy variables in the
# matrix of features? NO.
# The goal of feature scaling is to bring all features into the same range;
# standardisation puts them between more or less -3 and +3. The one-hot
# dummy columns (0-2) are already 0/1, and scaling them would skew the
# interpretability of the one-hot encoding, so only columns 3 onward
# (age, salary) are scaled.
#
# fit       -> computes the mean and standard deviation of each feature
# transform -> applies the formula so all values end up on the same scale
#
# The scaler is fitted on the training set only (fit_transform) and the test
# set is transformed with those training statistics. It must never be fitted
# on X_test: that would leak information from the test set.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [29]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [30]:
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
