# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# numpy - array operations; matplotlib - visualization; pandas - loading and preprocessing tabular data

## Importing the dataset

In [2]:
# Read the CSV into a DataFrame, then split it into two NumPy arrays:
#   X - features (independent variables): the data used to predict the result
#   y - target (dependent variable): the last column, i.e. the value the model has to predict
# iloc selects by integer position as [rows, columns]; a bare ':' keeps every row.
# ':-1' keeps every column except the last one, because a slice excludes its upper bound
# and -1 is the index of the last column.
dataset = pd.read_csv('Data.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [3]:
print(X)

# the output shows the matrix of features (independent variables) X

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
print(y)

# the output shows the dependent variable vector y

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

If the dataset is too large to inspect by eye, simply apply the code below: if there are no missing values, it leaves the dataset unchanged.

In [5]:
from sklearn.impute import SimpleImputer

# Imputing is a strategy for handling missing data.
# SimpleImputer is a class, so we create an instance of it ('imputer') to use it.
# fit_transform does two things in one call:
#   1. fit       - computes the replacement value for each column (the column mean here)
#   2. transform - returns the columns with every missing value (np.nan) replaced by that mean
# The 'mean' strategy only works on numerical columns, hence the slice [1:3] (columns 1 and 2).
# The completed columns are then written back into the original NumPy array X.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [6]:
# X after imputation: the missing values in columns 1 and 2 are replaced by the column means
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding splits the categorical column into one binary column per category,
# e.g. France - 1 0 0, Germany - 0 1 0, Spain - 0 0 1 (this is the order seen in the output).
# This gives the categories numerical values without a simple order like 1, 2, 3,
# which the model could mistake for a ranking or pattern.
# ct - instance of ColumnTransformer:
#   transformers - (name, transformer, columns); [0] is the index of the column to encode
#   remainder='passthrough' - keep the other (non-categorical) columns instead of dropping them
#   sparse_threshold=0 - always return a dense array. OneHotEncoder produces sparse output by
#       default, and with the default threshold (0.3) a column with many categories would make
#       fit_transform return a SciPy sparse matrix, which np.array() cannot turn into a 2-D array.
# fit_transform fits the encoder and transforms X in a single call; the encoded columns come
# first in the result, followed by the passthrough columns.
# np.array(...) makes sure X is a NumPy array again for the following steps.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [8]:
# X after one-hot encoding: the first column has been replaced by three dummy columns at the front
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# converts the 'No' / 'Yes' labels of the last column to 0 / 1, so the model receives numeric labels
# we use the LabelEncoder class for this; fit_transform learns the labels and encodes y in one call
# the result is already a NumPy array, so there is no need to wrap it in np.array(...)

In [11]:
# y after label encoding: 'No' -> 0, 'Yes' -> 1
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [24]:
# Why split before feature scaling?
# Feature scaling rescales each feature using its mean & standard deviation so that no single feature dominates.
# Feature scaling should be done after splitting the dataset, because the model is fitted only on the
# training set, so the scaling parameters must be computed from the training set only; otherwise
# information from the test set would leak into training.
# The test set is then scaled using these parameters generated from the training set;
# a new scaler must not be fitted on the test set.

In [18]:
from sklearn.model_selection import train_test_split

# train_test_split is a function that returns four arrays, in this order:
# training features, test features, training targets, test targets.
# test_size=0.2 holds out 20% of the rows for the test set.
# random_state=42 seeds the random shuffling so the same split is reproduced on every run;
# the number itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# training-set features returned by train_test_split
print(X_train)

[[1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]]


In [20]:
# test-set features returned by train_test_split
print(X_test)

[[0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 27.0 48000.0]]


In [21]:
# training-set targets returned by train_test_split
print(y_train)

[1 0 1 0 1 1 0 0]


In [22]:
# test-set targets returned by train_test_split
print(y_test)

[0 1]


## Feature Scaling

In [25]:
# Feature scaling does not need to be applied for every model; it is only used with certain techniques,
# because many models learn a coefficient per feature, which compensates when a feature has much larger values.
# There are two main techniques: standardisation (values mostly between -3 & 3) & normalisation (values between 0 & 1).
# Standardisation is the safer default because it works well in most cases, whereas normalisation
# is recommended mainly when the features follow a normal distribution.

In [27]:
# Feature scaling should not be applied to the dummy variables created by encoding categorical
# data, because those values (binary 0s & 1s before scaling) would lose their meaning; they are
# also already within the range of -3 & +3, so there is no need.
# So, only apply feature scaling to the numerical features which contain non-dummy values.

In [26]:
from sklearn.preprocessing import StandardScaler

# Only the columns from index 3 onward are scaled: columns 0, 1 and 2 hold the dummy variables.
# The scaler is fitted on the training set only, so the mean and standard deviation come from
# the training data alone.
# X_test is only transformed, never fitted: fitting again would generate a new scaler, but the
# test data must be scaled with the parameters learned from the training set so that both sets
# are on the same scale.
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Notice that we only applied feature scaling to X (features) but not to y (dependent variable),
# because in this dataset y is only 0 or 1, so it is already within the scaled range.

In [28]:
# X_train after feature scaling: columns 3 onward are standardised, the dummy columns are unchanged
print(X_train)

[[1.0 0.0 0.0 -0.7529426005471072 -0.6260377781240918]
 [1.0 0.0 0.0 1.008453807952985 1.0130429500553495]
 [1.0 0.0 0.0 1.7912966561752484 1.8325833141450703]
 [0.0 1.0 0.0 -1.7314961608249362 -1.0943465576039322]
 [1.0 0.0 0.0 -0.3615211764359756 0.42765697570554906]
 [0.0 1.0 0.0 0.22561095973072184 0.05040823668012247]
 [0.0 0.0 1.0 -0.16581046438040975 -0.27480619351421154]
 [0.0 0.0 1.0 -0.013591021670525094 -1.3285009473438525]]


In [29]:
# X_test after being transformed with the scaler that was fitted on X_train
print(X_test)

[[0.0 1.0 0.0 2.1827180802863797 2.3008920936249107]
 [0.0 0.0 1.0 -2.3186282969916334 -1.7968097268236927]]
