<a href="https://colab.research.google.com/github/esther0402/NN_experiments/blob/main/data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np  # array / numerical library; `np` is only the conventional short alias (less typing, not faster execution)
import matplotlib.pyplot as plt  # plotting interface for drawing charts
import pandas as pd  # tabular data handling; used below to read the CSV and slice out the features and target

## Importing the dataset

In [None]:
# Read the raw table; the relative path is resolved against the current working directory.
dataset = pd.read_csv('Data.csv')

# Every column except the last one is a feature; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

# The following cells operate on plain NumPy arrays rather than pandas objects.
x = feature_frame.values
y = target_series.values

In [None]:
# Inspect the raw feature matrix; the nan entries are the missing values that still need handling.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the raw target vector (still 'Yes' / 'No' strings at this point).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 are the numeric ones; column 0 holds the country strings and is left alone.
numeric_columns = slice(1, 3)

# Replace every nan with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the per-column means and fill the gaps in one step, writing the result back into x.
x[:, numeric_columns] = imputer.fit_transform(x[:, numeric_columns])

In [None]:
# Check that the nan entries in the numeric columns have been replaced by the column means.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Two scikit-learn classes work together here: ColumnTransformer picks the columns,
# OneHotEncoder does the actual encoding.

# A transformer spec is (name, encoder object, list of column indices to encode).
# Only column 0 (the country strings) gets one-hot encoded.
country_spec = ('encoder', OneHotEncoder(), [0])

# remainder='passthrough' keeps every column that is not listed in the spec
# instead of dropping it.
ct = ColumnTransformer(transformers=[country_spec], remainder='passthrough')

# Fit the encoder and apply it in one call, then force the result into a NumPy array
# so the later cells can keep slicing it the same way.
encoded_features = ct.fit_transform(x)
x = np.array(encoded_features)

In [None]:
# The first three columns are now the one-hot country indicators, followed by the untouched numeric columns.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct label to an integer; with two classes that means 0s and 1s.

le = LabelEncoder()

# Learn the label -> integer mapping and apply it in one call.
# No extra np.array(...) wrapping is applied here, unlike for the feature matrix.
encoded_labels = le.fit_transform(y)
y = encoded_labels

In [None]:
# The labels are now integers (0 / 1) instead of the 'No' / 'Yes' strings.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# scikit-learn's `model_selection` module provides the `train_test_split` function.
# (The function is spelled train_TEST_split; importing `train_text_split` raises ImportError.)

# Inputs to train_test_split:
#   x            -> matrix of features
#   y            -> dependent variable vector
#   test_size    -> fraction held out for testing (0.2 = 20%, the usual 80/20 split);
#                   with our 10 customers, 8 go to the training set and 2 to the test set
#   random_state -> seed for the shuffle; fixing it makes the same split come out on every run
# Outputs, in this order: training features, test features, training labels, test labels.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# The StandardScaler class lives in scikit-learn's `preprocessing` module.

sc = StandardScaler()
# StandardScaler is a class, so we call it with () to create an instance.

# The split cell names its outputs in lowercase (x_train / x_test); the same names are used
# here, because X_train / X_test are never defined and would raise NameError.
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
# Columns 0-2 are the one-hot dummy variables and are left unscaled; only the columns from
# index 3 onward (age and salary) are standardized.
# fit computes the mean and standard deviation of each of those features on the training set;
# transform then applies the standardization formula using those values.

x_test[:, 3:] = sc.transform(x_test[:, 3:])
# Only transform here: the test set must be scaled with the mean and standard deviation learned
# from the training set, so the scaler is not fitted again.