# Data Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Importing data

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Data.csv")
# Display the whole (10-row) frame so the missing Age and Salary entries are visible.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Feature matrix: every column except the last one (Country, Age, Salary).
X = df.iloc[:, :-1].to_numpy()
# Target vector: the last column only (Purchased).
y = df.iloc[:, -1].to_numpy()

In [4]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [6]:
# Median of the non-missing ages; this is the value the median imputer fills in for Age below.
df["Age"].median()

38.0

In [7]:

# Median of the non-missing salaries; this is the value the median imputer fills in for Salary below.
df["Salary"].median()

61000.0

## 2. Handling missing data

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Fill missing numeric values with the median of their column.
imputer = SimpleImputer(strategy="median")

# Columns 1 and 2 (Age, Salary) are the numeric features; column 0 (Country) is text and is skipped.
# fit_transform learns the per-column medians and writes the filled values back in one step.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the train/test split
# further down, so rows that later end up in the test set contribute to the medians.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [10]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## 3. Encoding the categorical/text data

### 3.1 Encoding the Independent variables

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [12]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [13]:
# OneHotEncoder creates one 0/1 column per category of the column it encodes.
# Country has 3 categories ("France", "Germany", "Spain"), so column 0 becomes 3 columns;
# e.g. a "France" row is encoded as [1, 0, 0].
# remainder="passthrough" keeps the untouched columns (Age, Salary) and appends them
# after the encoded ones.
# NOTE(review): X is overwritten, so re-running this cell would encode the already-encoded
# first column again — re-run from the cell that builds X instead.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
X = ct.fit_transform(X)

X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 61000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.0, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### 3.2 Encoding dependent variable

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [16]:
# LabelEncoder maps each distinct class label to an integer code.
# There are only two labels here, so the result is binary: 'No' -> 0 and 'Yes' -> 1.
lb = LabelEncoder()

y = lb.fit_transform(y)

In [17]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## 4. Split the data in Train and Test data

The main reason for splitting the data into train and test sets is to be able to check how the model behaves on observations it has never seen.
We first train the model on the train data and then evaluate it on the test data, i.e. on new observations.

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [19]:
X_train

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 52000.0]], dtype=object)

In [20]:
X_test

array([[0.0, 1.0, 0.0, 40.0, 61000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0]], dtype=object)

In [21]:
y_train

array([0, 1, 0, 1, 0, 1, 1, 0])

In [22]:
y_test

array([1, 0])

## 5. Feature Scaling

We do feature scaling after splitting the data. The scaler is fitted on the TRAIN data only, and that same fitted scaler is then used to transform both the TRAIN and the TEST data.
This puts both sets on the same scale without letting any information from the TEST data leak into training, so the TEST data
remain genuinely new observations when the model is evaluated on them.

> **What is feature scaling?**
> - Feature scaling puts all the features on the same scale.

> **Why do we do that?**
> - When one feature has a much larger range than the others (e.g. Salary vs. Age), it can dominate them, and the dominated features may end up being largely ignored by the ML model.
> - Feature scaling avoids this.

In [23]:
X_train

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 52000.0]], dtype=object)

In [24]:
X_test

array([[0.0, 1.0, 0.0, 40.0, 61000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0]], dtype=object)

In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the scaling statistics from the training rows only. Only columns 3 onward
# (Age, Salary) are scaled; the one-hot country columns 0-2 are left as 0/1.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# Apply the statistics learned from the training rows to the test rows (transform, not fit_transform),
# so no information from the test set influences the scaling.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [26]:
X_train

array([[0.0, 1.0, 0.0, 1.4810949949635102, 1.5444388458166327],
       [0.0, 0.0, 1.0, -1.5136465333143565, -1.319421265631428],
       [0.0, 1.0, 0.0, -1.1230280731042, -0.8284738179546175],
       [1.0, 0.0, 0.0, -0.21158499928050145, 0.23524565201180497],
       [1.0, 0.0, 0.0, 0.6998580745431972, 0.6443685250758137],
       [1.0, 0.0, 0.0, -0.4719973060872725, -0.5011755195034107],
       [1.0, 0.0, 0.0, 1.2206826881567392, 1.2171405473654258],
       [0.0, 0.0, 1.0, -0.08137884587711594, -0.992122967180221]],
      dtype=object)

In [27]:
X_test

array([[0.0, 1.0, 0.0, 0.17903346092965508, -0.2557017956650054],
       [0.0, 0.0, 1.0, -0.08137884587711594, -0.2557017956650054]],
      dtype=object)