# Data Preprocessing
- Dealing with Duplicates
- Dealing with missing values
- Scaling
    - Standard Scaler
    - MinMax Scaler
- Dealing with categorical Data
    - OneHotEncoding
    - Label Encoding
    - Ordinal Encoding
- Splitting into train and test sets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        11 non-null     float64
 2   Salary     11 non-null     float64
 3   Purchased  12 non-null     object 
dtypes: float64(2), object(2)
memory usage: 512.0+ bytes


In [4]:
df.nunique()

Country       3
Age          10
Salary       10
Purchased     2
dtype: int64

In [5]:
print('Countries : ', df.Country.dropna().unique())
print('Purchased : ', df.Purchased.unique())

Countries :  ['France' 'Spain' 'Germany']
Purchased :  ['No' 'Yes']


## Dealing with duplicate values
- If present then drop the duplicates

In [6]:
df.duplicated().sum()

1

In [7]:
df.drop_duplicates(inplace = True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Dealing With missing values
- If number of missing values in a column is large wrt total values then dropping the column is suitable strategy
- In numerical columns missing values can be replaced by the mean or median of that column
- In categorical Columns missing values can be replaced by mode of that column
- If number of rows having missing values is small wrt total rows then all these rows can be dropped

In [8]:
df.isnull().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

In [9]:
# Using Pandas
df.Age.replace(np.nan, df.Age.mean(), inplace = True)
df.Salary.replace(np.nan, df.Salary.mean(), inplace = True)
df.Country.replace(np.nan, df.Country.dropna().mode()[0], inplace = True)

df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Using Sklearn

In [10]:
df2 = pd.read_csv('Data.csv')
df2.drop_duplicates(inplace = True)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [11]:
from sklearn.impute import SimpleImputer                                          # import the class
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')               # create an object
df2[['Age', 'Salary']] = imputer.fit_transform(df2[['Age', 'Salary']])            # tranform the data using object
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
df2[['Country']] = imputer.fit_transform(df2[['Country']])            
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# -------------------------------------------------------------------------------------------
# Scaling
  - Used to bring data to same scale
        1.Standard Scaler
        2.Min Max Scaler
        
#### Standard Scaler
  - X_scaled = (X - X_mean) / X_std
  - Performs z score normalization
  - Zero mean
  - Unit variance
  
#### MinMax Scaler
  - X_scaled = (X - X_min) / (X_max - X_min)
  - Limits the data between 0 to 1

In [13]:
# Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,No
1,Spain,-1.667438,-1.545645,Yes
2,Germany,-1.294131,-0.980165,No
3,Spain,-0.298646,-0.320439,No
4,Germany,-0.049774,0.0,Yes
5,France,-0.671953,-0.603178,Yes
6,Spain,0.0,-1.168658,No
7,France,0.945711,1.376001,Yes
8,Germany,1.194582,1.752987,No
9,France,-0.423081,0.245041,Yes


In [14]:
print(df.Age.mean(), df.Age.var())
print(df.Salary.mean(), df.Salary.var())

1.2111523905001707e-16 1.0999999999999996
2.0185873175002847e-17 1.1


In [15]:
# Min Max Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df2[['Age', 'Salary']] = scaler.fit_transform(df2[['Age', 'Salary']])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.607143,0.685714,No
1,Spain,0.0,0.0,Yes
2,Germany,0.107143,0.171429,No
3,Spain,0.392857,0.371429,No
4,Germany,0.464286,0.468571,Yes
5,France,0.285714,0.285714,Yes
6,Spain,0.478571,0.114286,No
7,France,0.75,0.885714,Yes
8,Germany,0.821429,1.0,No
9,France,0.357143,0.542857,Yes


In [38]:
print('Min and Max in  Age after scaling    : ',df2.Age.min(), df2.Age.max())
print('Min and Max in  Salary after scaling : ',df2.Salary.min(), df2.Salary.max())

Min and Max in  Age after scaling    :  0.0 1.0
Min and Max in  Salary after scaling :  0.0 1.0


# Dealing with Categorical Values
   - Label Encoding
   - Ordinal Encoding
   - One Hot Encoding

In [16]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,No
1,Spain,-1.667438,-1.545645,Yes
2,Germany,-1.294131,-0.980165,No
3,Spain,-0.298646,-0.320439,No
4,Germany,-0.049774,0.0,Yes
5,France,-0.671953,-0.603178,Yes
6,Spain,0.0,-1.168658,No
7,France,0.945711,1.376001,Yes
8,Germany,1.194582,1.752987,No
9,France,-0.423081,0.245041,Yes


In [17]:
# Using Pandas
df.Country = df.Country.map({'France' : 0, 'Germany' : 1, 'Spain' : 2})
df.Purchased = df.Purchased.map({'No' : 0, 'Yes' : 1})

df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,0.447968,0.716274,0
1,2,-1.667438,-1.545645,1
2,1,-1.294131,-0.980165,0
3,2,-0.298646,-0.320439,0
4,1,-0.049774,0.0,1
5,0,-0.671953,-0.603178,1
6,2,0.0,-1.168658,0
7,0,0.945711,1.376001,1
8,1,1.194582,1.752987,0
9,0,-0.423081,0.245041,1


In [18]:
# One Hot Encoding using pandas
df3 = pd.read_csv('Data.csv')
df3.drop_duplicates(inplace = True)
df3

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
dummy = pd.get_dummies(df3.Country)
dummy

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [20]:
df3 = pd.concat([dummy, df3], axis = 1)
df3.drop('Country', axis = 1, inplace = True)
df3

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,,Yes
5,1,0,0,35.0,58000.0,Yes
6,0,0,1,,52000.0,No
7,1,0,0,48.0,79000.0,Yes
8,0,1,0,50.0,83000.0,No
9,1,0,0,37.0,67000.0,Yes


## Using Sklearn

In [21]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit_transform(df2[['Purchased']])

  y = column_or_1d(y, warn=True)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1])

In [22]:
# Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoder.fit_transform(df2[['Country']])

array([[0.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [23]:
# OneHotEncoding using Sklearn
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoder.fit_transform(df2[['Country']])

<11x3 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [24]:
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.607143,0.685714,No
1,Spain,0.0,0.0,Yes
2,Germany,0.107143,0.171429,No
3,Spain,0.392857,0.371429,No
4,Germany,0.464286,0.468571,Yes
5,France,0.285714,0.285714,Yes
6,Spain,0.478571,0.114286,No
7,France,0.75,0.885714,Yes
8,Germany,0.821429,1.0,No
9,France,0.357143,0.542857,Yes


In [25]:
# Separating Independent and dependent feature(s)
X = df2[['Country', 'Age', 'Salary']].values
X             # Independent feature set

array([['France', 0.6071428571428572, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1071428571428572, 0.17142857142857149],
       ['Spain', 0.3928571428571428, 0.37142857142857144],
       ['Germany', 0.4642857142857142, 0.4685714285714284],
       ['France', 0.2857142857142858, 0.2857142857142856],
       ['Spain', 0.4785714285714284, 0.11428571428571432],
       ['France', 0.75, 0.8857142857142857],
       ['Germany', 0.8214285714285714, 1.0],
       ['France', 0.3571428571428572, 0.5428571428571427],
       ['France', 1.0, 0.6285714285714286]], dtype=object)

In [26]:
Y = df2[['Purchased']].values
Y          # Dependent feature

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes']], dtype=object)

In [27]:
# Column Transformer
from sklearn.compose import ColumnTransformer
tranformer = ColumnTransformer(transformers = [('Encoder', OneHotEncoder(),[0])], remainder = 'passthrough')
tranformer.fit_transform(X)

array([[1.0, 0.0, 0.0, 0.6071428571428572, 0.6857142857142855],
       [0.0, 0.0, 1.0, 0.0, 0.0],
       [0.0, 1.0, 0.0, 0.1071428571428572, 0.17142857142857149],
       [0.0, 0.0, 1.0, 0.3928571428571428, 0.37142857142857144],
       [0.0, 1.0, 0.0, 0.4642857142857142, 0.4685714285714284],
       [1.0, 0.0, 0.0, 0.2857142857142858, 0.2857142857142856],
       [0.0, 0.0, 1.0, 0.4785714285714284, 0.11428571428571432],
       [1.0, 0.0, 0.0, 0.75, 0.8857142857142857],
       [0.0, 1.0, 0.0, 0.8214285714285714, 1.0],
       [1.0, 0.0, 0.0, 0.3571428571428572, 0.5428571428571427],
       [1.0, 0.0, 0.0, 1.0, 0.6285714285714286]], dtype=object)

### Applying Multiple Transformations using Column Transformer

In [28]:
df4 = pd.read_csv('Data.csv')
df4.drop_duplicates(inplace = True)

df4.Age.replace(np.nan, df4.Age.mean(), inplace = True)
df4.Salary.replace(np.nan, df4.Salary.mean(), inplace = True)
df4.Country.replace(np.nan, df4.Country.dropna().mode()[0], inplace = True)
df4

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

transformer = ColumnTransformer(transformers = [('Encoder1', OneHotEncoder(), [0]),
                                                ('Scaler'  , StandardScaler(), [1,2]),
                                                ('Encoder2', OrdinalEncoder(), [3])],
                                                remainder = 'passthrough')

df4 = transformer.fit_transform(df4)
np.set_printoptions(linewidth = 200)
df4

array([[ 1.        ,  0.        ,  0.        ,  0.44796839,  0.71627446,  0.        ],
       [ 0.        ,  0.        ,  1.        , -1.66743788, -1.54564488,  1.        ],
       [ 0.        ,  1.        ,  0.        , -1.2941309 , -0.98016504,  0.        ],
       [ 0.        ,  0.        ,  1.        , -0.29864559, -0.32043857,  0.        ],
       [ 0.        ,  1.        ,  0.        , -0.04977427,  0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        , -0.67195258, -0.60317849,  1.        ],
       [ 0.        ,  0.        ,  1.        ,  0.        , -1.16865832,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.94571104,  1.37600093,  1.        ],
       [ 0.        ,  1.        ,  0.        ,  1.19458237,  1.75298748,  0.        ],
       [ 1.        ,  0.        ,  0.        , -0.42308125,  0.24504126,  1.        ],
       [ 1.        ,  0.        ,  0.        ,  1.81676068,  0.52778118,  1.        ]])

## Splitting the data into training and test sets

In [30]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,0.447968,0.716274,0
1,2,-1.667438,-1.545645,1
2,1,-1.294131,-0.980165,0
3,2,-0.298646,-0.320439,0
4,1,-0.049774,0.0,1
5,0,-0.671953,-0.603178,1
6,2,0.0,-1.168658,0
7,0,0.945711,1.376001,1
8,1,1.194582,1.752987,0
9,0,-0.423081,0.245041,1


In [31]:
X = df[['Country', 'Age', 'Salary']]
Y = df[['Purchased']]
X

Unnamed: 0,Country,Age,Salary
0,0,0.447968,0.716274
1,2,-1.667438,-1.545645
2,1,-1.294131,-0.980165
3,2,-0.298646,-0.320439
4,1,-0.049774,0.0
5,0,-0.671953,-0.603178
6,2,0.0,-1.168658
7,0,0.945711,1.376001
8,1,1.194582,1.752987
9,0,-0.423081,0.245041


In [32]:
Y

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

In [34]:
x_train

Unnamed: 0,Country,Age,Salary
6,2,0.0,-1.168658
4,1,-0.049774,0.0
9,0,-0.423081,0.245041
3,2,-0.298646,-0.320439
11,0,1.816761,0.527781
8,1,1.194582,1.752987
0,0,0.447968,0.716274


In [35]:
y_train

Unnamed: 0,Purchased
6,0
4,1
9,1
3,0
11,1
8,0
0,0


In [36]:
x_test

Unnamed: 0,Country,Age,Salary
2,1,-1.294131,-0.980165
5,0,-0.671953,-0.603178
1,2,-1.667438,-1.545645
7,0,0.945711,1.376001


In [37]:
y_test

Unnamed: 0,Purchased
2,0
5,1
1,1
7,1
