In [1]:
# Data Preprocessing stage:
# What is the need of this stage?
# The final goal is to create a model/data product/inference from the given data 
#
# All the formulae in Statistics expect your data to be in the form of numbers
#
# Rule to create a model/data product/inference:
#
# 1. Data must be Strictly Numeric
# 2. Data must be COMPLETE
# 

In [2]:
#Preprocessing Task:
#1. Checking and Handling Missing Data on all Columns
#2. Check and Handle Categorical Columns
#3. Check and Handle Ordinal Columns
#4. Perform Standardization of all Columns

In [4]:
# 1. Checking and Handling Missing Data on all Columns
#
# Two Perspectives ====> Statistical Perspective
#                        Domain Perspective
#
# Rules to Handle Missing Data:
# ==============================================================
# 1. Statistical Perspective
# ==============================================================
# a. Numerical Columns
#       Continuous ----> Replace the missing data with the Mean value of the column
#       Discrete ------> Replace the missing data with the Median Value of the column
#
# b. Non Numerical Columns
#       Replace the missing data with the Mode of the column

# ==================================================================
# 2. Domain Perspective
# ==================================================================
# 1. Replace the missing data with the default value specified by Domain
# 2. Replace the missing data with the min value of the specified column wherever applicable
#
# Example on Domain Perspective:
#
# housing data (Mumbai !!!)
# no_of_bedrooms -----> Numerical ---------> Domain perspective (2)
# no_parking ---------> Numerical ---------> Domain perspective (1)
# area_sqft ----------> Numerical ---------> 768 sqft
# city ---------------> Categorical -----> Raheja (Choose the city where Raheja has the most buildings)

In [5]:
#2. Check and Handle Categorical Columns
#
# Why do we need to handle Categorical Columns?
# Stat formulae expect the data to be numeric. Almost all categorical variables are non-numeric
# Categorical means the data will not have any mathematical weightage
# So we need a strategy to ensure we can represent the data in numbers without introducing
# mathematical weightage

# Solution: Use of Dummy Variables | Dummy Columns
#
# Algo: -- Refer my Whiteboard notes

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the sample dataset into a DataFrame; the path is relative, so the CSV
# must sit in the notebook's working directory.
data = pd.read_csv('pre-process_datasample.csv')

In [8]:
# Display the freshly loaded frame to eyeball the raw values and the missing entries.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [9]:
# 1. Checking and Handling Missing Data on all Columns
# info() reports the non-null count and dtype of every column; a non-null count
# lower than the number of entries means that column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      9 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [10]:
# Perspective: Stat
#Country      9 non-null object ----> No of missing values: 1 -- Action: Replace NaN with Mode
#Age          9 non-null float64 ---> No of Missing values: 1 -- Action: Replace NaN with mean
#Salary       9 non-null float64 ---> No of Missing values: 1 -- Action: Replace NaN with mean

In [12]:
# Show the rows whose Country is missing: isna() builds a boolean mask and
# .loc keeps only the rows where the mask is True.
data.loc[data['Country'].isna()]

Unnamed: 0,Country,Age,Salary,Purchased
8,,50.0,83000.0,No


In [18]:
# mode() returns a Series (several values can tie for most frequent),
# so take its first entry to get a single fill value.
data['Country'].mode()[0]

'France'

In [16]:
# Statistical perspective, non-numerical column: fill the missing Country with the mode.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `data.Country`: that form updates an intermediate
# Series, which raises a FutureWarning on pandas 2.x and leaves `data` unchanged
# once Copy-on-Write is enabled.
data['Country'] = data['Country'].fillna(data['Country'].mode()[0])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [20]:
# Statistical perspective, numerical column: fill the missing Age with the column mean
# (mean() skips NaN by default, so it is computed over the known ages only).
# Assign back to the column instead of a chained fillna(..., inplace=True), which
# would not update `data` under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [21]:
# Statistical perspective, numerical column: fill the missing Salary with the column mean.
# int() truncates the mean toward zero, so the fill value is a whole number.
# Assign back to the column instead of a chained fillna(..., inplace=True), which
# would not update `data` under pandas Copy-on-Write.
data['Salary'] = data['Salary'].fillna(int(data['Salary'].mean()))
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [22]:
#2. Check and Handle Categorical Columns
# Dummy Variable

In [25]:
# Replace the categorical Country column with one 0/1 dummy column per country,
# placed in front of the remaining columns.
# - dtype=int keeps the dummies numeric; recent pandas versions default to bool
#   columns, which would display True/False instead of 1/0.
# - The remaining columns are selected by dropping Country by name instead of by
#   hard-coded positions, so the cell does not depend on the column order.
dataFinal = pd.concat(
    [
        pd.get_dummies(data['Country'], dtype=int),
        data.drop(columns='Country'),
    ],
    axis=1,
)
dataFinal

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,63777.0,Yes
5,1,0,0,35.0,58000.0,Yes
6,0,0,1,38.777778,52000.0,No
7,1,0,0,48.0,79000.0,Yes
8,1,0,0,50.0,83000.0,No
9,1,0,0,37.0,67000.0,Yes


In [27]:
# Encode the binary Purchased labels as numbers: 'No' -> 0, 'Yes' -> 1.
# The result is assigned back to the column instead of a chained
# replace(..., inplace=True), which would not update `dataFinal` under pandas
# Copy-on-Write. Values that are not in the mapping (including already encoded
# 0/1) are passed through unchanged, so re-running the cell is harmless.
purchased_codes = {'No': 0, 'Yes': 1}
dataFinal['Purchased'] = dataFinal['Purchased'].map(
    lambda label: purchased_codes.get(label, label)
)
dataFinal

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63777.0,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,38.777778,52000.0,0
7,1,0,0,48.0,79000.0,1
8,1,0,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1


In [None]:
# If any column has Binary Data, replace its two values with 0 and 1 respectively