### ENCODING
1. Label and Binary Encoding


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler



# Notebook-wide pandas display settings: show every column and row, render
# floats with three decimals, and widen the text repr to 500 characters.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.float_format", lambda x: "%.3f" % x),
    ("display.width", 500),
):
    pd.set_option(_option, _value)

def load(path="datasets/titanic.csv"):
    """Read a CSV file into a new DataFrame.

    Parameters
    ----------
    path : str or path-like, default "datasets/titanic.csv"
        Location of the CSV file. The default is the Titanic file used
        throughout this notebook; a relative path is resolved against the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        A freshly parsed frame. The file is re-read on every call, so the
        result is unaffected by in-place edits made to a frame returned by
        an earlier call.
    """
    return pd.read_csv(path)


# Working DataFrame for the cells below, read from the CSV by load().
df = load()

In [2]:
# Fit a label encoder on the "Sex" column. The sliced codes on the next line
# are computed but not shown: a cell only displays its last expression.
le = LabelEncoder()
le.fit_transform(df["Sex"])[0:5] 
# Map codes 0 and 1 back to their original labels to check which is which.
le.inverse_transform([0,1])

array(['female', 'male'], dtype=object)

In [3]:
def label_encoder(dataframe, binary_col):
    """Replace one column of ``dataframe`` with its label-encoded codes.

    A new ``LabelEncoder`` is fitted on the column and the resulting codes
    are written back into that same column, so the caller's frame is
    modified in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to encode; mutated in place.
    binary_col : hashable
        Label of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    encoded = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded
    return dataframe

# Binary categorical columns: dtype is neither int64 nor float64 and the column
# holds exactly two distinct values. nunique() is used instead of
# len(unique()) because nunique() skips missing values by default, whereas
# unique() would count NaN as one more category.
binary_cols = [
    name
    for name in df.columns
    if df[name].dtype not in ["int64", "float64"] and df[name].nunique() == 2
]

# label_encoder writes the codes back into df, so its return value is not needed.
for col in binary_cols:
    label_encoder(df, col)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


2. One Hot Encoding (dummy variables)


In [4]:
# Category frequencies of "Embarked" before one-hot encoding it
# (value_counts() leaves out missing values by default).
df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [5]:
# Reload the raw data: the label-encoding loop above overwrote columns of df in place.
df = load()
# The result is only displayed, not assigned, so df itself keeps "Embarked".
pd.get_dummies(df, columns=["Embarked"]).head() 
## drop_first=True drops the first dummy level of each encoded column; that level
##                 is implied by the remaining ones, so dropping it removes the
##                 perfect collinearity between the dummy columns
## dummy_na=True   adds an extra indicator column for missing values

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,False,False,True


In [6]:
# With drop_first=True a two-level column collapses into a single indicator
# (here Sex_male), which is equivalent to binary encoding.
pd.get_dummies(df, columns=["Sex"], drop_first=True).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.283,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,True


### Feature Scaling -----------
- Many ML algorithms (for example, classifiers that compute distances between
samples) require the data to be normalized.
- Gradient descent converges faster when the features are scaled.

Using:
1. Standard Scaler
2. Robust Scaler
3. MinMax Scaler


In [7]:
## Standard Scaler
## z = (x - mean(x)) / std(x)
## StandardScaler divides by the population standard deviation (ddof=0), while
## describe() reports the sample standard deviation (ddof=1); that is why the
## scaled column shows a std of about 1.001 rather than exactly 1 further down.

ss = StandardScaler()
# Double brackets select a one-column DataFrame: the scaler expects 2-D input.
df["Age_standard_scaled"] = ss.fit_transform(df[["Age"]])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_standard_scaled
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,-0.53
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,0.572
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,-0.255
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.365
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.365


In [8]:
## Robust Scaler
## Accounts for outliers: centers on the median and scales by the interquartile
## range, statistics that a few extreme values barely move.
## z = (x - median(x)) / IQR(x)

rs = RobustScaler()
df["Age_robust_scaled"] = rs.fit_transform(df[["Age"]])
# Transposed summary: one row per numeric column, including the scaled ones.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.354,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.384,0.487,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.309,0.836,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699,14.526,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523,1.103,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.382,0.806,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204,49.693,0.0,7.91,14.454,31.0,512.329
Age_standard_scaled,714.0,0.0,1.001,-2.017,-0.66,-0.117,0.572,3.465
Age_robust_scaled,714.0,0.095,0.813,-1.543,-0.441,0.0,0.559,2.909


In [9]:
## MinMax Scaler
## z = (x - min(x)) / (max(x) - min(x))
## Rescales the column to the [0, 1] range (the MinMaxScaler default).
mms = MinMaxScaler()
df["Age_minmax_scaled"] = mms.fit_transform(df[["Age"]])
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.354,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.384,0.487,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.309,0.836,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699,14.526,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523,1.103,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.382,0.806,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204,49.693,0.0,7.91,14.454,31.0,512.329
Age_standard_scaled,714.0,0.0,1.001,-2.017,-0.66,-0.117,0.572,3.465
Age_robust_scaled,714.0,0.095,0.813,-1.543,-0.441,0.0,0.559,2.909
Age_minmax_scaled,714.0,0.368,0.183,0.0,0.248,0.347,0.472,1.0
