## Nominal

In [81]:
# One-hot encode
import numpy as np
import pandas as pd

In [82]:
# Nominal feature: one fruit label per row, shaped as a single-column table.
fruit_names = ['Apple', 'Banana', 'Cytrus']
fruits = np.array([[name] for name in fruit_names])

In [83]:
# Wrap the label array in a frame with a single 'Name' column.
df = pd.DataFrame(data=fruits, columns=['Name'])

In [84]:
# Display the frame to check its single 'Name' column.
df

Unnamed: 0,Name
0,Apple
1,Banana
2,Cytrus


In [85]:
from sklearn.preprocessing import LabelBinarizer

In [86]:
# One-hot encode the nominal labels.
# LabelBinarizer takes a 1-D sequence of labels (a 2-D input is reserved for
# 0/1 multilabel indicator matrices), so pass the 'Name' column itself rather
# than the whole frame.
lb = LabelBinarizer()
lb.fit_transform(df['Name'])

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [87]:
# Labels learned by the binarizer during fitting.
lb.classes_

array(['Apple', 'Banana', 'Cytrus'], dtype='<U6')

## Ordinal

In [88]:
# Ordinal feature: survey reactions, one per row in a single column.
reactions = ['Agree', 'Neutral', 'Disagree']
feedback = np.array(reactions).reshape(-1, 1)
df = pd.DataFrame(data=feedback, columns=['Reaction'])

In [89]:
# Display the frame to check its single 'Reaction' column.
df

Unnamed: 0,Reaction
0,Agree
1,Neutral
2,Disagree


In [90]:
# Encode each ordinal label as its numeric rank.
# Series.map is used instead of replace: it yields a numeric column directly
# (no reliance on replace's implicit object-to-int downcasting), and any label
# missing from the mapping shows up as NaN instead of staying a string.
reaction_scale = {
    'Agree': 1,
    'Neutral': 0,
    'Disagree': -1,
}
df['Reaction'].map(reaction_scale)

0    1
1    0
2   -1
Name: Reaction, dtype: int64

## Scaling

In [91]:
# min-max scaling
# One numeric feature (a single column) with values on very different scales.
arr = np.array([-10000, 5, 30000000]).reshape(-1, 1)

In [92]:
from sklearn.preprocessing import MinMaxScaler

In [93]:
# Rescale the feature linearly into the [0, 1] range.
arr_frame = pd.DataFrame(arr)
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit_transform(arr_frame)

array([[0.0000000e+00],
       [3.3338887e-04],
       [1.0000000e+00]])

In [94]:
# standard scaling
from sklearn.preprocessing import StandardScaler

In [95]:
# Standardise the same feature with a default StandardScaler.
ss = StandardScaler()
features = pd.DataFrame(data=arr)
ss.fit_transform(features)

array([[-0.70746042],
       [-0.70675308],
       [ 1.4142135 ]])

## Outliers

In [96]:
# create one!
# Toy listings; the third row is deliberately extreme in both columns.
houses = pd.DataFrame(
    {
        "Bathrooms": [2, 1, 300],
        "Price": [1500, 1200, 100000],
    }
)

houses

Unnamed: 0,Bathrooms,Price
0,2,1500
1,1,1200
2,300,100000


In [97]:
# Clearly, a house with 300 bathrooms is a hotel,
# hence an outlier.

In [98]:
# Way 1 : remove outlier
# Keep only the rows whose bathroom count is at or below the cut-off.
threshold = 3
within_limit = houses["Bathrooms"] <= threshold
houses[within_limit]

Unnamed: 0,Bathrooms,Price
0,2,1500
1,1,1200


In [99]:
# Way 2 : mark as outlier
# Boolean flag column: True where the bathroom count exceeds the cut-off.
above_threshold = houses["Bathrooms"] > threshold
houses["Outlier"] = above_threshold
houses

Unnamed: 0,Bathrooms,Price,Outlier
0,2,1500,False
1,1,1200,False
2,300,100000,True


In [100]:
# also can use np.where
# Same flag, stored as 1/0 integers instead of booleans.
exceeds_threshold = houses["Bathrooms"] > threshold
houses["Outlier"] = np.where(exceeds_threshold, 1, 0)
houses

Unnamed: 0,Bathrooms,Price,Outlier
0,2,1500,0
1,1,1200,0
2,300,100000,1


In [101]:
# Detect outlier
from sklearn.covariance import EllipticEnvelope

In [102]:
# Detect outliers from the measured features only. By this point `houses`
# also carries the hand-made "Outlier" flag column, which must not be fed to
# the detector it is supposed to be compared against.
feature_cols = ["Bathrooms", "Price"]
# contamination is the assumed share of outliers in the data; random_state
# pins the estimator's pseudo-random number generator so reruns agree.
ee = EllipticEnvelope(contamination=0.1, random_state=0)
ee.fit(houses[feature_cols])
# predict labels inliers as 1 and outliers as -1.
ee.predict(houses[feature_cols])

array([ 1,  1, -1])

## Bins

In [103]:
# Values to be bucketed, one per row in a single column.
arr = np.array([33, 67, 90, 13]).reshape(-1, 1)

In [104]:
# right=True makes each edge the inclusive upper bound of its bin,
# so a value equal to 33 lands in bin 0.
bin_edges = [33, 75, 100]
np.digitize(arr, bins=bin_edges, right=True)

array([[0],
       [1],
       [2],
       [0]])

## Predicting missing values

In [105]:
# Two-column matrix; the last row is missing its first value.
rows = [
    [101, 1010],
    [11.1, 111],
    [np.nan, 130],
]
arr = np.array(rows)

In [106]:
# we can drop also
# Wrap the array in a frame, then show it without any row containing NaN.
df = pd.DataFrame(data=arr)
df.dropna(axis=0, how='any')

Unnamed: 0,0,1
0,101.0,1010.0
1,11.1,111.0


In [107]:
from sklearn.impute import SimpleImputer

In [108]:
# default strategy = mean
# Learn the per-column fill values first, then apply them to the same frame.
ip = SimpleImputer()
ip.fit(df)
ip.transform(df)

array([[ 101.  , 1010.  ],
       [  11.1 ,  111.  ],
       [  56.05,  130.  ]])

In [109]:
# default strategy = mean
# NOTE(review): this cell repeats the previous one; it may have been meant to
# demonstrate a different strategy — confirm.
ip = SimpleImputer(strategy='mean')
ip.fit_transform(df)

array([[ 101.  , 1010.  ],
       [  11.1 ,  111.  ],
       [  56.05,  130.  ]])

In [110]:
# Since data small, can use KNN
from sklearn.impute import KNNImputer

In [111]:
# Fill the missing entry with a KNN imputer left at its default settings.
knn = KNNImputer()
knn.fit_transform(df)

array([[ 101.  , 1010.  ],
       [  11.1 ,  111.  ],
       [  56.05,  130.  ]])

*By Gaurav Kabra*