Encoding categorical features in scikit-learn

The preprocessing module of scikit-learn supports ordinal encoding through the OrdinalEncoder class, and one-hot and binary (dummy) encoding through the OneHotEncoder class. A common parameter for OneHotEncoder() is drop, which removes one of the binary features generated for each categorical feature. This parameter is often set to 'first', which drops the first category, but can also be set to an array that names the specific category to drop for each feature. Each encoder needs to be instantiated and then fitted to the data. Additional details can be found in the OrdinalEncoder() documentation and OneHotEncoder() documentation.

In [1]:
# Import all libraries
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pandas as pd
import numpy as np

In [13]:
# Read the abalone data set and keep the 'sex' column as a one-column DataFrame
# (the encoders below are fitted on a 2-D, single-feature frame)
abalone = pd.read_csv('abalone.csv')
abalone_sex = abalone.loc[:, ['sex']]

In [14]:
# Instantiate an OrdinalEncoder, fit it to the 'sex' column, and store the
# resulting codes in a one-column frame named 'label'
ordinal_encoder = OrdinalEncoder()
labels = pd.DataFrame(
    data=ordinal_encoder.fit_transform(abalone_sex[['sex']]),
    columns=['label'],
)
# Attach the codes to the original column (joined on the row index)
abalone_sex = abalone_sex.join(labels)

In [15]:
# Display the frame, which now carries the ordinal code in the 'label' column
abalone_sex

Unnamed: 0,sex,label
0,M,2.0
1,M,2.0
2,F,0.0
3,M,2.0
4,I,1.0
...,...,...
4172,F,0.0
4173,M,2.0
4174,M,2.0
4175,F,0.0


In [5]:
# Toy input: one row per category, shaped (3, 1) as a single-feature column
data = np.array(['M', 'F', 'I']).reshape(-1, 1)

In [6]:
# Encode the toy array. Note that fit_transform refits ordinal_encoder, so the
# categories it holds afterwards are the ones learned from `data`.
result = ordinal_encoder.fit_transform(data)

In [7]:
# Display the toy input array
data

array([['M'],
       ['F'],
       ['I']], dtype='<U1')

In [8]:
# Display the ordinal codes produced for the toy array (one code per input row)
result

array([[2.],
       [0.],
       [1.]])

In [9]:
# Instantiate a OneHotEncoder that returns a dense array and fit it to the
# abalone_sex data.
# NOTE: `sparse_output` replaced the `sparse` keyword in scikit-learn 1.2, and
# `sparse` was removed in 1.4; on scikit-learn < 1.2 use `sparse=False` instead.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(abalone_sex[['sex']])
onehot_labels = pd.DataFrame(
    onehot_encoder.transform(abalone_sex[['sex']]),
    # Name each indicator column after the category it encodes instead of
    # hard-coding the order the encoder happens to use
    columns=onehot_encoder.categories_[0],
    # Keep the rows aligned with abalone_sex for the join below
    index=abalone_sex.index,
)
abalone_sex = abalone_sex.join(onehot_labels)
abalone_sex



Unnamed: 0,sex,label,F,I,M
0,M,2.0,0.0,0.0,1.0
1,M,2.0,0.0,0.0,1.0
2,F,0.0,1.0,0.0,0.0
3,M,2.0,0.0,0.0,1.0
4,I,1.0,0.0,1.0,0.0
...,...,...,...,...,...
4172,F,0.0,1.0,0.0,0.0
4173,M,2.0,0.0,0.0,1.0
4174,M,2.0,0.0,0.0,1.0
4175,F,0.0,1.0,0.0,0.0


In [10]:
# Display the toy input array again for comparison with its encoded form
data

array([['M'],
       ['F'],
       ['I']], dtype='<U1')

In [11]:
# Refit the one-hot encoder on the toy array and show the encoding: each row
# has a single 1 in the column of its category
onehot_encoder.fit_transform(data)



array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [12]:
# Instantiate a drop-first (dummy) encoder and fit it to the abalone_sex data.
# drop='first' removes the indicator of the first category, so that category
# is represented by all zeros in the remaining columns.
# NOTE: `sparse_output` replaced the `sparse` keyword in scikit-learn 1.2, and
# `sparse` was removed in 1.4; on scikit-learn < 1.2 use `sparse=False` instead.
binary_encoder = OneHotEncoder(drop='first', sparse_output=False)
binary_labels = pd.DataFrame(
    binary_encoder.fit_transform(abalone_sex[['sex']]),
    columns=['D1', 'D2'],
    # Keep the rows aligned with abalone_sex for the join below
    index=abalone_sex.index,
)
abalone_sex = abalone_sex.join(binary_labels)
abalone_sex



Unnamed: 0,sex,label,F,I,M,D1,D2
0,M,2.0,0.0,0.0,1.0,0.0,1.0
1,M,2.0,0.0,0.0,1.0,0.0,1.0
2,F,0.0,1.0,0.0,0.0,0.0,0.0
3,M,2.0,0.0,0.0,1.0,0.0,1.0
4,I,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
4172,F,0.0,1.0,0.0,0.0,0.0,0.0
4173,M,2.0,0.0,0.0,1.0,0.0,1.0
4174,M,2.0,0.0,0.0,1.0,0.0,1.0
4175,F,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
# Display the toy input array again for comparison with its encoded form
data

array([['M'],
       ['F'],
       ['I']], dtype='<U1')

In [14]:
# Refit the drop-first encoder on the toy array and show the encoding: the
# dropped first category appears as a row of zeros
binary_encoder.fit_transform(data)



array([[0., 1.],
       [0., 0.],
       [1., 0.]])