In [1]:
from sklearn.preprocessing import OneHotEncoder
from seaborn import load_dataset

In [33]:
# load the dataset into a Pandas DataFrame
df = load_dataset("penguins")

In [34]:
backup = df.copy()

In [4]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### Using OneHotEncoder

In [5]:
# Initialized a OneHotEncoder object
ohe = OneHotEncoder()

In [10]:
# fitted and transformed our data
transformed = ohe.fit_transform(df[['island']])

In [13]:
# returned the array version of the tranformed data using the `.toarray()` method
print(transformed.toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


We can see that each of the resulting three columns are binary values. There are three columns in the array, because there are three unique values in the Island column. The columns are returned alphabetically.

We can access the column labels using the `.categories_` attribute of the encoder:

In [15]:
# Getting one hot encoder categories
print(ohe.categories_)
print(ohe.categories_[0])

[array(['Biscoe', 'Dream', 'Torgersen'], dtype=object)]
['Biscoe' 'Dream' 'Torgersen']


I we wanted to build these columns back into the DataFrame, we could add them as separate columns:

In [16]:
df[ohe.categories_[0]] = transformed.toarray()
print(df.head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  Biscoe  Dream  Torgersen  
0       3750.0    Male     0.0    0.0        1.0  
1       3800.0  Female     0.0    0.0        1.0  
2       3250.0  Female     0.0    0.0        1.0  
3          NaN     NaN     0.0    0.0        1.0  
4       3450.0  Female     0.0    0.0        1.0  


### How to use ColumnTransformer with OneHotEncoder

Sklearn comes with a helper function `make_column_transformer()` which aids in the transformations of columns. The function generates `ColumnTransformer` objects for you and handles the transformations.

This allows us to simply pass in a list of tranformations we want to do and the columns to which we want to apply them. It also handles the process of adding the data back into the original dataset.

In [17]:
# Using  make_column_transformer to One-Hot Encode
from sklearn.compose import make_column_transformer
import pandas as pd

In [18]:
# `remainder="passthrough"` parameter to specify that all other columns should be left untouched
transformer = make_column_transformer(
    (OneHotEncoder(), ['island']),
    remainder="passthrough"
)

In [19]:
transformed = transformer.fit_transform(df)

In [20]:
transformed

array([[0.0, 0.0, 1.0, ..., 0.0, 0.0, 1.0],
       [0.0, 0.0, 1.0, ..., 0.0, 0.0, 1.0],
       [0.0, 0.0, 1.0, ..., 0.0, 0.0, 1.0],
       ...,
       [1.0, 0.0, 0.0, ..., 1.0, 0.0, 0.0],
       [1.0, 0.0, 0.0, ..., 1.0, 0.0, 0.0],
       [1.0, 0.0, 0.0, ..., 1.0, 0.0, 0.0]], dtype=object)

In [21]:
transformed_df = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out()
)

In [22]:
transformed_df.head()

Unnamed: 0,onehotencoder__island_Biscoe,onehotencoder__island_Dream,onehotencoder__island_Torgersen,remainder__species,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g,remainder__sex,remainder__Biscoe,remainder__Dream,remainder__Torgersen
0,0.0,0.0,1.0,Adelie,39.1,18.7,181.0,3750.0,Male,0.0,0.0,1.0
1,0.0,0.0,1.0,Adelie,39.5,17.4,186.0,3800.0,Female,0.0,0.0,1.0
2,0.0,0.0,1.0,Adelie,40.3,18.0,195.0,3250.0,Female,0.0,0.0,1.0
3,0.0,0.0,1.0,Adelie,,,,,,0.0,0.0,1.0
4,0.0,0.0,1.0,Adelie,36.7,19.3,193.0,3450.0,Female,0.0,0.0,1.0


### How to One-Hot Encode Multiple columns with Scikit-Learn

In [35]:
df = backup

In [36]:
features = ['island', 'sex', 'body_mass_g']
df = df[features].dropna()

In [37]:
df.head()

Unnamed: 0,island,sex,body_mass_g
0,Torgersen,Male,3750.0
1,Torgersen,Female,3800.0
2,Torgersen,Female,3250.0
4,Torgersen,Female,3450.0
5,Torgersen,Male,3650.0


In [38]:
transformer = make_column_transformer(
    (OneHotEncoder(), ['island', 'sex']),
    remainder="passthrough"
)

In [39]:
transformed = transformer.fit_transform(df)

In [42]:
transformed_df = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out()
)

In [43]:
print(transformed_df)

     onehotencoder__island_Biscoe  onehotencoder__island_Dream  \
0                             0.0                          0.0   
1                             0.0                          0.0   
2                             0.0                          0.0   
3                             0.0                          0.0   
4                             0.0                          0.0   
..                            ...                          ...   
328                           1.0                          0.0   
329                           1.0                          0.0   
330                           1.0                          0.0   
331                           1.0                          0.0   
332                           1.0                          0.0   

     onehotencoder__island_Torgersen  onehotencoder__sex_Female  \
0                                1.0                        0.0   
1                                1.0                        1.0   
2     