In [5]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load seaborn's 'penguins' example dataset into a DataFrame and preview
# its first three rows.
df = sns.load_dataset(name='penguins')
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


### One-hot encoding with a single column

In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the single 'island' column with a standalone OneHotEncoder.
# The column's values are reshaped to (n_rows, 1) because the encoder is fed
# a 2-D array here.
island_column = df['island'].values.reshape(-1, 1)
ohe = OneHotEncoder()
transformed = ohe.fit_transform(island_column)

# Densify the encoder output once and reuse it for both the printout and the
# new DataFrame columns.
island_dense = transformed.toarray()

print(island_dense)                           # encoded matrix, one column per category
print(ohe.categories_)                        # categories found for each input column
print(ohe.get_feature_names_out(['island']))  # output names prefixed with 'island_'

# Append the indicator columns to df (in place), labelled with the bare
# category values taken from the first -- and only -- entry of categories_.
island_labels = ohe.categories_[0]
df[island_labels] = island_dense
df.head(3)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
[array(['Biscoe', 'Dream', 'Torgersen'], dtype=object)]
['island_Biscoe' 'island_Dream' 'island_Torgersen']


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,0.0,1.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,0.0,1.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0.0,0.0,1.0


### One-hot encoding with a single column and a column transformer

In [8]:
import seaborn as sns
from sklearn.compose import make_column_transformer

# Reload the raw data so this cell does not depend on the indicator columns
# that were added to df earlier.
df = sns.load_dataset('penguins')

# Encode only 'island'; every other column is passed through unchanged.
transformer = make_column_transformer(
    (OneHotEncoder(), ['island']),
    remainder='passthrough'
)

transformed = transformer.fit_transform(df)
print(transformed)

# `transformed` is an object-dtype array here, because string columns
# ('species', 'sex') are passed through next to the numeric ones. A frame
# built from it would therefore have object dtype in every column;
# infer_objects() restores numeric dtypes for the columns that hold only
# numbers, so later numeric operations behave as expected.
transformed_df = pd.DataFrame(
    transformed, columns=transformer.get_feature_names_out()
).infer_objects()
transformed_df.head(3)

[[0.0 0.0 1.0 ... 181.0 3750.0 'Male']
 [0.0 0.0 1.0 ... 186.0 3800.0 'Female']
 [0.0 0.0 1.0 ... 195.0 3250.0 'Female']
 ...
 [1.0 0.0 0.0 ... 222.0 5750.0 'Male']
 [1.0 0.0 0.0 ... 212.0 5200.0 'Female']
 [1.0 0.0 0.0 ... 213.0 5400.0 'Male']]


Unnamed: 0,onehotencoder__island_Biscoe,onehotencoder__island_Dream,onehotencoder__island_Torgersen,remainder__species,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g,remainder__sex
0,0.0,0.0,1.0,Adelie,39.1,18.7,181.0,3750.0,Male
1,0.0,0.0,1.0,Adelie,39.5,17.4,186.0,3800.0,Female
2,0.0,0.0,1.0,Adelie,40.3,18.0,195.0,3250.0,Female


### One-hot encoding with multiple columns and a column transformer

In [9]:
import seaborn as sns
from sklearn.compose import make_column_transformer

df = sns.load_dataset('penguins')

# Report the number of missing values in each column before cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Drop every row that contains a NaN. Otherwise NaN would be treated as one
# more category (a distinct value of the feature) by the encoder.
df = df.dropna()

# One-hot encode the three categorical columns; the remaining columns are
# passed through unchanged.
categorical_columns = ['island', 'sex', 'species']
transformer = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
)

transformed = transformer.fit_transform(df)
feature_names = transformer.get_feature_names_out()
transformed_df = pd.DataFrame(data=transformed, columns=feature_names)
transformed_df.head(n=3)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


Unnamed: 0,onehotencoder__island_Biscoe,onehotencoder__island_Dream,onehotencoder__island_Torgersen,onehotencoder__sex_Female,onehotencoder__sex_Male,onehotencoder__species_Adelie,onehotencoder__species_Chinstrap,onehotencoder__species_Gentoo,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,39.1,18.7,181.0,3750.0
1,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,39.5,17.4,186.0,3800.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,40.3,18.0,195.0,3250.0
