In [3]:
#see https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features 
#(6.3.4. Encoding categorical features)
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load the penguins data and attach an integer code for every species label.
df = sns.load_dataset("penguins")

# Fit the encoder on the species column, then map each row to its integer
# code. For this data the mapping is Adelie -> 0, Chinstrap -> 1, Gentoo -> 2.
le = LabelEncoder().fit(df['species'])
y_encoded = le.transform(df['species'])
df['class_encoded'] = y_encoded

# One representative row per species: the first row of each group, with the
# original row labels kept.
samples = df.groupby('species').head(1)

# Full view: every feature column plus the new integer code.
print("Display class (species) label and encoded classes with all features", end="\n\n")
print(samples, end="\n\n")

# Narrow view: only the integer code next to the species name.
print("Display class (species) label and encoded classes", end="\n\n")
print(samples.loc[:, ['class_encoded', 'species']])

Display class (species) label and encoded classes with all features

       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7              181.0   
152  Chinstrap      Dream            46.5           17.9              192.0   
220     Gentoo     Biscoe            46.1           13.2              211.0   

     body_mass_g     sex  class_encoded  
0         3750.0    Male              0  
152       3500.0  Female              1  
220       4500.0  Female              2  

Display class (species) label and encoded classes

     class_encoded    species
0                0     Adelie
152              1  Chinstrap
220              2     Gentoo


In [4]:
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# One-hot encode the species column and show one example row per species.
df = sns.load_dataset("penguins")

# One-hot encoding for species column.
# OneHotEncoder expects 2-D input, hence the double brackets around 'species'.
# `sparse_output=False` requests a dense array; it replaces the old `sparse`
# keyword, which was renamed in scikit-learn 1.2 and has since been removed.
ohe = OneHotEncoder(sparse_output=False)
species_encoded = ohe.fit_transform(df[['species']])

# `get_feature_names_out` replaces `get_feature_names`, which was removed in
# scikit-learn 1.2. Reusing df.index keeps the concat below aligned row by row
# even if df ever carries a non-default index.
species_encoded_df = pd.DataFrame(
    species_encoded,
    columns=ohe.get_feature_names_out(['species']),
    index=df.index,
)

# Combine the one-hot encoded species with the original DataFrame
df_encoded = pd.concat([df, species_encoded_df], axis=1)

# Select the first sample (a whole row) from each species.
# groupby(...).first() would return the first non-null value of every column,
# which can stitch together values from different rows; head(1) keeps rows
# intact. sort_values + reset_index reproduce the species-ordered 0..n-1
# layout that .first().reset_index() produced.
samples = (
    df_encoded.groupby('species')
    .head(1)
    .sort_values('species')
    .reset_index(drop=True)
)

# Display the samples
print("Display class (species) label and encoded classes with all features")
print()
print(samples)
print()
print("Display class (species) label and encoded classes")
# Display only the one-hot columns next to the species name
print()
print(samples[['species_Adelie',  'species_Chinstrap',  'species_Gentoo', 'species']])

Display class (species) label and encoded classes with all features

     species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0     Adelie  Torgersen            39.1           18.7              181.0   
1  Chinstrap      Dream            46.5           17.9              192.0   
2     Gentoo     Biscoe            46.1           13.2              211.0   

   body_mass_g     sex  species_Adelie  species_Chinstrap  species_Gentoo  
0       3750.0    Male             1.0                0.0             0.0  
1       3500.0  Female             0.0                1.0             0.0  
2       4500.0  Female             0.0                0.0             1.0  

Display class (species) label and encoded classes

   species_Adelie  species_Chinstrap  species_Gentoo    species
0             1.0                0.0             0.0     Adelie
1             0.0                1.0             0.0  Chinstrap
2             0.0                0.0             1.0     Gentoo




In [12]:
# Also it is possible to use pd.factorize (a pandas-only way to get integer
# class codes).
# NOTE: this cell relies on `df` and `pd` defined by the cells above; run them
# first on a fresh kernel.
y = df['species']


# Convert categorical values to numerical labels.
# pd.factorize returns (codes, uniques); `y_unique` holds the distinct labels
# and is not used further in this cell.
y_labels, y_unique = pd.factorize(y)

df['y_labels'] = y_labels

# First sample (a whole row) from each species.
# groupby(...).first() would return the first non-null value of every column,
# which can mix values from different rows; head(1) keeps rows intact.
# sort_values + reset_index reproduce the species-ordered 0..n-1 layout that
# .first().reset_index() produced.
samples = (
    df.groupby('species')
    .head(1)
    .sort_values('species')
    .reset_index(drop=True)
)

# Display the samples
print("Display class (species) label and encoded classes with all features")
print()
print(samples)
print()



Display class (species) label and encoded classes with all features

     species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0     Adelie  Torgersen            39.1           18.7              181.0   
1  Chinstrap      Dream            46.5           17.9              192.0   
2     Gentoo     Biscoe            46.1           13.2              211.0   

   body_mass_g     sex  y_labels  
0       3750.0    Male         0  
1       3500.0  Female         1  
2       4500.0  Female         2  

