In [None]:
# Source article (Towards Data Science): https://towardsdatascience.com/robust-one-hot-encoding-930b5f8943af

In [1]:
import pandas as pd

# Build the training set: one numeric feature plus two categorical color features
training_data = pd.DataFrame(
    dict(
        numerical_1=list(range(1, 9)),
        color_1_=['black', 'black', 'red', 'green', 'green', 'black', 'red', 'blue'],
        color_2_=['black', 'blue', 'pink', 'purple'] * 2,
    )
)

# Print a plain-text view of the training set
print(training_data)

   numerical_1 color_1_ color_2_
0            1    black    black
1            2    black     blue
2            3      red     pink
3            4    green   purple
4            5    green    black
5            6    black     blue
6            7      red     pink
7            8     blue   purple


In [2]:
# Build the inference set with the same three columns as training_data.
# Its color_2_ column contains 'orange', a value that never occurs in training_data.
inference_data = pd.DataFrame(
    dict(
        numerical_1=list(range(11, 19)),
        color_1_=['black', 'blue', 'black', 'green', 'green', 'black', 'black', 'blue'],
        color_2_=['orange', 'orange', 'black', 'orange', 'black', 'orange', 'orange', 'orange'],
    )
)
print(inference_data)

   numerical_1 color_1_ color_2_
0           11    black   orange
1           12     blue   orange
2           13    black    black
3           14    green   orange
4           15    green    black
5           16    black   orange
6           17    black   orange
7           18     blue   orange


In [3]:
# One-hot encode the categorical columns of inference_data with pandas.
# dtype=int makes get_dummies emit 0/1 integer indicators directly, so only the
# generated dummy columns are typed as int. A trailing .astype(int) on the result
# would instead cast every column of the frame, including the columns that were
# not encoded (truncating floats, or raising on NaN / non-numeric values).
inference_data_dummies = pd.get_dummies(
    inference_data,
    columns=['color_1_', 'color_2_'],
    dtype=int,
)

In [4]:
# Show the pandas-encoded inference frame: get_dummies produced one indicator
# column per category value that actually occurs in inference_data.
inference_data_dummies

Unnamed: 0,numerical_1,color_1__black,color_1__blue,color_1__green,color_2__black,color_2__orange
0,11,1,0,0,0,1
1,12,0,1,0,0,1
2,13,1,0,0,1,0
3,14,0,0,1,0,1
4,15,0,0,1,1,0
5,16,1,0,0,0,1
6,17,1,0,0,0,1
7,18,0,1,0,0,1


In [5]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the encoder; handle_unknown='ignore' makes transform() encode a
# category that was not seen during fit as all zeros instead of raising
enc = OneHotEncoder(handle_unknown='ignore')

# Define columns to transform
trans_columns = ['color_1_', 'color_2_']

# Fit the encoder on the training data and transform it in one step
enc_data = enc.fit_transform(training_data[trans_columns])

# Get feature names
feature_names = enc.get_feature_names_out(trans_columns)

# Convert to a dense DataFrame. Reuse the index of training_data: pd.concat
# with axis=1 aligns rows by index label, so a fresh default RangeIndex here
# would misalign (and NaN-pad) rows whenever training_data is not indexed 0..n-1.
enc_df = pd.DataFrame(enc_data.toarray(),
                      columns=feature_names,
                      index=training_data.index)

# Concatenate with the numerical data
final_df = pd.concat([training_data[['numerical_1']],
                      enc_df], axis=1)

In [6]:
# Show the encoded training frame: numerical_1 followed by the one-hot
# feature columns produced by the fitted encoder.
final_df

Unnamed: 0,numerical_1,color_1__black,color_1__blue,color_1__green,color_1__red,color_2__black,color_2__blue,color_2__pink,color_2__purple
0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,7,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Transform inference data with the encoder that was already fitted on the
# training data (transform only, no refit), so the output columns match training
inference_encoded = enc.transform(inference_data[trans_columns])

# Feature names come from the fitted encoder, not from the inference data
inference_feature_names = enc.get_feature_names_out(trans_columns)

# Convert to a dense DataFrame. Reuse the index of inference_data: pd.concat
# with axis=1 aligns rows by index label, so a fresh default RangeIndex here
# would misalign (and NaN-pad) rows whenever inference_data is not indexed 0..n-1.
inference_encoded_df = pd.DataFrame(inference_encoded.toarray(),
                                    columns=inference_feature_names,
                                    index=inference_data.index)

# Concatenate with the numerical data
final_inference_df = pd.concat([inference_data[['numerical_1']],
                                inference_encoded_df], axis=1)

In [9]:
# Show the encoded inference frame: numerical_1 followed by the encoder's
# feature columns. Rows whose color_2_ is 'orange' (not in the training data)
# have 0.0 in every color_2_ column.
final_inference_df

Unnamed: 0,numerical_1,color_1__black,color_1__blue,color_1__green,color_1__red,color_2__black,color_2__blue,color_2__pink,color_2__purple
0,11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,14,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,15,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,16,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,18,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
