In [2]:
# importing necessary module
import pandas as pd
from sklearn.preprocessing import OneHotEncoder      # For One-Hot encoding

#### 1. Using Pandas

In [3]:
# Toy dataset, written row by row: one categorical feature (Fruit)
# and one numeric feature (Price).
one_data = pd.DataFrame(
    [
        ('Apple', 1.2),
        ('Banana', 0.5),
        ('Orange', 0.8),
        ('Apple', 1.3),
        ('Orange', 0.9),
        ('Banana', 0.6),
    ],
    columns=['Fruit', 'Price'],
)

one_data

Unnamed: 0,Fruit,Price
0,Apple,1.2
1,Banana,0.5
2,Orange,0.8
3,Apple,1.3
4,Orange,0.9
5,Banana,0.6


In [5]:
# One-hot encode the 'Fruit' column with pandas: every distinct fruit gets its
# own indicator column, while the remaining columns are carried over as-is.
df_pandas_encode = pd.get_dummies(data=one_data, columns=['Fruit'])
df_pandas_encode

Unnamed: 0,Price,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.2,True,False,False
1,0.5,False,True,False
2,0.8,False,False,True
3,1.3,True,False,False
4,0.9,False,False,True
5,0.6,False,True,False


We can observe that we now have 3 Fruit columns in the data. **However, a feature with n unique labels can be fully described using only n-1 columns.** For example, if we keep only the Fruit_Banana and Fruit_Orange columns, we still convey all of the information: when Fruit_Banana is True the fruit is Banana, when Fruit_Orange is True the fruit is Orange, and when both Fruit_Banana and Fruit_Orange are False the fruit must be Apple. This way we can encode the categorical data and reduce the number of parameters as well.
<br>
For this, `get_dummies` has a parameter called `drop_first`. If we set `drop_first=True`, the column for the first category of each encoded feature is dropped.

In [8]:
# Same encoding as before, but with the first level ('Fruit_Apple') dropped,
# so the three fruit categories are represented by two indicator columns.
df_pandas_encode_rf = pd.get_dummies(
    data=one_data,
    columns=['Fruit'],
    drop_first=True,
)
df_pandas_encode_rf

Unnamed: 0,Price,Fruit_Banana,Fruit_Orange
0,1.2,False,False
1,0.5,True,False
2,0.8,False,True
3,1.3,False,False
4,0.9,False,True
5,0.6,True,False


#### 2. One-Hot Encoding using the Scikit-Learn Library

In [14]:
# Dense output (sparse_output=False) so the encoded result comes back as a
# plain NumPy array instead of a SciPy sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Collect the names of the categorical columns to encode. Besides 'object',
# also accept 'string' and 'category' dtypes: pandas may store text columns in
# a dedicated string dtype, and explicitly categorical columns are never
# 'object', so an object-only filter could silently return an empty list.
categorical_columns = one_data.select_dtypes(
    include=['object', 'string', 'category']
).columns.tolist()
categorical_columns

['Fruit']

In [16]:
# Show the original (un-encoded) frame again for reference.
one_data

Unnamed: 0,Fruit,Price
0,Apple,1.2
1,Banana,0.5
2,Orange,0.8
3,Apple,1.3
4,Orange,0.9
5,Banana,0.6


In [15]:
# Learn the categories present in the categorical column(s), then encode them.
# The encoder was created with sparse_output=False, so the result is a dense array
# with one row per sample and one column per learned category.
categorical_frame = one_data[categorical_columns]
encoder.fit(categorical_frame)
one_hot_encoded = encoder.transform(categorical_frame)
one_hot_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [18]:

# Wrap the encoded array in a DataFrame with readable column names
# (e.g. 'Fruit_Apple') generated by the fitted encoder.
# Reuse the source frame's index: the column-wise concat that follows aligns
# rows by index label, so a fresh 0..n-1 index would misalign rows (and
# introduce NaNs) whenever `one_data` has a non-default index, e.g. after
# filtering or a train/test split.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=one_data.index,
)
one_hot_df

Unnamed: 0,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,1.0,0.0


In [19]:

# Swap the raw categorical column(s) for their one-hot encoded counterparts:
# drop them from the original frame and place the encoded columns alongside
# the remaining ones (rows are matched by index label).
df_sklearn_encoded = pd.concat(
    [one_data.drop(columns=categorical_columns), one_hot_df],
    axis='columns',
)
df_sklearn_encoded

Unnamed: 0,Price,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.2,1.0,0.0,0.0
1,0.5,0.0,1.0,0.0
2,0.8,0.0,0.0,1.0
3,1.3,1.0,0.0,0.0
4,0.9,0.0,0.0,1.0
5,0.6,0.0,1.0,0.0


🧠 Summary Table<br>
| Step | Tool Used | Function | Output Type | Keeps Column Names | Drop First? |
|------|------------|-----------|---------------|--------------------|--------------|
| 1 | Pandas | `pd.get_dummies()` | DataFrame | ✅ Yes | ✅ Optional (`drop_first=True`) |
| 2 | Scikit-Learn | `OneHotEncoder()` | NumPy array (with `sparse_output=False`) | ❌ No (need `get_feature_names_out`) | ✅ Optional (`drop='first'`) |