In [30]:
# importing necessary module
import pandas as pd

#### 1. Using Pandas (Dummy Variable)
##### Example 1

In [12]:
# Toy dataset: one categorical feature ('Fruit') and one numeric feature ('Price').
one_data = pd.DataFrame(
    data=dict(
        Fruit=['Apple', 'Banana', 'Orange', 'Apple', 'Orange', 'Banana'],
        Price=[1.2, 0.5, 0.8, 1.3, 0.9, 0.6],
    )
)

# Show the raw frame before any encoding is applied.
one_data

Unnamed: 0,Fruit,Price
0,Apple,1.2
1,Banana,0.5
2,Orange,0.8
3,Apple,1.3
4,Orange,0.9
5,Banana,0.6


In [None]:
# One-hot encode 'Fruit' with pd.get_dummies(): every distinct fruit becomes
# its own indicator column, while the numeric 'Price' column is left as-is.
df_pandas_encode = pd.get_dummies(
    data=one_data,
    columns=['Fruit'],
)
df_pandas_encode

Unnamed: 0,Price,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.2,True,False,False
1,0.5,False,True,False
2,0.8,False,False,True
3,1.3,True,False,False
4,0.9,False,False,True
5,0.6,False,True,False


We can observe that we now have 3 Fruit columns in the data. **However, a feature with n unique labels can be fully represented with just n-1 columns.** For example, if we keep only the Fruit_Banana and Fruit_Orange columns, we still convey all the information: when Fruit_Banana is True the fruit is Banana, when Fruit_Orange is True the fruit is Orange, and when both Fruit_Banana and Fruit_Orange are False the fruit must be Apple. This way we can encode the categorical data and reduce the number of parameters as well.
<br>
For this, get_dummies has a parameter called drop_first. If we set drop_first=True, the indicator column for the first category of each encoded feature is dropped.

In [14]:
# Same encoding as before, but drop_first=True removes the indicator column of
# the first category, leaving n-1 columns for a feature with n labels.
df_pandas_encode_rf = pd.get_dummies(
    data=one_data,
    columns=['Fruit'],
    drop_first=True,
)
df_pandas_encode_rf

Unnamed: 0,Price,Fruit_Banana,Fruit_Orange
0,1.2,False,False
1,0.5,True,False
2,0.8,False,True
3,1.3,False,False
4,0.9,False,True
5,0.6,True,False


In [15]:
# Inspect the dtype of one generated indicator column.
df_pandas_encode.dtypes['Fruit_Apple']

dtype('bool')

🧠 1. Technically

Machine learning models can handle True/False values,<br>
because under the hood, they're stored as binary (0 and 1) anyway.<br>
<br>
So in most libraries (like scikit-learn, XGBoost, etc.),<br>
✅ True is automatically treated as 1<br>
✅ False is automatically treated as 0<br>
<br>
So — it will work.<br>
<br>
***
<br>
⚙️ 2. But Practically

It's better practice to explicitly convert them to int (0/1) because:<br>

* It removes confusion when inspecting the data.

* Some functions (especially mathematical operations or matrix manipulations) expect numeric input.

* It keeps your data consistent and avoids type warnings.

So while True/False works,<br>
👉 most data scientists prefer using 0/1.<br>
<br>
***
<br>
💡 Rule of thumb:<br>
<br>
Machine learning models accept Booleans, but prefer integers.

In [16]:
# Encode 'Fruit' again, this time asking for integer (0/1) indicator columns
# via dtype=int instead of the default booleans; the first category is dropped.
df_pandas_encode_int = pd.get_dummies(
    data=one_data,
    columns=['Fruit'],
    drop_first=True,
    dtype=int,
)
df_pandas_encode_int

Unnamed: 0,Price,Fruit_Banana,Fruit_Orange
0,1.2,0,0
1,0.5,1,0
2,0.8,0,1
3,1.3,0,0
4,0.9,0,1
5,0.6,1,0


##### Example 2.

In [17]:
# Second toy dataset, written one employee per row: a numeric id plus two
# categorical features ('Gender' and 'Remarks').
one_data_2 = pd.DataFrame(
    [
        (10, 'M', 'Good'),
        (20, 'F', 'Nice'),
        (15, 'F', 'Good'),
        (25, 'M', 'Great'),
        (30, 'F', 'Nice'),
    ],
    columns=['Employee id', 'Gender', 'Remarks'],
)
one_data_2

Unnamed: 0,Employee id,Gender,Remarks
0,10,M,Good
1,20,F,Nice
2,15,F,Good
3,25,M,Great
4,30,F,Nice


In [18]:
# One-hot encode both categorical columns at once; 'Employee id' is untouched.
one_data_2_encoded = pd.get_dummies(
    data=one_data_2,
    columns=['Gender', 'Remarks'],
)
one_data_2_encoded

Unnamed: 0,Employee id,Gender_F,Gender_M,Remarks_Good,Remarks_Great,Remarks_Nice
0,10,False,True,True,False,False
1,20,True,False,False,False,True
2,15,True,False,True,False,False
3,25,False,True,False,True,False
4,30,True,False,False,False,True


In [19]:
# Encode both categorical columns, dropping the first category of each
# feature so that every feature with n labels yields n-1 indicator columns.
one_data_2_encoded_rf = pd.get_dummies(
    data=one_data_2,
    columns=['Gender', 'Remarks'],
    drop_first=True,
)
one_data_2_encoded_rf

Unnamed: 0,Employee id,Gender_M,Remarks_Great,Remarks_Nice
0,10,True,False,False
1,20,False,False,True
2,15,False,False,False
3,25,True,True,False
4,30,False,False,True


#### 2. One Hot Encoding using Scikit Learn Library
##### Example 1

In [20]:
from sklearn.preprocessing import OneHotEncoder      # For One-Hot encoding

# sparse_output=False makes the encoder return a regular (dense) NumPy array
# instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Collect the names of the text columns that need encoding.
# 'object' covers classic Python-string columns; 'string' additionally covers
# columns stored with pandas' dedicated string dtype, which a plain
# include=['object'] filter can miss.
categorical_columns = one_data.select_dtypes(include=['object', 'string']).columns.tolist()
categorical_columns

['Fruit']

In [21]:
# Re-display the original frame for reference before encoding it with scikit-learn.
one_data

Unnamed: 0,Fruit,Price
0,Apple,1.2
1,Banana,0.5
2,Orange,0.8
3,Apple,1.3
4,Orange,0.9
5,Banana,0.6


In [22]:
# Fit the encoder on the categorical column(s) and transform them in one call.
# Because the encoder was created with sparse_output=False, the result is a
# plain NumPy array with one column per distinct category.
one_hot_encoded = encoder.fit_transform(one_data[categorical_columns])
one_hot_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [23]:

# Wrap the encoded array in a DataFrame, labelling each column with the
# "<feature>_<category>" names reported by the fitted encoder.
# The frame reuses one_data's index so that the column-wise pd.concat with
# one_data lines up row-for-row even if one_data does not carry the default
# 0..n-1 index (e.g. after filtering or sorting).
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=one_data.index,
)
one_hot_df

Unnamed: 0,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,1.0,0.0


In [24]:
# Swap the original text column(s) for their one-hot columns: remove the
# categorical columns, then place the encoded frame alongside column-wise.
df_sklearn_encoded = pd.concat(
    [one_data.drop(columns=categorical_columns), one_hot_df],
    axis='columns',
)
df_sklearn_encoded

Unnamed: 0,Price,Fruit_Apple,Fruit_Banana,Fruit_Orange
0,1.2,1.0,0.0,0.0
1,0.5,0.0,1.0,0.0
2,0.8,0.0,0.0,1.0
3,1.3,1.0,0.0,0.0
4,0.9,0.0,0.0,1.0
5,0.6,0.0,1.0,0.0


##### Example 2

In [25]:
# Fresh encoder for the second dataset. sparse_output=False makes it return a
# regular (dense) NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Collect the names of the text columns that need encoding.
# 'object' covers classic Python-string columns; 'string' additionally covers
# columns stored with pandas' dedicated string dtype, which a plain
# include=['object'] filter can miss.
categorical_columns = one_data_2.select_dtypes(include=['object', 'string']).columns.tolist()
categorical_columns

['Gender', 'Remarks']

In [26]:
# Re-display the second dataset for reference before encoding it with scikit-learn.
one_data_2

Unnamed: 0,Employee id,Gender,Remarks
0,10,M,Good
1,20,F,Nice
2,15,F,Good
3,25,M,Great
4,30,F,Nice


In [27]:
# Fit the encoder on both categorical columns and transform them in one call.
# The output array has one column per category of each feature, laid out
# feature by feature (Gender categories first, then Remarks categories).
one_hot_encoded_2 = encoder.fit_transform(one_data_2[categorical_columns])
one_hot_encoded_2

array([[0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.]])

In [28]:

# Wrap the encoded array in a DataFrame, labelling each column with the
# "<feature>_<category>" names reported by the fitted encoder.
# The frame reuses one_data_2's index so that the column-wise pd.concat with
# one_data_2 lines up row-for-row even if one_data_2 does not carry the
# default 0..n-1 index (e.g. after filtering or sorting).
one_hot_df_2 = pd.DataFrame(
    one_hot_encoded_2,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=one_data_2.index,
)
one_hot_df_2

Unnamed: 0,Gender_F,Gender_M,Remarks_Good,Remarks_Great,Remarks_Nice
0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0


In [29]:
# Swap the original text columns for their one-hot columns: remove the
# categorical columns, then place the encoded frame alongside column-wise.
df_sklearn_encoded_2 = pd.concat(
    [one_data_2.drop(columns=categorical_columns), one_hot_df_2],
    axis='columns',
)
df_sklearn_encoded_2

Unnamed: 0,Employee id,Gender_F,Gender_M,Remarks_Good,Remarks_Great,Remarks_Nice
0,10,0.0,1.0,1.0,0.0,0.0
1,20,1.0,0.0,0.0,0.0,1.0
2,15,1.0,0.0,1.0,0.0,0.0
3,25,0.0,1.0,0.0,1.0,0.0
4,30,1.0,0.0,0.0,0.0,1.0


🧠 Summary Table<br>
| Step | Tool Used | Function | Output Type | Keeps Column Names | Drop First? |
|------|------------|-----------|---------------|--------------------|--------------|
| 1 | Pandas | `pd.get_dummies()` | DataFrame | ✅ Yes | ✅ Optional (`drop_first=True`) |
| 2 | Scikit-Learn | `OneHotEncoder()` | NumPy array (with `sparse_output=False`) | ❌ No (need `get_feature_names_out`) | ✅ Optional (`drop='first'`) |