<a href="https://colab.research.google.com/github/challabala/AI-ML/blob/main/Label_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Label Encoding**


Label Encoding is a technique used to convert categorical (text) data into numerical data.

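Most machine learning models cannot work with text directly, so we convert each category into an integer. For example:

`["Male", "Female", "Female", "Male"]`

becomes

`[1, 0, 0, 1]`

(scikit-learn's `LabelEncoder` assigns the codes in sorted order of the labels, so `"Female"` → 0 and `"Male"` → 1.)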


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Toy dataset: a single categorical column in which "Mumbai" appears twice.
data = {"City": ["Delhi", "Mumbai", "Chennai", "Kolkata", "Mumbai"]}
df = pd.DataFrame(data)
print("Original Data:\n", df)

# Learn the distinct city labels and replace every row with its integer code
# in one step; the codes are stored next to the original text column.
le = LabelEncoder()
df["City_Encoded"] = le.fit_transform(df["City"])

print("\nLabel Encoded Data:\n", df)

# classes_ lists the learned labels; a label's position in it is its code.
print("\nClass Mapping:")
for code in range(len(le.classes_)):
    print(le.classes_[code], "->", code)


Original Data:
       City
0    Delhi
1   Mumbai
2  Chennai
3  Kolkata
4   Mumbai

Label Encoded Data:
       City  City_Encoded
0    Delhi             1
1   Mumbai             3
2  Chennai             0
3  Kolkata             2
4   Mumbai             3

Class Mapping:
Chennai -> 0
Delhi -> 1
Kolkata -> 2
Mumbai -> 3


In [2]:
# Step-by-Step Explanation
import pandas as pd

# Five rows, three text columns: Name, City ("Delhi" occurs twice) and a
# Yes/No Purchased flag.
data = dict(
    Name=["John", "Emma", "Ravi", "Sara", "Arjun"],
    City=["Delhi", "Mumbai", "Chennai", "Delhi", "Kolkata"],
    Purchased=["Yes", "No", "Yes", "No", "Yes"],
)

df = pd.DataFrame(data=data)
print(df)

    Name     City Purchased
0   John    Delhi       Yes
1   Emma   Mumbai        No
2   Ravi  Chennai       Yes
3   Sara    Delhi        No
4  Arjun  Kolkata       Yes


In [4]:
# Apply Label Encoding to the two categorical columns:
#   City      -> City_encoded
#   Purchased -> Purchased_encoded

from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each keeps its own label-to-code mapping.
le_city = LabelEncoder()
le_purchase = LabelEncoder()

# (source column, new column, encoder) triples, processed in this order.
for source_col, target_col, fitted in (
    ("City", "City_encoded", le_city),
    ("Purchased", "Purchased_encoded", le_purchase),
):
    df[target_col] = fitted.fit_transform(df[source_col])

In [5]:
# See the new encoded dataset: the original columns plus the City_encoded and Purchased_encoded columns added to df.

print(df)


    Name     City Purchased  City_encoded  Purchased_encoded
0   John    Delhi       Yes             1                  1
1   Emma   Mumbai        No             3                  0
2   Ravi  Chennai       Yes             0                  1
3   Sara    Delhi        No             1                  0
4  Arjun  Kolkata       Yes             2                  1


In [6]:
# Check Class Mapping
# Each encoder's classes_ is printed as a plain list; a label's index in that
# list is the integer code it was given.
for title, fitted in (
    ("City Mapping:", le_city),
    ("Purchased Mapping:", le_purchase),
):
    print(title, list(fitted.classes_))


City Mapping: ['Chennai', 'Delhi', 'Kolkata', 'Mumbai']
Purchased Mapping: ['No', 'Yes']


In [7]:
# Using pandas.get_dummies()

import pandas as pd

# Sample data: an ID column plus one categorical column with three distinct colors.
df = pd.DataFrame(dict(
    ID=[1, 2, 3, 4, 5],
    Color=['Red', 'Blue', 'Green', 'Red', 'Blue'],
))

print("Original DataFrame:", df, sep="\n")

# One-hot encoding: 'Color' is replaced by one indicator column per distinct
# value; 'ID' is left as-is because only 'Color' is listed in `columns`.
df_encoded = pd.get_dummies(data=df, columns=['Color'])

print("\nAfter One-Hot Encoding:", df_encoded, sep="\n")


Original DataFrame:
   ID  Color
0   1    Red
1   2   Blue
2   3  Green
3   4    Red
4   5   Blue

After One-Hot Encoding:
   ID  Color_Blue  Color_Green  Color_Red
0   1       False        False       True
1   2        True        False      False
2   3       False         True      False
3   4       False        False       True
4   5        True        False      False


In [8]:
# Python Example (Ordinal Encoding)
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# A single ordered category; 'Small' appears twice.
df = pd.DataFrame({'Size': ['Small', 'Medium', 'Large', 'Small']})

# The explicit category list fixes the order, so the codes follow
# Small < Medium < Large rather than alphabetical order.
encoder = OrdinalEncoder(
    categories=[
        ['Small', 'Medium', 'Large'],
    ],
)

# The encoder takes a 2-D input, hence the double brackets around 'Size'.
size_frame = df[['Size']]
df['Size_encoded'] = encoder.fit_transform(size_frame)
print(df)


     Size  Size_encoded
0   Small           0.0
1  Medium           1.0
2   Large           2.0
3   Small           0.0


In [9]:
# Target Encoding
!pip install category_encoders

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({
    'City': ['Delhi', 'Mumbai', 'Delhi', 'Kolkata'],
    'Price': [1000000, 2000000, 1100000, 900000]
})

# Target Encoder
encoder = ce.TargetEncoder(cols=['City'])
df['City_encoded'] = encoder.fit_transform(df['City'], df['Price'])

print(df)

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0
      City    Price  City_encoded
0    Delhi  1000000  1.221630e+06
1   Mumbai  2000000  1.347581e+06
2    Delhi  1100000  1.221630e+06
3  Kolkata   900000  1.204462e+06


| Encoding Method      | Best For                             | Creates New Columns? | Keeps Category Order? | Risk                                 |
| -------------------- | ------------------------------------ | -------------------- | --------------------- | ------------------------------------ |
| **Label Encoding**   | Small number of unordered categories | No                   | ❌ No order            | Model may think numbers have meaning |
| **One-Hot Encoding** | Unordered categories, small size     | ✔ Yes                | ❌ No                  | High dimensionality                  |
| **Ordinal Encoding** | Ordered categories                   | No                   | ✔ Yes                 | Wrong if order is incorrect          |
| **Target Encoding**  | Large category sets                  | No                   | ⚠ Depends on target   | Overfitting, data leakage            |
