# Import Libraries

In [1]:
import pandas as pd              # for Pandas
import numpy as np               # for NumpPy

In [2]:
# Load the dataset from the Excel workbook and preview its first two rows
df = pd.read_excel("EncodingData.xlsx")
df.head(2)

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender
0,1,UK,London,High School,A,Data Scientist,85000,1,Male
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male


In [3]:
# Print the column names, non-null counts and dtypes of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         15 non-null     int64 
 1   Country    15 non-null     object
 2   City       15 non-null     object
 3   Education  15 non-null     object
 4   Grade      15 non-null     object
 5   Job Title  15 non-null     object
 6   Salary     15 non-null     int64 
 7   Experince  15 non-null     int64 
 8   Gender     15 non-null     object
dtypes: int64(3), object(6)
memory usage: 1.2+ KB


# Unique Values

In [4]:
# To get detailed information about the data set;

def get_unique_values(df, max_unique=7):
    """Summarise the cardinality of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    max_unique : int, default 7
        Columns with at most this many distinct values get their values
        listed; columns with more get the placeholder ``"-"`` instead.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the column name, the number of
        unique values, the unique values (or ``"-"``) and the column dtype.

    Notes
    -----
    The count comes from ``Series.nunique()``, which ignores missing values
    by default, while the listed values come from ``Series.unique()``, which
    keeps them. A column containing NaN can therefore list one more value
    than its reported count.
    """
    summary_rows = []

    for col in df.columns:
        # Look the column up once and reuse it for every statistic
        series = df.loc[:, col]
        n_unique = series.nunique()

        # Only low-cardinality columns get their values spelled out
        shown_values = series.unique() if n_unique <= max_unique else "-"

        summary_rows.append([col, n_unique, shown_values, series.dtype])

    # The padded ' Unique Values ' label is kept as-is so existing callers
    # that index the result by column name keep working.
    return pd.DataFrame(
        summary_rows,
        columns=['Column Name', 'Number of Unique Values', ' Unique Values ', 'Data Type'],
    )

# Display the per-column summary for the loaded dataset
get_unique_values(df)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Data Type
0,ID,15,-,int64
1,Country,3,"[UK, USA, Germany]",object
2,City,7,"[London, New York, Glasgow, Berlin, Bonn, Live...",object
3,Education,4,"[High School, Bachelor, Master, PhD]",object
4,Grade,3,"[A, B, C]",object
5,Job Title,3,"[Data Scientist, Data Analyst, Python Developer]",object
6,Salary,13,-,int64
7,Experince,7,"[1, 2, 5, 9, 8, 15, 4]",int64
8,Gender,2,"[Male, Female]",object


# Label Encoding


**Label Encoding is typically used in the following scenarios:**

1. When there is a limited number of categories and there is no ranking among them (e.g., days of the week, colours, ...).

2. Binary Classification: When the categorical variable has only two classes, label encoding can be a straightforward method to convert these classes into numerical values (e.g., "Yes"/"No" or "Male"/"Female").


**Caution**

**High Cardinality Categorical Variables:** Label encoding might not be suitable for features with many unique categories as it can introduce unintended ordinal relationships.

In [5]:
# Import "LabelEncoder" from sklearn
from sklearn.preprocessing import LabelEncoder

In [6]:
# Create the LabelEncoder instance that is fitted in the following cells
label_encoder = LabelEncoder()

In [7]:
# Fit the encoder on "Country" and store the resulting integer codes in a new column
df["encoded_country"] = label_encoder.fit_transform(df["Country"])
df.head(5) 

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender,encoded_country
0,1,UK,London,High School,A,Data Scientist,85000,1,Male,1
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male,1
2,3,USA,New York,Master,A,Data Scientist,75000,5,Female,2
3,4,USA,New York,PhD,B,Data Analyst,68000,9,Female,2
4,5,UK,Glasgow,PhD,C,Data Analyst,65000,5,Female,1


In [8]:
# Let's apply Label Encoding on City.

# Use a dedicated encoder for "City": calling fit_transform again on the shared
# `label_encoder` would overwrite the classes it learned for "Country", so the
# country codes could no longer be mapped back with inverse_transform.
city_encoder = LabelEncoder()

# Fit and transform the data
df["encoded_city"] = city_encoder.fit_transform(df["City"])
df.head(15) 

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender,encoded_country,encoded_city
0,1,UK,London,High School,A,Data Scientist,85000,1,Male,1,4
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male,1,4
2,3,USA,New York,Master,A,Data Scientist,75000,5,Female,2,5
3,4,USA,New York,PhD,B,Data Analyst,68000,9,Female,2,5
4,5,UK,Glasgow,PhD,C,Data Analyst,65000,5,Female,1,2
5,6,UK,Glasgow,Master,B,Data Analyst,89000,2,Female,1,2
6,7,Germany,Berlin,Master,B,Data Scientist,78000,5,Male,0,0
7,8,Germany,Bonn,Master,C,Data Scientist,98000,8,Male,0,1
8,9,USA,New York,Bachelor,C,Python Developer,79000,9,Male,2,5
9,10,USA,New York,Bachelor,C,Python Developer,98000,15,Male,2,5


**NOTE:** When applying Label Encoding, the categories are sorted alphabetically and the ones that come first receive the lower numbers (e.g., Germany = 0, UK = 1, USA = 2).

# One Hot Encoding

**One-Hot Encoding offers several advantages over Label Encoding:**

**1. Prevents Ordinal Relationships:**

**One-Hot Encoding:** Represents each category as an independent binary column, preventing the model from inferring any ordinal relationship between categories.
    
**Label Encoding:** Assigns numerical values to categories, which can incorrectly imply an ordinal relationship (e.g., 'Red' = 1, 'Green' = 2, 'Blue' = 3).


**2. Better Performance with Certain Models:**

**One-Hot Encoding:** Works well with linear models and distance-based algorithms (e.g., k-Nearest Neighbors) because it treats all categories as equally distant.

**Label Encoding:** Can degrade performance in these models due to the implied ordinal relationships.


**3. Model Stability:**

**One-Hot Encoding:** More stable when new categories are introduced, as each category is treated independently.

**Label Encoding:** New categories can alter the numerical relationships, affecting model performance.


**4. Interpretability:**

**One-Hot Encoding:** Easier to interpret model coefficients, as each category is a separate feature.

**Label Encoding:** Harder to interpret, as numerical values may not directly reflect category differences.



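**NOTE: If the model encounters a category in the test data that was not present in the training data, "Label Encoding" raises an error, whereas "One Hot Encoding" can still work (e.g., scikit-learn's `OneHotEncoder(handle_unknown="ignore")` encodes the unseen category as all zeros).**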





**With One-Hot Encoding, the number of new columns equals the number of categories.**

In [9]:
# One-hot encode "Job Title": one indicator column per category, named "Job_Title_<category>"
Encoded_Job_Title = pd.get_dummies(
    df["Job Title"],
    prefix="Job_Title",
)

# Append the new encoded columns to the right-hand side of df
df = pd.concat([df, Encoded_Job_Title], axis="columns")
df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender,encoded_country,encoded_city,Job_Title_Data Analyst,Job_Title_Data Scientist,Job_Title_Python Developer
0,1,UK,London,High School,A,Data Scientist,85000,1,Male,1,4,False,True,False
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male,1,4,False,True,False
2,3,USA,New York,Master,A,Data Scientist,75000,5,Female,2,5,False,True,False
3,4,USA,New York,PhD,B,Data Analyst,68000,9,Female,2,5,True,False,False
4,5,UK,Glasgow,PhD,C,Data Analyst,65000,5,Female,1,2,True,False,False


# Dummy Encoding


If a column has **n** categories, we create **n-1** new columns.

In [10]:
# We generally drop the first dummy column to reduce the number of features and the cost of the model;
# this also protects us from the "DUMMY VARIABLE TRAP".

# Read the data again so the next cells start from the original columns
df = pd.read_excel("EncodingData.xlsx")
df.head(2)

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender
0,1,UK,London,High School,A,Data Scientist,85000,1,Male
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male


In [11]:
# Dummy-encode "Job Title": drop_first=True leaves out the first category's
# indicator column, so n categories produce n-1 new columns.
Encoded_Job_Title = pd.get_dummies(
    df["Job Title"],
    prefix="Job_Title",
    drop_first=True,
)

# Append the new encoded columns to the right-hand side of df
df = pd.concat([df, Encoded_Job_Title], axis="columns")
df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Job Title,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer
0,1,UK,London,High School,A,Data Scientist,85000,1,Male,True,False
1,2,UK,London,Bachelor,A,Data Scientist,82000,2,Male,True,False
2,3,USA,New York,Master,A,Data Scientist,75000,5,Female,True,False
3,4,USA,New York,PhD,B,Data Analyst,68000,9,Female,False,False
4,5,UK,Glasgow,PhD,C,Data Analyst,65000,5,Female,False,False


In [12]:
# Also, we can drop the original column (Job Title) from df.
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=["Job Title"], errors="ignore")
df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer
0,1,UK,London,High School,A,85000,1,Male,True,False
1,2,UK,London,Bachelor,A,82000,2,Male,True,False
2,3,USA,New York,Master,A,75000,5,Female,True,False
3,4,USA,New York,PhD,B,68000,9,Female,False,False
4,5,UK,Glasgow,PhD,C,65000,5,Female,False,False


# Ordinal Encoding

**Ordinal Encoding should be used when the categorical data has a clear, meaningful order or ranking**. Here are the scenarios where Ordinal Encoding is appropriate:


1. **Inherent Order:** When the categorical variable represents a set of categories with a logical order. For example:

   * Ratings (e.g., 'Poor', 'Average', 'Good', 'Excellent')
   * Education levels (e.g., 'High School', 'Bachelor's', 'Master's', 'PhD')
   * Sizes (e.g., 'Small', 'Medium', 'Large', 'Extra Large')
   

2. **Ordinal Relationships:** When the model needs to capture the ordinal relationship between categories. This is useful when the relative positioning of the categories carries important information for the model.


3. **Ordered Factor Levels in Statistical Models:** When using statistical models that benefit from ordinal factor levels, such as ordinal logistic regression.


4. **Reduced Dimensionality:** When you want to reduce the dimensionality of categorical variables while preserving their order, as opposed to One-Hot Encoding which increases the dimensionality.

In [13]:
# Ordinal-encode the "Education" column

# Import related library
from sklearn.preprocessing import OrdinalEncoder

# The categories are listed explicitly in ascending order; the encoder assigns
# codes following this order (High School -> 0 ... PhD -> 3).
encoder = OrdinalEncoder(
    categories=[["High School", "Bachelor", "Master", "PhD"]],
)

# The encoder expects a 2-D input, hence the double brackets around the column name
df["Education_Encoded"] = encoder.fit_transform(df[["Education"]])

df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer,Education_Encoded
0,1,UK,London,High School,A,85000,1,Male,True,False,0.0
1,2,UK,London,Bachelor,A,82000,2,Male,True,False,1.0
2,3,USA,New York,Master,A,75000,5,Female,True,False,2.0
3,4,USA,New York,PhD,B,68000,9,Female,False,False,3.0
4,5,UK,Glasgow,PhD,C,65000,5,Female,False,False,3.0


In [14]:
# Ordinal-encode the "Grade" column

# Import related library
from sklearn.preprocessing import OrdinalEncoder

# The categories are listed explicitly in ascending order. They are written in
# reverse alphabetical order because A is the highest grade and C the lowest,
# so C -> 0, B -> 1, A -> 2.
encoder = OrdinalEncoder(
    categories=[["C", "B", "A"]],
)

# The encoder expects a 2-D input, hence the double brackets around the column name
df["Grade_Encoded"] = encoder.fit_transform(df[["Grade"]])

df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer,Education_Encoded,Grade_Encoded
0,1,UK,London,High School,A,85000,1,Male,True,False,0.0,2.0
1,2,UK,London,Bachelor,A,82000,2,Male,True,False,1.0,2.0
2,3,USA,New York,Master,A,75000,5,Female,True,False,2.0,2.0
3,4,USA,New York,PhD,B,68000,9,Female,False,False,3.0,1.0
4,5,UK,Glasgow,PhD,C,65000,5,Female,False,False,3.0,0.0


In [15]:
# The ordinal encoder produced float codes; convert the Grade codes to integers
df['Grade_Encoded'] = df['Grade_Encoded'].astype(int)
df.head(5)

Unnamed: 0,ID,Country,City,Education,Grade,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer,Education_Encoded,Grade_Encoded
0,1,UK,London,High School,A,85000,1,Male,True,False,0.0,2
1,2,UK,London,Bachelor,A,82000,2,Male,True,False,1.0,2
2,3,USA,New York,Master,A,75000,5,Female,True,False,2.0,2
3,4,USA,New York,PhD,B,68000,9,Female,False,False,3.0,1
4,5,UK,Glasgow,PhD,C,65000,5,Female,False,False,3.0,0


In [16]:
# Also, we can drop the original column (Grade) from df.
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=["Grade"], errors="ignore")
df.head(5)

Unnamed: 0,ID,Country,City,Education,Salary,Experince,Gender,Job_Title_Data Scientist,Job_Title_Python Developer,Education_Encoded,Grade_Encoded
0,1,UK,London,High School,85000,1,Male,True,False,0.0,2
1,2,UK,London,Bachelor,82000,2,Male,True,False,1.0,2
2,3,USA,New York,Master,75000,5,Female,True,False,2.0,2
3,4,USA,New York,PhD,68000,9,Female,False,False,3.0,1
4,5,UK,Glasgow,PhD,65000,5,Female,False,False,3.0,0
