### Import Libraries

In [1]:
import pandas as pd

### Load Data Set

In [2]:
# Load the supermarket sales data from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='dataset/supermarket_sales.csv')

In [3]:
# Peek at the first row to check the column names and raw values.
df.head(n=1)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1


# `01. Label Encoding`
- The first technique we use is label encoding, which assigns a unique integer to each category.
- **Example:** For the Branch column (with values like 'A', 'B', 'C'), you can encode them as 0, 1, 2.

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Create the encoder instance; it is fitted on the Branch column in a later cell.
label_encoder = LabelEncoder()

In [6]:
# Count how many rows fall into each Branch category before encoding.
df['Branch'].value_counts()

Branch
A    340
B    332
C    328
Name: count, dtype: int64

In [7]:
# Fit the encoder on Branch and overwrite the column with the resulting integer codes.
branch_codes = label_encoder.fit_transform(df['Branch'])
df['Branch'] = branch_codes

In [8]:
# Inspect the first three rows to confirm Branch now holds integer codes.
df.head(n=3)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,0,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,2,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,0,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4


# `02. One-Hot Encoding:`
- This method creates a new binary column for each category.
- **Example:** For the Payment column (with values 'Ewallet', 'Cash', 'Credit card'), you'll get three new columns: `Payment_Cash`, `Payment_Credit card`, and `Payment_Ewallet`.
- We are using `pd.get_dummies()`

In [9]:
# get_dummies replaces the original Payment column with one boolean column per category.
# Passing drop_first=True would additionally drop the first category's dummy column
# (k-1 columns for k categories); it is not what removes the original column.
df = pd.get_dummies(df, columns=['Payment'])

In [10]:
# Check that Payment was replaced by the Payment_* indicator columns.
df.head(n=2)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,cogs,gross margin percentage,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,750-67-8428,0,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,522.83,4.761905,26.1415,9.1,False,False,True
1,226-31-3081,2,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,76.4,4.761905,3.82,9.6,True,False,False


## `One Hot Encoding through OneHotEncoder`

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Return a dense array instead of a sparse matrix; drop='first' omits the
# first category's column from the encoded output.
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [13]:
# Look at the current frame before encoding the Product line column.
df.head(n=1)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,cogs,gross margin percentage,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,750-67-8428,0,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,522.83,4.761905,26.1415,9.1,False,False,True


#### Fit and transform Product Line Column

In [14]:
# Double brackets select a one-column DataFrame (2-D input for the encoder);
# learn the categories first, then encode the same data.
product_line_frame = df[['Product line']]
encoded_column = encoder.fit(product_line_frame).transform(product_line_frame)

In [15]:
# Display the encoded array: one row per record, one column per retained category.
encoded_column

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]])

In [16]:
# Wrap the encoded array in a DataFrame with readable column names.
# Reuse df's index so the later column-wise concat lines up row-for-row even
# when df does not carry a default 0..n-1 index (otherwise rows would be
# misaligned and padded with NaN).
encoded_df = pd.DataFrame(
    encoded_column,
    columns=encoder.get_feature_names_out(['Product line']),
    index=df.index,
)

In [17]:
# Check the generated one-hot column names on the first two rows.
encoded_df.head(n=2)

Unnamed: 0,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0


In [18]:
# Swap the original text column for its one-hot columns, placed side by side.
df_without_product_line = df.drop(columns=['Product line'])
df = pd.concat([df_without_product_line, encoded_df], axis='columns')

In [19]:
# Confirm the Product line_* columns were appended to the frame.
df.head(n=1)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Unit price,Quantity,Tax 5%,Total,Date,...,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel
0,750-67-8428,0,Yangon,Member,Female,74.69,7,26.1415,548.9715,1/5/2019,...,26.1415,9.1,False,False,True,0.0,0.0,1.0,0.0,0.0


# `03. Ordinal Encoding`
- Use when there is an inherent order in the categories.
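- **Example:** If you had an Experience column with values like "Low", "Medium", "High", you could encode them as 1, 2, 3 respectively. Below, the same idea is applied to the Customer type column by treating 'Normal' as 1 and 'Member' as 2.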

In [20]:
# List the distinct Customer type values and their row counts before mapping them.
df['Customer type'].value_counts()

Customer type
Member    501
Normal    499
Name: count, dtype: int64

In [21]:
# Explicit rank for each customer type; values absent from the mapping become NaN.
customer_type_rank = {'Normal': 1, 'Member': 2}
df['Customer type'] = df['Customer type'].map(customer_type_rank)

In [22]:
# Verify that Customer type now holds the numeric ranks.
df.head(n=3)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Unit price,Quantity,Tax 5%,Total,Date,...,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel
0,750-67-8428,0,Yangon,2,Female,74.69,7,26.1415,548.9715,1/5/2019,...,26.1415,9.1,False,False,True,0.0,0.0,1.0,0.0,0.0
1,226-31-3081,2,Naypyitaw,1,Female,15.28,5,3.82,80.22,3/8/2019,...,3.82,9.6,True,False,False,0.0,0.0,0.0,0.0,0.0
2,631-41-3108,0,Yangon,1,Male,46.33,7,16.2155,340.5255,3/3/2019,...,16.2155,7.4,False,True,False,0.0,0.0,0.0,1.0,0.0


# `04. Frequency Encoding`
- Replace categories with their frequency count.
- **Example:** For City, the values 'Yangon', 'Naypyitaw', 'Mandalay' can be replaced by their occurrence count in the dataset.

In [23]:
# Count rows per city, then turn the counts into a {city: count} lookup.
city_counts = df['City'].value_counts()
freq_encoding = city_counts.to_dict()

In [24]:
# Display the city -> row-count lookup used for the frequency encoding.
freq_encoding

{'Yangon': 340, 'Mandalay': 332, 'Naypyitaw': 328}

In [25]:
# Replace each city name with its row count from the freq_encoding lookup.
city_frequency = df['City'].map(freq_encoding)
df['City'] = city_frequency

In [26]:
# Confirm City now contains counts instead of names.
df.head(n=2)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Unit price,Quantity,Tax 5%,Total,Date,...,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel
0,750-67-8428,0,340,2,Female,74.69,7,26.1415,548.9715,1/5/2019,...,26.1415,9.1,False,False,True,0.0,0.0,1.0,0.0,0.0
1,226-31-3081,2,328,1,Female,15.28,5,3.82,80.22,3/8/2019,...,3.82,9.6,True,False,False,0.0,0.0,0.0,0.0,0.0


# `05. Target Encoding`
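- Replace each category of a feature with the mean of the target variable (e.g., Rating) for that category. In a real modeling workflow, compute these means on the training split only, so that information from the target does not leak into validation or test data.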
- **Example:** Replace the Gender column ('Male', 'Female') with the average Rating for each gender.

In [27]:
# Average Rating per Gender; the result is indexed by the gender labels.
target_mean = df['Rating'].groupby(df['Gender']).mean()

In [28]:
# Display the per-gender mean Rating that will replace the Gender labels.
target_mean

Gender
Female    6.964471
Male      6.980962
Name: Rating, dtype: float64

In [29]:
# Look up each row's gender in target_mean and store the mean Rating in its place.
gender_rating_mean = df['Gender'].map(target_mean)
df['Gender'] = gender_rating_mean

In [30]:
# Check the first four rows to see both gender means in the Gender column.
df.head(n=4)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Unit price,Quantity,Tax 5%,Total,Date,...,gross income,Rating,Payment_Cash,Payment_Credit card,Payment_Ewallet,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel
0,750-67-8428,0,340,2,6.964471,74.69,7,26.1415,548.9715,1/5/2019,...,26.1415,9.1,False,False,True,0.0,0.0,1.0,0.0,0.0
1,226-31-3081,2,328,1,6.964471,15.28,5,3.82,80.22,3/8/2019,...,3.82,9.6,True,False,False,0.0,0.0,0.0,0.0,0.0
2,631-41-3108,0,340,1,6.980962,46.33,7,16.2155,340.5255,3/3/2019,...,16.2155,7.4,False,True,False,0.0,0.0,0.0,1.0,0.0
3,123-19-1176,0,340,2,6.980962,58.22,8,23.288,489.048,1/27/2019,...,23.288,8.4,False,False,True,0.0,0.0,1.0,0.0,0.0
