In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Source CSV for the ordinal-encoding demo (CampusX "100 days of machine learning", day 26).
CUSTOMER_CSV_URL = "https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day26-ordinal-encoding/customer.csv"
df = pd.read_csv(CUSTOMER_CSV_URL)

In [3]:
# Preview the first 10 rows of the freshly loaded customer data.
df.head(10)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No
5,31,Female,Average,School,Yes
6,18,Male,Good,School,No
7,60,Female,Poor,School,Yes
8,65,Female,Average,UG,No
9,74,Male,Good,UG,Yes


## In this notebook we encode ordinal columns, so we keep only the ordinal features (and the target column)

In [4]:
# Keep only the ordinal features and the target by dropping the numeric `age`
# and nominal `gender` columns.
# Reassigning the result (instead of inplace=True) together with errors="ignore"
# makes this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
df = df.drop(columns=["age", "gender"], errors="ignore")

In [5]:
# Preview the reduced frame; head() avoids rendering all 50 rows in the output.
df.head(10)

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


In [7]:
# Summary of the remaining columns; they are all non-numeric, so describe()
# reports count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,review,education,purchased
count,50,50,50
unique,3,3,2
top,Poor,PG,No
freq,18,18,26


In [8]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target before splitting.
features = df.drop(columns="purchased")
target = df["purchased"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

In [10]:
# (rows, columns) of the training features: 70% of the 50 rows, 2 feature columns.
x_train.shape

(35, 2)

## If the target column is categorical, we use label encoding for it; for the ordinal feature columns we use ordinal encoding

In [11]:
from sklearn.preprocessing import OrdinalEncoder


### When creating an `OrdinalEncoder` object we pass `categories` as an argument: a 2D list in which each inner list gives the category order (lowest to highest) for the corresponding column

In [12]:
# Category order per column, lowest rank first. The inner lists must follow the
# column order of the frame the encoder is fitted on (review, then education).
review_levels = ['Poor', 'Average', 'Good']
education_levels = ['School', 'UG', 'PG']
oe = OrdinalEncoder(categories=[review_levels, education_levels])

In [13]:
# Fit the encoder on the training features only and encode them in the same
# step, then reuse the fitted encoder to encode the test features.
x_train_transformed = oe.fit_transform(x_train)
x_test_transformed = oe.transform(x_test)

### Convert the resulting NumPy arrays into pandas DataFrames


In [14]:
# Wrap each encoded array back into a DataFrame labelled with the column names
# of the frame it was produced from.
# NOTE(review): the original row index of x_train / x_test is not carried over
# (the new frames get a fresh 0..n-1 index); pass index=... if label-based
# alignment with y_train / y_test is needed later.
x_train_transformed, x_test_transformed = (
    pd.DataFrame(encoded, columns=source.columns)
    for encoded, source in (
        (x_train_transformed, x_train),
        (x_test_transformed, x_test),
    )
)

In [15]:
# Inspect the encoded training features; head() avoids rendering all 35 rows.
x_train_transformed.head(10)

Unnamed: 0,review,education
0,0.0,0.0
1,0.0,2.0
2,0.0,2.0
3,2.0,1.0
4,1.0,1.0
5,0.0,1.0
6,1.0,1.0
7,1.0,1.0
8,0.0,1.0
9,2.0,2.0


In [20]:
# Distinct education levels present in the data; each of them has to appear in
# the `categories` list given to the OrdinalEncoder.
df["education"].unique()

array(['School', 'UG', 'PG'], dtype=object)

### Now we label-encode the target column

In [21]:
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the training target and encode it in one step,
# then apply the same fitted mapping to the test target.
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

In [22]:
# Classes learned by the encoder; a class's position in this array is its
# integer code ('No' -> 0, 'Yes' -> 1).
le.classes_

array(['No', 'Yes'], dtype=object)

In [24]:
# Integer-encoded training target (0 = 'No', 1 = 'Yes', following le.classes_).
y_train_transformed

array([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0])

In [32]:
# Frequency of each review level across the full dataset (train and test rows together).
df["review"].value_counts()

Poor       18
Good       18
Average    14
Name: review, dtype: int64