# Replacing categories with ordinal numbers

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("processing_data.csv")

In [3]:
X_train,X_test,Y_train,Y_test = train_test_split(data.drop("target" , axis=1),data["target"],test_size=0.2,random_state=24)

In [4]:
ordinal_mapping = {k: i for i, k in enumerate(X_train["A7"].unique(), 0)}

ordinal_mapping

{'h': 0,
 'v': 1,
 'ff': 2,
 'bb': 3,
 'dd': 4,
 'z': 5,
 'n': 6,
 'Missing': 7,
 'j': 8,
 'o': 9}

In [5]:
X_train["A7"] = X_train["A7"].map(ordinal_mapping)

X_test["A7"] = X_test["A7"].map(ordinal_mapping)

X_train["A7"]

255    0
37     0
562    0
411    1
589    1
      ..
145    0
401    1
343    3
192    1
418    1
Name: A7, Length: 552, dtype: int64

In [10]:
#using sklearn
from sklearn.preprocessing import OrdinalEncoder

from sklearn.compose import ColumnTransformer

enc = OrdinalEncoder()


vars_categorical = X_train.select_dtypes(include="O").columns.to_list()

vars_remainders = X_train.select_dtypes(exclude="O").columns.to_list()

vars_remainders

['A2', 'A3', 'A7', 'A8', 'A11', 'A14', 'A15']

In [18]:
ct  = ColumnTransformer([("encoder" , enc , vars_categorical)], remainder="passthrough")

ct.fit(X_train)

X_train_enc = ct.transform(X_train)
X_test_enc =ct.transform(X_test)

In [23]:
X_train_enc = pd.DataFrame(X_train_enc, columns=vars_categorical+vars_remainders)

X_test_enc = pd.DataFrame(X_test_enc, columns=vars_categorical+vars_remainders)

X_train_enc

Unnamed: 0,A1,A4,A5,A6,A9,A10,A12,A13,A2,A3,A7,A8,A11,A14,A15
0,2.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,18.17,10.250,0.0,1.085,0.0,320.0,13.0
1,1.0,2.0,1.0,14.0,1.0,1.0,1.0,0.0,23.00,11.750,0.0,0.500,2.0,300.0,551.0
2,2.0,2.0,1.0,2.0,1.0,0.0,0.0,0.0,23.33,1.500,0.0,1.415,0.0,422.0,200.0
3,2.0,2.0,1.0,2.0,0.0,1.0,0.0,0.0,25.17,3.000,1.0,1.250,1.0,0.0,22.0
4,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,25.33,0.580,1.0,0.290,7.0,96.0,5124.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,32.83,2.500,0.0,2.750,6.0,160.0,2072.0
548,2.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,28.92,0.375,1.0,0.290,0.0,220.0,140.0
549,2.0,2.0,1.0,7.0,0.0,0.0,0.0,0.0,33.75,2.750,3.0,0.000,0.0,180.0,0.0
550,2.0,2.0,1.0,14.0,1.0,0.0,0.0,0.0,41.75,0.960,1.0,2.500,0.0,510.0,600.0


In [24]:
#Uisng feature-engine
from feature_engine.encoding import OrdinalEncoder


In [26]:
#Let’s set up the encoder so that it replaces categories with arbitrary integers in the categorical variables

enc = OrdinalEncoder(encoding_method="arbitrary", variables=vars_categorical)

enc

In [30]:
enc.fit(X_train)

print(enc.encoder_dict_)

X_train_enc = enc.transform(X_train)

X_test_enc = enc.transform(X_test)

X_train_enc

{'A1': {'b': 0, 'a': 1, 'Missing': 2}, 'A4': {'u': 0, 'y': 1, 'Missing': 2, 'l': 3}, 'A5': {'g': 0, 'p': 1, 'Missing': 2, 'gg': 3}, 'A6': {'c': 0, 'x': 1, 'k': 2, 'ff': 3, 'aa': 4, 'i': 5, 'w': 6, 'cc': 7, 'd': 8, 'q': 9, 'e': 10, 'j': 11, 'm': 12, 'r': 13, 'Missing': 14}, 'A9': {'f': 0, 't': 1}, 'A10': {'f': 0, 't': 1}, 'A12': {'f': 0, 't': 1}, 'A13': {'g': 0, 's': 1, 'p': 2}}


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
255,0,18.17,10.250,0,0,0,0,1.085,0,0,0,0,0,320.0,13
37,1,23.00,11.750,0,0,1,0,0.500,1,1,2,1,0,300.0,551
562,0,23.33,1.500,0,0,0,0,1.415,1,0,0,0,0,422.0,200
411,0,25.17,3.000,0,0,0,1,1.250,0,1,1,0,0,0.0,22
589,0,25.33,0.580,0,0,0,1,0.290,1,1,7,1,0,96.0,5124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,32.83,2.500,0,0,7,0,2.750,1,1,6,0,0,160.0,2072
401,0,28.92,0.375,0,0,0,1,0.290,0,0,0,0,0,220.0,140
343,0,33.75,2.750,0,0,5,3,0.000,0,0,0,0,0,180.0,0
192,0,41.75,0.960,0,0,1,1,2.500,1,0,0,0,0,510.0,600
