In [35]:
import pandas as pd
import category_encoders as ce

In [36]:
data = {
    "TransactionID": [1, 2, 3, 4, 5],
    "ClientID": [231, 765, 453, 231, 892],
    "Profession": ["Self-employed", "students", "Horeca", "self-employed", "finance"],
    "Bank_dep": ["009", "005", "007", "009", "003"],
    "Risk": ["High", "high", "medium", "high", "low"],
    "Number of credits": [2, 3, 5, 2, 3],
    "Revenue": [30200, 12700, 89400, 30200, 740000]
}

df = pd.DataFrame(data)
df.set_index("TransactionID", inplace = True)
df

Unnamed: 0_level_0,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,231,Self-employed,9,High,2,30200
2,765,students,5,high,3,12700
3,453,Horeca,7,medium,5,89400
4,231,self-employed,9,high,2,30200
5,892,finance,3,low,3,740000


In [37]:
# Categorical columns
categorical_cols = ["Profession", "Bank_dep", "Risk"]

In [38]:
# Get a new clean dataframe
df_obj = df.select_dtypes(include=['object']).copy()
df_obj

Unnamed: 0_level_0,Profession,Bank_dep,Risk
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Self-employed,9,High
2,students,5,high
3,Horeca,7,medium
4,self-employed,9,high
5,finance,3,low


In [39]:
#1. Dummy Encoder
df_dummies = df_obj.copy()
df_dummies_only = pd.get_dummies(df_dummies)
df_dummies = pd.merge(df_dummies, df_dummies_only, on='TransactionID')
df_dummies

Unnamed: 0_level_0,Profession,Bank_dep,Risk,Profession_Horeca,Profession_Self-employed,Profession_finance,Profession_self-employed,Profession_students,Bank_dep_003,Bank_dep_005,Bank_dep_007,Bank_dep_009,Risk_High,Risk_high,Risk_low,Risk_medium
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Self-employed,9,High,0,1,0,0,0,0,0,0,1,1,0,0,0
2,students,5,high,0,0,0,0,1,0,1,0,0,0,1,0,0
3,Horeca,7,medium,1,0,0,0,0,0,0,1,0,0,0,0,1
4,self-employed,9,high,0,0,0,1,0,0,0,0,1,0,1,0,0
5,finance,3,low,0,0,1,0,0,1,0,0,0,0,0,1,0


In [40]:
#2. Label Encoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

df_label_encoder = df_obj.copy()

for col in categorical_cols:
    df_label_encoder[col + "_encode"]=encoder.fit_transform(df_label_encoder[col])
    
# df_label_encoder.drop(columns=categorical_cols, inplace = True)
df_label_encoder

Unnamed: 0_level_0,Profession,Bank_dep,Risk,Profession_encode,Bank_dep_encode,Risk_encode
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Self-employed,9,High,1,3,0
2,students,5,high,4,1,1
3,Horeca,7,medium,0,2,3
4,self-employed,9,high,3,3,1
5,finance,3,low,2,0,2


In [45]:
#3. Backward Difference Coding

df_backward_difference_coding = df_obj.copy()

encoder = ce.BackwardDifferenceEncoder(cols=categorical_cols)

df_backward_difference_coding_only = encoder.fit_transform(df_backward_difference_coding)

df_backward_difference_coding = pd.merge(df_backward_difference_coding,df_backward_difference_coding_only , on="TransactionID")
df_backward_difference_coding

Unnamed: 0_level_0,Profession,Bank_dep,Risk,intercept,Profession_0,Profession_1,Profession_2,Profession_3,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Risk_2
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Self-employed,9,High,1,-0.8,-0.6,-0.4,-0.2,-0.75,-0.5,-0.25,-0.75,-0.5,-0.25
2,students,5,high,1,0.2,-0.6,-0.4,-0.2,0.25,-0.5,-0.25,0.25,-0.5,-0.25
3,Horeca,7,medium,1,0.2,0.4,-0.4,-0.2,0.25,0.5,-0.25,0.25,0.5,-0.25
4,self-employed,9,high,1,0.2,0.4,0.6,-0.2,-0.75,-0.5,-0.25,0.25,-0.5,-0.25
5,finance,3,low,1,0.2,0.4,0.6,0.8,0.25,0.5,0.75,0.25,0.5,0.75


In [48]:
#4. BaseN

df_basen = df_obj.copy()

encoder = ce.BaseNEncoder(cols=categorical_cols)

df_basen_only = encoder.fit_transform(df_basen)
df_basen = pd.merge(df_basen, df_basen_only, on="TransactionID")
df_basen

Unnamed: 0_level_0,Profession,Bank_dep,Risk,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Risk_2
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Self-employed,9,High,0,0,1,0,0,1,0,0,1
2,students,5,high,0,1,0,0,1,0,0,1,0
3,Horeca,7,medium,0,1,1,0,1,1,0,1,1
4,self-employed,9,high,1,0,0,0,0,1,0,1,0
5,finance,3,low,1,0,1,1,0,0,1,0,0


In [50]:
#5. Binary

df_binary = df_obj.copy()

encoder = ce.BinaryEncoder(cols=categorical_cols)

df_binary_only = encoder.fit_transform(df_binary)
df_binary = pd.merge(df_binary, df_binary_only, on="TransactionID")
df_binary

Unnamed: 0_level_0,Profession,Bank_dep,Risk,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Risk_2
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Self-employed,9,High,0,0,1,0,0,1,0,0,1
2,students,5,high,0,1,0,0,1,0,0,1,0
3,Horeca,7,medium,0,1,1,0,1,1,0,1,1
4,self-employed,9,high,1,0,0,0,0,1,0,1,0
5,finance,3,low,1,0,1,1,0,0,1,0,0


In [51]:
#6. CatBoost Encoder

df_catboost_encoder = df_obj.copy()

encoder = ce.CatBoostEncoder(cols=categorical_cols)

df_catboost_encoder_only = encoder.fit_transform(df_catboost_encoder)
df_catboost_encoder = pd.merge(df_catboost_encoder, df_catboost_encoder_only, on="TransactionID")
df_catboost_encoder

TypeError: fit_transform() missing argument: y