In [5]:
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

In [10]:
# The eight client attributes used by the preprocessing step. They are
# selected by name instead of by column position so that a change in the
# CSV layout fails loudly rather than silently picking the wrong columns.
FEATURE_COLUMNS = [
    "age", "job", "marital", "education",
    "default", "balance", "housing", "loan",
]

# `usecols` avoids parsing the remaining columns of the file and raises a
# ValueError if any expected column is missing; the trailing selection pins
# the column order to FEATURE_COLUMNS.
df = pd.read_csv("data/train.csv", sep=";", usecols=FEATURE_COLUMNS)[FEATURE_COLUMNS]

df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan
0,58,management,married,tertiary,no,2143,yes,no
1,44,technician,single,secondary,no,29,yes,no
2,33,entrepreneur,married,secondary,no,2,yes,yes
3,47,blue-collar,married,unknown,no,1506,yes,no
4,33,unknown,single,unknown,no,1,no,no
...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no
45207,71,retired,divorced,primary,no,1729,no,no
45208,72,retired,married,secondary,no,5715,no,no
45209,57,blue-collar,married,secondary,no,668,no,no


In [9]:
# One-hot encoding for the nominal columns. `drop="first"` removes the first
# category of every feature and `sparse_output=False` returns a dense array.
# NOTE(review): with `drop` set, `handle_unknown="ignore"` encodes a category
# unseen at fit time as all zeros, i.e. identically to the dropped first
# category -- confirm this ambiguity is acceptable before transforming new data.
categorical_transformer_onehot = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False))
    ]
)

# Explicit ranking for the ordinal column. Without `categories`,
# OrdinalEncoder derives the order by sorting the labels, so the ranking
# primary < secondary < tertiary only held by lexicographic coincidence.
# Spelling it out keeps the produced codes (primary=0 ... unknown=3) and makes
# `fit` raise on an unexpected label instead of silently shifting the codes.
# Assumes the column holds only these four labels -- TODO confirm against the
# full data set.
# NOTE(review): "unknown" is ranked above "tertiary" purely to preserve the
# existing encoding; consider treating it as a missing value instead.
EDUCATION_LEVELS = ["primary", "secondary", "tertiary", "unknown"]

categorical_transformer_ordinal = Pipeline(
    steps=[
        ("encoder", OrdinalEncoder(categories=[EDUCATION_LEVELS]))
    ]
)

# Power transform for the numeric columns, using scikit-learn's default
# settings for PowerTransformer.
num = Pipeline(
    steps=[
        ("encoder", PowerTransformer())
    ]
)


# Column groups, one per encoding strategy.
onehot_columns = ["default", "housing", "loan", "job", "marital"]
ordinal_columns = ["education"]
numeric_columns = ["age", "balance"]

# Route every column group through its own transformer. The transformer
# names ('cat_onehot', 'cat_ordinal', 'num') end up as the prefixes of the
# generated feature names, e.g. `cat_onehot__default_yes`.
preprocessor = ColumnTransformer(
    [
        ('cat_onehot', categorical_transformer_onehot, onehot_columns),
        ('cat_ordinal', categorical_transformer_ordinal, ordinal_columns),
        ('num', num, numeric_columns),
    ]
)

# Single-step pipeline wrapping the column-wise preprocessor.
pipeline = Pipeline([("preprocessor", preprocessor)])

# Learn the encodings and the power-transform parameters from the raw frame.
pipe_fit = pipeline.fit(df)

# Apply the fitted pipeline to the same frame and label the resulting columns
# with the generated feature names.
feature_names = pipe_fit.get_feature_names_out().tolist()
transformed = pipe_fit.transform(df)
data = pd.DataFrame(transformed, columns=feature_names)

data

Unnamed: 0,cat_onehot__default_yes,cat_onehot__housing_yes,cat_onehot__loan_yes,cat_onehot__job_blue-collar,cat_onehot__job_entrepreneur,cat_onehot__job_housemaid,cat_onehot__job_management,cat_onehot__job_retired,cat_onehot__job_self-employed,cat_onehot__job_services,cat_onehot__job_student,cat_onehot__job_technician,cat_onehot__job_unemployed,cat_onehot__job_unknown,cat_onehot__marital_married,cat_onehot__marital_single,cat_ordinal__education,num__age,num__balance
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.473637,0.414773
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.429379,-0.410774
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.709873,-0.431122
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.683128,0.197685
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0,-0.709873,-0.432119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.993503,-0.052398
45207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.207438,0.275122
45208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.257237,1.495765
45209,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.409327,-0.114235


In [19]:
# Persist the preprocessed feature matrix as Parquet; the DataFrame index is
# not written to the file.
preprocessed_path = 'data/df_preprocessed.parquet'
data.to_parquet(preprocessed_path, index=False)