In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training split from train.csv (path is relative to the working
# directory) and preview its first rows.
train = pd.read_csv("train.csv")
train.head()

In [None]:
# Load the test split from test.csv (path is relative to the working
# directory) and preview its first rows.
test = pd.read_csv("test.csv")
test.head()

In [None]:
from pandas_profiling import ProfileReport

In [None]:
# profile = ProfileReport(train, title="Raw Deposit Dataset Analysis", explorative=True)
# profile.to_file("deposit-report.html")

## Drop the ID column

In [None]:
# Drop the `ID` column from both splits.
# `errors='ignore'` makes the cell idempotent: running it again after the
# column is already gone is a no-op instead of raising KeyError. Rebinding the
# name (rather than `inplace=True`) avoids mutating the frames in place.
test = test.drop(columns=['ID'], errors='ignore')
train = train.drop(columns=['ID'], errors='ignore')

## One-hot encode the marital status

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the distinct values of `marital` from the training split and one-hot
# encode that same column in a single fit-and-transform call.
marital_encoder = OneHotEncoder()
encoded_marital = marital_encoder.fit_transform(train[['marital']])

In [None]:
# Build a labelled frame of the one-hot indicators.
# The matrix is recomputed from the fitted encoder (instead of reading the
# `encoded_marital` name this cell rebinds) so the cell can be re-run safely.
# `categories_` is a list with one label array per encoded column; take
# element [0] to get flat column names. Passing the list itself makes pandas
# build a one-level MultiIndex for the columns.
encoded_marital = pd.DataFrame(
    marital_encoder.transform(train[['marital']]).toarray(),
    columns=marital_encoder.categories_[0],
)
# Keep the source value next to its indicator columns for visual checking.
encoded_marital['original_value'] = train['marital']
encoded_marital

## Bin and encode the age column

In [None]:
from custom_transformers import CutTransformer

In [None]:
# Transformer that buckets ages using the edges 0, 25, 35, 45, 55, 65, 1000.
# NOTE(review): CutTransformer is project-local; presumably it wraps
# pandas.cut and `as_str=True` yields the bin labels as strings — confirm.
age_bin = CutTransformer(bins = [0, 25, 35, 45, 55, 65, 1000], as_str=True)

In [None]:
# Work on a one-column copy so the binned labels can sit next to the raw ages
# without touching `train`.
age_frame = train.loc[:, ['age']].copy()
age_encoded = age_bin.fit_transform(age_frame.loc[:, ['age']])
age_frame = age_frame.assign(age_encoded=age_encoded)

In [None]:
# Display the raw `age` column next to its `age_encoded` bin label.
age_frame

In [None]:
# Separate encoder instance for the binned age labels (fitted in a later cell).
age_one_hot_encoder = OneHotEncoder()

In [None]:
# Fit the encoder on the binned age labels. The trailing semicolon suppresses
# the estimator repr in the cell output; unlike `_ = ...`, it does not rebind
# IPython's `_` last-output variable (IPython stops updating `_` once user
# code assigns to it).
age_one_hot_encoder.fit(age_frame[['age_encoded']]);

In [None]:
# Densify the one-hot matrix and label each indicator column with its bin.
# `categories_` holds one label array per fitted column, so take element [0]
# to get flat column names; passing the list itself would make pandas build a
# one-level MultiIndex for the columns.
one_hot_encoded_ages = pd.DataFrame(
    age_one_hot_encoder.transform(age_frame[['age_encoded']]).toarray(),
    columns=age_one_hot_encoder.categories_[0],
)

In [None]:
# Index-on-index merge: show each raw age and its bin label alongside the
# matching indicator columns.
age_frame.merge(one_hot_encoded_ages, left_index=True, right_index=True)

## Scale balance

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler for the `balance` column (fitted in a later cell).
balance_scaler = StandardScaler()

In [None]:
# One-column copy of `balance`, so the scaled values can be added next to the
# originals without modifying `train`.
balance_scaled = train[['balance']].copy()

In [None]:
# Fit the scaler on the raw balances, then store the standardized values in a
# new column beside the originals.
balance_scaled['balance_scaled'] = (
    balance_scaler
    .fit(balance_scaled[['balance']])
    .transform(balance_scaled[['balance']])
)

In [None]:
# Display raw `balance` next to its `balance_scaled` counterpart.
balance_scaled

## Map yes and no to integers

In [None]:
from custom_transformers import YesNoTransformer

# NOTE(review): YesNoTransformer is project-local; presumably it maps "yes"/"no"
# strings to integers, as the section heading says — confirm.
yes_no_transformer = YesNoTransformer()

In [None]:
# Two-column copy of `housing` and `loan`, so mapped columns can be added
# without modifying `train`.
housing_loan_frame = train[['housing', 'loan']].copy()

In [None]:
# Select the two source columns explicitly. After the first run the frame also
# holds the mapped columns, so passing the whole frame on a re-run would hand
# the transformer four columns instead of two. With the explicit selection the
# cell is idempotent.
housing_loan_frame[['housing_mapped', 'loan_mapped']] = yes_no_transformer.fit_transform(
    housing_loan_frame[['housing', 'loan']]
)

In [None]:
# Display the original `housing`/`loan` values next to their mapped columns.
housing_loan_frame

## Managing artifacts with Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn import set_config

# One-hot encode marital status.
# `sparse_threshold=0` makes this ColumnTransformer always return a dense
# array. It replaces `OneHotEncoder(sparse=False)`: the `sparse` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4, while its replacement
# `sparse_output` does not exist before 1.2. This spelling runs on both old
# and new releases and produces the same dense output.
one_hot_encode_marital = ColumnTransformer(
    [
        (
            'one_hot_encode_marital',  # Name of the transformation
            OneHotEncoder(),  # Transformation to apply
            ["marital"]  # Columns involved
        )
    ],
    sparse_threshold=0,
)

# Bin and encode age: bucket the raw ages first, then one-hot encode the
# resulting bin labels.
bin_and_encode_age_pipeline = Pipeline([
    ('bin_age', CutTransformer(bins = [0, 25, 35, 45, 55, 65, 1000], as_str=True)),
    ('encode_age', OneHotEncoder()),
])

bin_and_encode_age = ColumnTransformer([
    ('bin_and_encode_age', bin_and_encode_age_pipeline, ["age"])
])

# Scale balance
scale_balance = ColumnTransformer([
    (
        'scale_balance',  # Name of the transformation
        StandardScaler(),  # Transformation to apply
        ["balance"]  # Columns involved
    )
])

# Map housing and loan
map_housing_loan = ColumnTransformer([
    (
        'map_housing_loan',  # Name of the transformation
        YesNoTransformer(),  # Transformation to apply
        ["housing", 'loan']  # Columns involved
    )
])

# Assemble the whole pipeline: each branch selects its own columns, and the
# FeatureUnion concatenates the branch outputs side by side.
pipe = Pipeline([
    (
        'features',
        FeatureUnion([
            ('one_hot_encode_marital', one_hot_encode_marital),
            ('bin_and_encode_age', bin_and_encode_age),
            ('scale_balance', scale_balance),
            ('map_housing_loan', map_housing_loan),
        ])
    )
])

In [None]:
from sklearn import set_config

# Ask scikit-learn to render estimators as a diagram, then display the
# assembled pipeline as the cell output.
set_config(display="diagram")
pipe

In [None]:
# Fit every branch of the feature union on the training split.
pipe.fit(train)

# `.toarray()` assumes the union output is a SciPy sparse matrix; densify it
# so the engineered features can be inspected as a DataFrame.
train_features = pipe.transform(train)
pd.DataFrame(train_features.toarray())

In [None]:
# Display `train` again after the pipeline has been fitted and applied.
train