# Transformação de dados
---

In [None]:
import pandas as pd

# pipelines e transformadores
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# codificação de variáveis
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer

# normalização
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# imputação
from sklearn.impute import SimpleImputer

# modelo
from sklearn.linear_model import LinearRegression

# Ask scikit-learn transformers to emit pandas output, which keeps the
# examples easier to follow.
from sklearn import set_config

set_config(transform_output="pandas")

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CosineSineTransformer(BaseEstimator, TransformerMixin):
    """Encode a cyclic numeric feature as a (sine, cosine) pair.

    The first column of the input is converted to the angle
    ``2 * pi * value / max_value`` and replaced by the sine and the cosine of
    that angle. Any additional input columns are ignored.

    Parameters
    ----------
    max_value : float
        Divisor that turns a value into an angle, i.e. the length of one full
        cycle. Must not be zero.
    """

    def __init__(self, max_value):
        self.max_value = max_value

    def fit(self, X, y=None):
        """Record the input column names (when available) and return ``self``.

        Raises
        ------
        ValueError
            If ``max_value`` is zero, since the angle would be undefined.
        """
        if self.max_value == 0:
            raise ValueError("max_value must be non-zero")

        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Refit on an unnamed container: forget names from a previous fit.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Return a two-column array: sine and cosine of the first input column."""
        # np.asarray accepts DataFrames and ndarrays alike, so the transformer
        # no longer depends on the pandas-only ``.values`` attribute.
        values = np.asarray(X, dtype=float)
        if values.ndim == 1:
            values = values.reshape(-1, 1)

        feature = values[:, 0]
        angle = 2 * np.pi * feature / self.max_value
        return np.column_stack((np.sin(angle), np.cos(angle)))

    def get_feature_names_out(self, input_features=None):
        """Return the names of the two generated columns.

        Defining this method lets scikit-learn's ``set_output`` machinery wrap
        the result of ``transform`` in a named container when one is requested.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)

        if input_features is None or len(input_features) == 0:
            base = "x0"
        else:
            base = str(input_features[0])
        return np.asarray([f"{base}_sin", f"{base}_cos"], dtype=object)

In [None]:
# @title Leitura do conjunto de dados

# Load the tips dataset from the remote CSV file.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/atlantico-academy/datasets/refs/heads/main/tips.csv",
)

In [None]:
# @title Dicionário de dados
# Data dictionary: one row per dataset column, giving its description, its
# type and its subtype.
df_dict = pd.DataFrame(
    data=[
        ("total_bill", "Total pago da conta em dólares.", "quantitativa", "contínua"),
        ("tip", "Valor da gorjeta dada ao garçom em dólares.", "quantitativa", "contínua"),
        ("sex", "Gênero do cliente (Male/Female).", "qualitativa", "nominal"),
        ("smoker", "Indica se o cliente é fumante (Yes/No).", "qualitativa", "nominal"),
        ("day", "Dia da semana da refeição (Thur, Fri, Sat, Sun).", "qualitativa", "ciclica"),
        ("time", "Período do dia em que ocorreu a refeição (Lunch/Dinner).", "qualitativa", "ordinal"),
        ("size", "Número de pessoas na mesa.", "quantitativa", "discreta"),
    ],
    columns=["variavel", "descricao", "tipo", "subtipo"],
)
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,total_bill,Total pago da conta em dólares.,quantitativa,contínua
1,tip,Valor da gorjeta dada ao garçom em dólares.,quantitativa,contínua
2,sex,Gênero do cliente (Male/Female).,qualitativa,nominal
3,smoker,Indica se o cliente é fumante (Yes/No).,qualitativa,nominal
4,day,"Dia da semana da refeição (Thur, Fri, Sat, Sun).",qualitativa,ciclica
5,time,Período do dia em que ocorreu a refeição (Lunc...,qualitativa,ordinal
6,size,Número de pessoas na mesa.,quantitativa,discreta


In [None]:
# Target variable to be predicted.
target_variable = "tip"
# Variables that must not be used as model inputs.
unused_variables = [target_variable]


def select_variables(tipo: str, subtipo: str) -> list:
    """Return the variable names in ``df_dict`` with the given type and subtype.

    Variables listed in ``unused_variables`` are always left out, so a name
    that is dropped from ``X`` below can never be selected for preprocessing.
    """
    mask = (
        df_dict["tipo"].eq(tipo)
        & df_dict["subtipo"].eq(subtipo)
        & ~df_dict["variavel"].isin(unused_variables)
    )
    return df_dict.loc[mask, "variavel"].to_list()


# Qualitative nominal variables.
nominal_variables = select_variables("qualitativa", "nominal")
# Qualitative ordinal variables.
ordinal_variables = select_variables("qualitativa", "ordinal")
# Quantitative discrete variables.
discrete_variables = select_variables("quantitativa", "discreta")
# Quantitative continuous variables.
continuous_variables = select_variables("quantitativa", "contínua")
# Qualitative cyclic variables. The misspelled name is kept because later
# cells reference it.
temporal_variavels = select_variables("qualitativa", "ciclica")

# Build the feature matrix X and the target vector y.
X = df.drop(columns=unused_variables)
y = df[target_variable]


In [None]:
# @title Pré-processamento

# Ordinal variables: fill missing values with the most frequent category,
# then map the categories to integers in the given order.
ordinal_preprocessing = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),
    ("encoding", OrdinalEncoder(categories=[['Lunch', 'Dinner']])),
])

# Nominal variables: fill missing values with the most frequent category,
# then one-hot encode as a dense array.
nominal_preprocessing = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),
    ("encoding", OneHotEncoder(sparse_output=False)),
])

# Discrete variables: fill missing values with the median.
discrete_preprocessing = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='median')),
])

# Continuous variables: fill missing values with the mean.
continuous_preprocessing = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='mean')),
])

# Cyclic (day-of-week) variables: impute like the other categorical branches,
# map each day to its position in the week, then project that position onto
# the unit circle.
week_days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat']
temporal_preprocessing = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[week_days])),
    # The codes run from 0 to len(week_days) - 1, so the cycle length is the
    # number of days. A period of 6 would give Saturday (6) and Sunday (0)
    # exactly the same sine/cosine pair.
    ('sin_cos', CosineSineTransformer(max_value=len(week_days))),
])

# Route each group of columns to its own preprocessing branch.
preprocessing = ColumnTransformer(transformers=[
    ('temporal', temporal_preprocessing, temporal_variavels),
    ("ordinal", ordinal_preprocessing, ordinal_variables),
    ("nominal", nominal_preprocessing, nominal_variables),
    ("discrete", discrete_preprocessing, discrete_variables),
    ("continuous", continuous_preprocessing, continuous_variables),
])

# Full preprocessing: per-group transformations followed by standardization.
final_preprocessing = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('normalization', StandardScaler())
])


In [None]:
# Fit the preprocessing and scaling steps on the features only; no target is passed.
final_preprocessing.fit(X)

In [None]:
# Apply the fitted pipeline to the same features it was fitted on.
X_transformed = final_preprocessing.transform(X)



In [None]:
# Fit a linear regression on the preprocessed features.
model = LinearRegression().fit(X_transformed, y)

# Predictions for the same rows used for fitting (in-sample).
model.predict(X_transformed)

array([2.68283927, 2.20975609, 3.21704383, 3.28776484, 3.76298145,
       3.80242871, 1.8802053 , 3.95253064, 2.47211666, 2.44757169,
       2.02181089, 4.77026919, 2.50799007, 3.15481916, 2.47892722,
       3.08951702, 2.2354474 , 2.77145825, 2.86228813, 3.18305849,
       2.74399938, 2.99437156, 2.56766672, 5.13635334, 2.92336646,
       3.09628885, 2.31446244, 2.2502679 , 3.10084546, 2.93395317,
       1.95384021, 3.14726687, 2.50064008, 3.39480693, 2.73078286,
       3.50497519, 2.77334633, 2.85851198, 2.99802719, 4.18562605,
       2.74785732, 2.70057367, 2.36827256, 1.96611269, 4.28483175,
       2.77892876, 3.15087944, 4.47363919, 3.92884792, 2.75532783,
       2.23610735, 2.05033431, 4.72778752, 1.99065766, 3.82791772,
       2.89221323, 4.90126217, 3.57212235, 2.01140059, 5.9718263 ,
       2.86575429, 2.25401816, 1.99063177, 3.03962073, 2.89418309,
       3.12924837, 2.63186126, 1.08542057, 2.96207199, 2.36730263,
       2.18701741, 2.87172851, 3.51262211, 3.36346423, 2.46948

In [None]:
# Column-wise preprocessing without the cyclic branch.
# NOTE(review): this rebinds the name `preprocessing`, and with
# remainder="drop" (the default, spelled out here) any column not listed
# below is discarded. Confirm that leaving out the cyclic variables is
# intentional.
preprocessing = ColumnTransformer(
    transformers=[
        ("ordinal", ordinal_preprocessing, ordinal_variables),
        ("nominal", nominal_preprocessing, nominal_variables),
        ("discrete", discrete_preprocessing, discrete_variables),
        ("continuous", continuous_preprocessing, continuous_variables),
    ],
    remainder="drop",
)

In [None]:
# End-to-end pipeline: column preprocessing, standardization and the
# regression model chained into a single estimator.
approach = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("normalizer", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

In [None]:
# Fit preprocessing, scaling and the linear model in a single call.
approach.fit(X, y)

In [None]:
# Predictions for the same rows used for fitting (in-sample).
approach.predict(X)

array([2.72196753, 2.23488035, 3.24132904, 3.3183574 , 3.78847743,
       3.81986003, 1.91197035, 3.96983692, 2.50338865, 2.47886412,
       2.05345798, 4.79492612, 2.53923218, 3.17278993, 2.51822534,
       3.12027472, 2.26858206, 2.79611462, 2.89490065, 3.15463219,
       2.72230508, 2.9805005 , 2.5541511 , 5.0999337 , 2.90152274,
       3.06156855, 2.29312593, 2.22898487, 3.07885391, 2.92013244,
       1.93280409, 3.1125041 , 2.48718029, 3.36786977, 2.70909957,
       3.47628073, 2.7452613 , 2.83838782, 2.96975501, 4.15636462,
       2.71979353, 2.73165536, 2.39963105, 1.99780618, 4.30186123,
       2.80994518, 3.18158603, 4.4905114 , 3.9525402 , 2.78636391,
       2.26757593, 2.08998944, 4.75247983, 2.0223307 , 3.8453278 ,
       2.92313529, 4.89127224, 3.55777004, 2.0165504 , 5.93471073,
       2.87019245, 2.25896588, 1.99579888, 3.03118151, 2.86599742,
       3.10086689, 2.61829216, 1.10573978, 2.94019603, 2.37215598,
       2.16578706, 2.85159334, 3.52455324, 3.3755196 , 2.45605