In [7]:
import pathlib
import polars as pl

In [8]:
# The data directory is a sibling of the current working directory: <cwd>/../data
path_data = pathlib.Path.cwd().parent.joinpath("data")

# Read the training and test CSV files into polars DataFrames.
df = pl.read_csv(path_data.joinpath("train.csv"))
test = pl.read_csv(path_data.joinpath("test.csv"))

In [9]:
# Categorical variables
categorical_vars = [
    "Gender",
    "Location",
    "Subscription_Type",
    "Last_Interaction_Type",
    "Promo_Opted_In",
]

# Continuous numeric variables
continuous_vars = [
    "Age",
    "Account_Age_Months",
    "Monthly_Spending",
    "Total_Usage_Hours",
    "Streaming_Usage",      # percentage (0-99%)
    "Discount_Used",        # percentage (0-99%)
    "Satisfaction_Score",   # 1-10 scale
]

# Discrete numeric variables (counts)
discrete_vars = [
    "Support_Calls",        # number of support calls
    "Late_Payments",        # number of late payments
    "Complaint_Tickets",    # number of complaint tickets
]

# All numeric variables: continuous first, then discrete
numeric_vars = [*continuous_vars, *discrete_vars]

# Target variable
target_var = "Churn"

# Explanatory variables (features): every training column except the
# customer identifier and the target, in the order they appear in `df`.
feature_vars = [
    name for name in df.columns
    if name not in ("Customer_ID", target_var)
]

In [10]:
# Display the column names of the test set.
test.columns

['Customer_ID',
 'Age',
 'Gender',
 'Location',
 'Subscription_Type',
 'Account_Age_Months',
 'Monthly_Spending',
 'Total_Usage_Hours',
 'Support_Calls',
 'Late_Payments',
 'Streaming_Usage',
 'Discount_Used',
 'Satisfaction_Score',
 'Last_Interaction_Type',
 'Complaint_Tickets',
 'Promo_Opted_In']

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing:
#   - standardize the numeric variables with StandardScaler
#   - one-hot encode the categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_vars),
        # drop='first' removes one level per feature to avoid the dummy variable trap
        ('cat', OneHotEncoder(drop='first'), categorical_vars),
    ],
    remainder='passthrough',  # columns not listed above are kept unchanged
)
# NOTE(review): OneHotEncoder is left at its default `handle_unknown` setting,
# so a category present only in the test set would presumably make
# `transform` raise — confirm against the installed scikit-learn version.

# Fit the preprocessor on the training features only.
X_ = df.select(feature_vars).to_pandas()
X = preprocessor.fit_transform(X_)

# Target as a 1-D array of shape (n_samples,). Going through
# `df.select(...).to_numpy()` would give a 2-D (n_samples, 1) column vector,
# which scikit-learn estimators warn about and which broadcasts incorrectly
# when compared element-wise with 1-D predictions.
y = df.get_column(target_var).to_numpy()

# Apply the already-fitted preprocessor to the test set (no refit). The raw
# frame is not bound to `X_teste`, so re-running this cell is idempotent.
X_teste = preprocessor.transform(test.select(feature_vars).to_pandas())

In [12]:
import pickle

# Serialize X, y and X_teste with pickle into the data directory,
# one file per object, written in this order.
artifacts = {
    'X.pickle': X,
    'y.pickle': y,
    'X_teste.pickle': X_teste,
}
for file_name, payload in artifacts.items():
    with (path_data / file_name).open('wb') as f:
        pickle.dump(payload, f)