# Scripts de pré-processamento

In [104]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, RobustScaler, MaxAbsScaler, PowerTransformer

In [105]:
def hybrid_balancing(X: pd.DataFrame, y: pd.Series, tomek: str='majority', smote: str='not majority') -> pd.DataFrame:
    """
    Balance the classes by undersampling with Tomek links and then oversampling with SMOTE.

    1. ``TomekLinks`` is fitted on ``(X, y)`` using ``sampling_strategy=tomek``
       (by default it targets the majority class).
    2. ``SMOTE`` is fitted on that result using ``sampling_strategy=smote``
       (by default it targets every class except the majority one).

    No ``random_state`` is passed to either sampler, so the synthetic samples
    are not guaranteed to be identical between runs.

    :param X: DataFrame with the independent variables
    :param y: Series with the dependent variable
    :param tomek: ``sampling_strategy`` forwarded to ``TomekLinks``
    :param smote: ``sampling_strategy`` forwarded to ``SMOTE``
    :return: resampled features with the resampled target appended as the last column
    :rtype: pd.DataFrame
    """
    # Step 1: undersample with Tomek links.
    features, target = TomekLinks(sampling_strategy=tomek).fit_resample(X, y)

    # Step 2: oversample the output of step 1 with SMOTE.
    features, target = SMOTE(sampling_strategy=smote).fit_resample(features, target)

    # Glue features and target back together, column-wise.
    return pd.concat([features, target], axis=1)

In [106]:
def scaling(num_vars: pd.DataFrame, scaler: str='minmax') -> pd.DataFrame:
    """
    Scale the given numeric columns with the scikit-learn transformer selected by name.

    The transformer is fitted on ``num_vars`` and applied to it in one step
    (``fit_transform``). Note that ``'normalizer'`` selects ``Normalizer``,
    which rescales each row (sample) rather than each column.

    :param num_vars: DataFrame holding only the numeric columns to transform
    :param scaler: name of the transformer: 'minmax', 'standard', 'normalizer' or 'robust'
    :return: transformed DataFrame with the same column labels and the same index as ``num_vars``
    :rtype: pd.DataFrame
    :raises ValueError: if ``scaler`` is not one of the supported names
    """
    scaler_classes = {
        'minmax': MinMaxScaler,
        'standard': StandardScaler,
        'normalizer': Normalizer,
        'robust': RobustScaler,
    }

    # The isinstance guard keeps unhashable arguments on the ValueError path
    # instead of failing inside the dict lookup.
    if not isinstance(scaler, str) or scaler not in scaler_classes:
        raise ValueError(f"Scaler '{scaler}' não é reconhecido. Use 'minmax', 'standard', 'normalizer', ou 'robust'.")

    transformer = scaler_classes[scaler]()

    # Keep the caller's index: fit_transform returns a bare array, and rebuilding
    # the frame without `index=` would silently reset it and misalign any later
    # column-wise concat with frames that still carry the original index.
    return pd.DataFrame(
        transformer.fit_transform(num_vars),
        columns=num_vars.columns,
        index=num_vars.index,
    )


In [107]:
def one_hot_encoding(cat_vars: pd.DataFrame, cat_cols: list) -> pd.DataFrame:
    """
    One-hot encode every column of ``cat_vars``.

    :param cat_vars: DataFrame holding only the categorical columns to encode
    :param cat_cols: names of the columns in ``cat_vars``, used as prefixes for the
        generated feature names (passed to ``get_feature_names_out``)
    :return: dense DataFrame with one indicator column per (column, category) pair
        and the same index as ``cat_vars``
    :rtype: pd.DataFrame
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(cat_vars)

    # Keep the caller's index: the encoder returns a bare array, and rebuilding
    # the frame without `index=` would silently reset it and misalign any later
    # column-wise concat with frames that still carry the original index.
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cat_cols),
        index=cat_vars.index,
    )

In [108]:
def remove_cols(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Return a new DataFrame without the listed columns; ``df`` itself is not modified.

    :param df: source DataFrame
    :param cols: labels of the columns to drop
    :return: DataFrame with the remaining columns
    :rtype: pd.DataFrame
    """
    trimmed = df.drop(labels=cols, axis=1)
    return trimmed

In [109]:
def preprocess_data(df: pd.DataFrame, num_cols: list, cat_cols: list, scaler='minmax', target_col: str='Target', keep_target: bool=False) -> pd.DataFrame:
    """
    Run the full pipeline: class balancing, numeric scaling and one-hot encoding.

    Steps, in order:
    1. Split ``df`` into features and the ``target_col`` label.
    2. Balance the classes with ``hybrid_balancing``.
    3. Scale ``num_cols`` with ``scaling`` and encode ``cat_cols`` with ``one_hot_encoding``.
    4. Concatenate the scaled and encoded columns side by side.

    :param df: raw DataFrame containing the features and the label column
    :param num_cols: names of the numeric columns to scale
    :param cat_cols: names of the categorical columns to one-hot encode
    :param scaler: scaler name forwarded to ``scaling``
    :param target_col: name of the label column (default ``'Target'``)
    :param keep_target: when True, append the balanced label column to the result;
        the default False returns only the scaled and encoded feature columns
    :return: DataFrame with the scaled numeric columns followed by the encoded
        categorical columns (and the label last, if ``keep_target`` is True)
    :rtype: pd.DataFrame
    :raises KeyError: if ``target_col`` or any listed column is missing
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Normalise to a positional index so every piece concatenated below lines up
    # row by row, whatever index the resampler hands back.
    balanced = hybrid_balancing(X, y).reset_index(drop=True)

    num_vars_scaled = scaling(balanced[num_cols], scaler)
    cat_vars_encoded = one_hot_encoding(balanced[cat_cols], cat_cols)

    parts = [num_vars_scaled, cat_vars_encoded]
    if keep_target:
        parts.append(balanced[target_col])

    return pd.concat(parts, axis=1)

## Aplicação

In [110]:
# Load the raw students dataset; the path is relative to the current working directory.
df = pd.read_csv('../data/students.csv')

In [111]:
# Columns treated as categorical by the full pipeline (one-hot encoded).
cat_cols = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
    "Application order",
]

# Columns treated as numeric by the full pipeline (scaled).
num_cols = [
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

# Reduced feature set: categorical columns that get one-hot encoded.
cat_dropped_nonbinary_cols = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Application order",
]

# Reduced feature set: columns passed through unchanged, including the label.
cat_dropped_binary_cols = [
    "Daytime/evening attendance",
    "Displaced",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "Target",
]

# Reduced feature set: despite the "cat_" prefix, every name here also appears
# in num_cols, and the list is used below as the input to scaling.
cat_dropped_cols = [
    "Age at enrollment",
    "Curricular units 1st sem (approved)",
    "Curricular units 2nd sem (approved)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

In [112]:
# Count the distinct values in each column (shown as the cell output).
df.nunique()

Marital status                                      6
Application mode                                   18
Application order                                   8
Course                                             17
Daytime/evening attendance                          2
Previous qualification                             17
Nacionality                                        21
Mother's qualification                             29
Father's qualification                             34
Mother's occupation                                32
Father's occupation                                46
Displaced                                           2
Educational special needs                           2
Debtor                                              2
Tuition fees up to date                             2
Gender                                              2
Scholarship holder                                  2
Age at enrollment                                  46
International               

In [113]:
# Run the full pipeline (balancing, scaling, one-hot encoding) with the default
# scaler and save the result without the index column.
# NOTE(review): preprocess_data returns only the scaled numeric and the encoded
# categorical columns, so the saved file has no 'Target' column — confirm this
# is intended given the file name.
df_full_preprocessed = preprocess_data(df, num_cols, cat_cols)
df_full_preprocessed.to_csv('../data/students_TargetOHC.csv', index=False)

In [114]:
# Build two reduced-feature datasets straight from the raw frame (no class
# balancing here): one with the categorical columns one-hot encoded, one with
# them left as-is. Both keep the pass-through columns, including 'Target'.
num_vars = df[cat_dropped_cols]
cat_vars = df[cat_dropped_nonbinary_cols]
bin_var = df[cat_dropped_binary_cols]

num_vars_scaled = scaling(num_vars, 'minmax')

cat_vars_encoded = one_hot_encoding(cat_vars, cat_dropped_nonbinary_cols)

# Scaled numeric + pass-through + one-hot encoded categorical columns.
df_OHE_scaled = pd.concat([num_vars_scaled, bin_var, cat_vars_encoded], axis=1)
df_OHE_scaled.to_csv('../data/OHE_scaled_target.csv', index=False)

# Scaled numeric + pass-through + raw categorical columns. This must use
# num_vars_scaled: the previous version concatenated the unscaled num_vars,
# so the "scaled" file actually contained the raw numeric values.
df_scaled = pd.concat([num_vars_scaled, bin_var, cat_vars], axis=1)
df_scaled.to_csv('../data/scaled_target.csv', index=False)