# Scripts de pré-processamento

In [53]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, RobustScaler, MaxAbsScaler, PowerTransformer

In [54]:
def hybrid_balancing(X: pd.DataFrame, y: pd.Series, tomek: str='majority', smote: str='not majority', random_state=None) -> pd.DataFrame:
    """
    Balance the classes by combining Tomek-links undersampling with SMOTE oversampling.

    1. Tomek links are removed first, using ``tomek`` as the sampling strategy
       (with the default, only samples of the majority class are dropped).
    2. SMOTE is then applied to the cleaned data, using ``smote`` as the sampling
       strategy (with the default, every class except the majority one is oversampled).

    :param X: DataFrame with the independent variables
    :type X: pd.DataFrame
    :param y: Series with the dependent variable
    :type y: pd.Series
    :param tomek: sampling strategy passed to TomekLinks
    :type tomek: str
    :param smote: sampling strategy passed to SMOTE
    :type smote: str
    :param random_state: seed forwarded to SMOTE so the synthetic samples can be
        reproduced; ``None`` (the default) keeps the previous unseeded behaviour
    :type random_state: int, RandomState instance or None
    :return: DataFrame with the resampled features followed by the resampled target column
    :rtype: pd.DataFrame
    """

    # Undersampling step: drop the samples that form Tomek links.
    tl = TomekLinks(sampling_strategy=tomek)
    X_tl, y_tl = tl.fit_resample(X, y)

    # Oversampling step: SMOTE is the only stochastic part, so it receives the seed.
    sm = SMOTE(sampling_strategy=smote, random_state=random_state)
    X_tl_sm, y_tl_sm = sm.fit_resample(X_tl, y_tl)

    # Put features and target back together, side by side, in a single frame.
    return pd.concat([X_tl_sm, y_tl_sm], axis=1)

In [55]:
def scaling(num_vars: pd.DataFrame, scaler: str='minmax') -> pd.DataFrame:
    """
    Scale every column of ``num_vars`` with the scikit-learn transformer named by ``scaler``.

    :param num_vars: DataFrame containing only the numeric columns to be scaled
    :type num_vars: pd.DataFrame
    :param scaler: name of the transformer to use: 'minmax', 'standard',
        'normalizer', 'robust', 'maxabs' or 'power'
    :type scaler: str
    :return: DataFrame with the same columns and the same index as ``num_vars``,
        holding the transformed values
    :rtype: pd.DataFrame
    :raises ValueError: if ``scaler`` is not one of the supported names
    """

    # Keep the ``scaler`` argument as the string it was passed as and bind the
    # transformer instance to its own name.
    if scaler == 'minmax':
        transformer = MinMaxScaler()
    elif scaler == 'standard':
        transformer = StandardScaler()
    elif scaler == 'normalizer':
        transformer = Normalizer()
    elif scaler == 'robust':
        transformer = RobustScaler()
    elif scaler == 'maxabs':
        transformer = MaxAbsScaler()
    elif scaler == 'power':
        transformer = PowerTransformer()
    else:
        raise ValueError(f"Scaler '{scaler}' não é reconhecido. Use 'minmax', 'standard', 'normalizer', 'robust', 'maxabs', ou 'power'.")

    scaled_values = transformer.fit_transform(num_vars)

    # fit_transform returns a bare array; reattach both the column labels and the
    # original index so the result stays row-aligned with the input when it is
    # later concatenated with other frames.
    return pd.DataFrame(scaled_values, columns=num_vars.columns, index=num_vars.index)


In [56]:
def one_hot_encoding(cat_vars: pd.DataFrame, cat_cols: list) -> pd.DataFrame:
    """
    One-hot encode the columns ``cat_cols`` of ``cat_vars``.

    :param cat_vars: DataFrame that contains the categorical columns
    :type cat_vars: pd.DataFrame
    :param cat_cols: names of the columns of ``cat_vars`` to encode; they also
        prefix the generated column names
    :type cat_cols: list
    :return: DataFrame with one indicator column per (column, category) pair,
        indexed like ``cat_vars``
    :rtype: pd.DataFrame
    """

    one_hot_encoder = OneHotEncoder(sparse_output=False)

    # Select the requested columns explicitly so the encoded features and the
    # names passed to get_feature_names_out always refer to the same columns.
    encoded_values = one_hot_encoder.fit_transform(cat_vars[cat_cols])

    # Reattach the input index so the result stays row-aligned with ``cat_vars``.
    return pd.DataFrame(
        encoded_values,
        columns=one_hot_encoder.get_feature_names_out(cat_cols),
        index=cat_vars.index,
    )

In [None]:
def remove_cols(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Return a new DataFrame equal to ``df`` without the columns listed in ``cols``.

    :param df: DataFrame to take the columns from (it is not modified)
    :type df: pd.DataFrame
    :param cols: labels of the columns to drop
    :type cols: list
    :return: DataFrame without the dropped columns
    :rtype: pd.DataFrame
    """
    trimmed = df.drop(labels=cols, axis=1)
    return trimmed

In [57]:
def preprocess_data(df: pd.DataFrame, num_cols: list, cat_cols: list, scaler='minmax', drop_cols=None) -> pd.DataFrame:
    """
    Run the full pre-processing pipeline: optional column removal, class
    balancing, scaling of the numeric columns and one-hot encoding of the
    categorical columns.

    The target is read from the column named 'Target'. The returned frame only
    contains the columns listed in ``num_cols`` and ``cat_cols``, so the target
    is part of the result only if it is listed in one of them.

    :param df: raw DataFrame, which must contain a 'Target' column
    :type df: pd.DataFrame
    :param num_cols: names of the columns to scale
    :type num_cols: list
    :param cat_cols: names of the columns to one-hot encode
    :type cat_cols: list
    :param scaler: name of the scaler, forwarded to ``scaling``
    :type scaler: str
    :param drop_cols: columns to remove before anything else; ``None`` or an
        empty list removes nothing
    :type drop_cols: list or None
    :return: DataFrame with the scaled numeric columns followed by the one-hot
        encoded columns
    :rtype: pd.DataFrame
    """
    # Column removal is optional: the previous unconditional remove_cols() call
    # had no arguments and raised TypeError on every invocation.
    if drop_cols:
        df = remove_cols(df, drop_cols)

    X = df.drop(columns=['Target'])
    y = df['Target']

    # Balancing returns features and target together in a single frame.
    balanced = hybrid_balancing(X, y)

    num_vars_scaled = scaling(balanced[num_cols], scaler)
    cat_vars_encoded = one_hot_encoding(balanced[cat_cols], cat_cols)

    return pd.concat([num_vars_scaled, cat_vars_encoded], axis=1)

## Aplicação

In [58]:
# Load the students CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/students.csv')

In [59]:
# Columns handed to the one-hot encoder by preprocess_data.
# NOTE(review): the twelve 'Curricular units ...' columns are also listed in
# num_cols below, so preprocess_data both scales and one-hot encodes them.
# Confirm that this overlap is intended.
cat_cols = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
    # Six per-semester metrics, first semester then second semester.
    *[
        f"Curricular units {semester} sem ({metric})"
        for semester in ("1st", "2nd")
        for metric in (
            "credited",
            "enrolled",
            "evaluations",
            "approved",
            "grade",
            "without evaluations",
        )
    ],
    "Unemployment rate",
    "Inflation rate",
    "GDP",
    "Application order",
    "Target",
]

# Columns handed to the scaler by preprocess_data.
num_cols = [
    "Age at enrollment",
    *[
        f"Curricular units {semester} sem ({metric})"
        for semester in ("1st", "2nd")
        for metric in (
            "credited",
            "enrolled",
            "evaluations",
            "approved",
            "grade",
            "without evaluations",
        )
    ],
]



In [60]:
# Run the pipeline on the raw frame; no scaler is passed, so the default 'minmax' is used.
df = preprocess_data(df, num_cols, cat_cols)

In [61]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),...,Application order_1,Application order_2,Application order_3,Application order_4,Application order_5,Application order_6,Application order_9,Target_Dropout,Target_Enrolled,Target_Graduate
0,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [62]:
# Report row/column counts, dtypes and memory usage of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5886 entries, 0 to 5885
Columns: 8469 entries, Age at enrollment to Target_Graduate
dtypes: float64(8469)
memory usage: 380.3 MB


In [63]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),...,Application order_1,Application order_2,Application order_3,Application order_4,Application order_5,Application order_6,Application order_9,Target_Dropout,Target_Enrolled,Target_Graduate
count,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,...,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0,5886.0
mean,0.117077,0.03098,0.234298,0.184611,0.165987,0.550975,0.010916,0.024518,0.263684,0.246764,...,0.705063,0.121645,0.069487,0.050799,0.030071,0.022766,0.00017,0.333333,0.333333,0.333333
std,0.14053,0.107005,0.089297,0.090134,0.113186,0.25727,0.055433,0.09151,0.090334,0.118651,...,0.456053,0.326903,0.254302,0.219605,0.170798,0.149169,0.013034,0.471445,0.471445,0.471445
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.037736,0.0,0.192308,0.133333,0.076923,0.580198,0.0,0.0,0.217391,0.181818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.056604,0.0,0.230769,0.177778,0.192308,0.640593,0.0,0.0,0.26087,0.242424,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.150943,0.0,0.230769,0.222222,0.230769,0.697572,0.0,0.0,0.304348,0.30303,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
#df.to_csv('../data/students_TargetOHC.csv', index=False)