# Pré-processamento

## Importações

In [35]:
import pandas as pd

from preprocess_funcs import *

## Aplicação

In [36]:
categorical_cols = ['Marital status', 
            'Application mode', 
            'Course',
            'Daytime/evening attendance', 
            'Previous qualification', 
            'Nacionality',
            'Mother\'s qualification', 
            'Father\'s qualification',
            'Mother\'s occupation', 
            'Father\'s occupation', 
            'Displaced',
            'Educational special needs', 
            'Debtor', 
            'Tuition fees up to date',
            'Gender', 
            'Scholarship holder', 
            'International',
            'Application order']

numerical_cols = ['Age at enrollment',
            'Curricular units 1st sem (credited)',
            'Curricular units 1st sem (enrolled)',
            'Curricular units 1st sem (evaluations)',
            'Curricular units 1st sem (approved)',
            'Curricular units 1st sem (grade)',
            'Curricular units 1st sem (without evaluations)',
            'Curricular units 2nd sem (credited)',
            'Curricular units 2nd sem (enrolled)',
            'Curricular units 2nd sem (evaluations)',
            'Curricular units 2nd sem (approved)',
            'Curricular units 2nd sem (grade)',
            'Curricular units 2nd sem (without evaluations)',
            'Unemployment rate',
            'Inflation rate', 
            'GDP']

target = ['Target']

### Amostra
Pré-processamento usado para a etapa de análise exploratória

In [37]:
size = '_10'
df_10 = pd.read_csv(f'../data/students{size}.csv')
filename = f"../data/preprocessed/{size}"

#### numéricas

In [38]:
num_transformations = {
    'num': (remove_unused, {
        'cols': categorical_cols
    }),
    'ohe': (one_hot_encoding, {
        'categorical_cols': target,
        'drop_first': False
    }),
}

preprocess_data(df_10, num_transformations, filename)

Arquivo salvo: ../data/preprocessed/_10_num.csv
Arquivo salvo: ../data/preprocessed/_10_ohe.csv
Arquivo salvo: ../data/preprocessed/_10_num_ohe.csv


#### categóricas

In [39]:
cat_transformations = {
    'cat': (remove_unused, {
        'cols': numerical_cols
    })
}

preprocess_data(df_10, cat_transformations, filename)

Arquivo salvo: ../data/preprocessed/_10_cat.csv


## Pré-processamento pós análise
Após a análise de de importancia das features, vamos optar por remover as menos importantes, ou que possuem uma alta correlação entre si

### Complemento

In [40]:
size = '_90'
df_90 = pd.read_csv(f'../data/students{size}.csv')

In [41]:
drop_cols = ['Marital status', 
             'Application order', 
             'Previous qualification', 
             'Nacionality', 
             "Father's qualification", 
             "Mother's occupation", 
             "Father's occupation", 
             'Educational special needs', 
             'International',
             'Curricular units 1st sem (credited)',
             'Curricular units 1st sem (enrolled)',
             'Curricular units 1st sem (grade)',
             'Curricular units 2nd sem (credited)',
             'Curricular units 2nd sem (enrolled)',
             'Curricular units 2nd sem (grade)',
             'Curricular units 1st sem (evaluations)',
             'Curricular units 1st sem (without evaluations)',
             'Curricular units 2nd sem (evaluations)',
             'Curricular units 2nd sem (without evaluations)']

ohe = ['Application mode', 
        'Course',
        'Daytime/evening attendance',
        'Mother\'s qualification', 
        'Displaced',
        'Debtor', 
        'Tuition fees up to date',
        'Gender', 
        'Scholarship holder',
        'Target']

transformations = {
    'drp': (remove_unused, {
        'cols': drop_cols
    }),
    'ohe': (one_hot_encoding, {
        'categorical_cols': ohe,
        'drop_first': False
    }),
}

preprocess_data(df_90, transformations, filename)

Arquivo salvo: ../data/preprocessed/_10_drp.csv
Arquivo salvo: ../data/preprocessed/_10_ohe.csv
Arquivo salvo: ../data/preprocessed/_10_drp_ohe.csv
