# Pré-processamento

## Importações

In [174]:
import pandas as pd

from preprocess_funcs import *

## Aplicação

In [175]:
# Column names of the students dataset, grouped by how the preprocessing
# cells below use them. Order is kept stable on purpose.

# Columns handled as categorical features.
categorical_cols = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",  # spelled this way in the dataset header
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
    "Application order",
]

# Columns handled as numerical features. The curricular-unit columns follow a
# "<semester> sem (<metric>)" pattern, so they are generated rather than typed
# out: all 1st-semester metrics first, then the same metrics for the 2nd.
numerical_cols = [
    "Age at enrollment",
    *(
        f"Curricular units {semester} sem ({metric})"
        for semester in ("1st", "2nd")
        for metric in (
            "credited",
            "enrolled",
            "evaluations",
            "approved",
            "grade",
            "without evaluations",
        )
    ),
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

# Label column, kept as a one-element list so it can be passed wherever a
# list of column names is expected.
target = ["Target"]

### Amostra
Pré-processamento usado para a etapa de análise exploratória

In [176]:
# Suffix identifying the sample split used for exploratory analysis
# (presumably the 10% sample -- confirm against how the split was created).
size = "_10"

# Common path prefix for the preprocessed outputs of this split; the recorded
# run shows preprocess_data appending "_<name>.csv" to it.
filename = f"../data/preprocessed/{size}"

# Raw sample data.
df_10 = pd.read_csv(f"../data/students{size}.csv")

#### numéricas

In [177]:
# Named transformation steps for the numerical view of the sample.
# Each entry maps a short name to a (function, keyword-arguments) pair.
# In the recorded run, preprocess_data saved one file per entry plus a
# combined "num_ohe" file.
num_transformations = dict(
    # Drop the categorical columns so only numerical features (and the
    # target) remain.
    num=(remove_unused, dict(cols=categorical_cols)),
    # One-hot encode the target column, keeping every generated level.
    ohe=(one_hot_encoding, dict(categorical_cols=target, drop_first=False)),
)

preprocess_data(df_10, num_transformations, filename)

Arquivo salvo: ../data/preprocessed/_10_num.csv
Arquivo salvo: ../data/preprocessed/_10_ohe.csv
Arquivo salvo: ../data/preprocessed/_10_num_ohe.csv


#### categóricas

In [178]:
# Single transformation step for the categorical view of the sample:
# drop the numerical columns so only categorical features (and the target)
# remain. The recorded run saved the result with the "cat" suffix.
cat_transformations = dict(
    cat=(remove_unused, dict(cols=numerical_cols)),
)

preprocess_data(df_10, cat_transformations, filename)

Arquivo salvo: ../data/preprocessed/_10_cat.csv


## Pré-processamento pós análise
Após a análise de importância das features, optamos por remover as menos importantes e as que possuem alta correlação entre si.

### Complemento

In [179]:
# Switch to the complement split (presumably the remaining 90% of the data --
# confirm against how the split was created). Note that this rebinds `size`.
size = "_90"

# Raw complement data.
df_90 = pd.read_csv(f"../data/students{size}.csv")

In [180]:
# Preview the first rows of the complement split to check the loaded columns.
df_90.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [181]:
# Columns removed after the feature-importance / correlation analysis.
drop_cols = [
    # categorical features
    "Marital status",
    "Application order",
    "Previous qualification",
    "Nacionality",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Educational special needs",
    "International",
    # curricular-unit counters (only the "approved" counts are kept)
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (without evaluations)",
]

# Remaining categorical columns plus the target.
# NOTE(review): this list is not referenced by the later cells visible in
# this notebook (they use cat_nbin_cols / cat_bin_cols instead) -- confirm
# whether it is still needed.
ohe = [
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Mother's qualification",
    "Displaced",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "Target",
]

# Reduced complement frame without the discarded columns.
df_90_drp = remove_unused(df_90, drop_cols)

In [182]:
# Preview the reduced frame to confirm the listed columns were removed.
df_90_drp.head()

Unnamed: 0,Application mode,Course,Daytime/evening attendance,Mother's qualification,Displaced,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,Curricular units 1st sem (approved),Curricular units 2nd sem (approved),Unemployment rate,Inflation rate,GDP,Target
0,8,2,1,13,1,0,1,1,0,20,0,0,10.8,1.4,1.74,Dropout
1,6,11,1,1,1,0,0,1,0,19,6,6,13.9,-0.3,0.79,Graduate
2,1,5,1,22,1,0,0,1,0,19,0,0,10.8,1.4,1.74,Dropout
3,8,15,1,23,1,0,1,0,0,20,6,5,9.4,-0.8,-3.12,Graduate
4,12,3,0,22,0,0,1,0,0,45,5,6,13.9,-0.3,0.79,Graduate


In [183]:
# Summary statistics of the remaining numeric columns (value ranges help
# tell binary columns from multi-valued ones).
df_90_drp.describe()

Unnamed: 0,Application mode,Course,Daytime/evening attendance,Mother's qualification,Displaced,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,Curricular units 1st sem (approved),Curricular units 2nd sem (approved),Unemployment rate,Inflation rate,GDP
count,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0
mean,6.883225,9.884982,0.890758,12.317931,0.545957,0.114013,0.880713,0.349322,0.250377,23.278754,4.717228,4.450779,11.547062,1.221572,0.001883
std,5.307682,4.335738,0.311981,9.019802,0.497946,0.317867,0.324166,0.476816,0.433284,7.627845,3.100899,3.006977,2.651871,1.38296,2.267376
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,6.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,19.0,3.0,2.0,9.4,0.3,-1.7
50%,8.0,10.0,1.0,13.0,1.0,0.0,1.0,0.0,0.0,20.0,5.0,5.0,11.1,1.4,0.32
75%,12.0,13.0,1.0,22.0,1.0,0.0,1.0,1.0,1.0,25.0,6.0,6.0,13.9,2.6,1.79
max,18.0,17.0,1.0,29.0,1.0,1.0,1.0,1.0,1.0,70.0,26.0,20.0,16.2,3.7,3.51


In [184]:
# List the columns that survived the drop, used to build the groups below.
df_90_drp.columns

Index(['Application mode', 'Course', 'Daytime/evening attendance',
       'Mother's qualification', 'Displaced', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'Curricular units 1st sem (approved)',
       'Curricular units 2nd sem (approved)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Target'],
      dtype='object')

In [185]:
# Column groups of the reduced frame.

# Categorical columns with more than two values (plus the target); these are
# the ones that get one-hot encoded.
cat_nbin_cols = [
    "Application mode",
    "Course",
    "Mother's qualification",
    "Target",
]

# Categorical columns whose values are already 0/1 (see the describe() output
# above), so they are passed through unchanged.
cat_bin_cols = [
    "Daytime/evening attendance",
    "Displaced",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
]

# Numerical columns, passed through unchanged.
num_cols = [
    "Age at enrollment",
    "Curricular units 1st sem (approved)",
    "Curricular units 2nd sem (approved)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

### Rebalanceamento

### One Hot Encoding e conversão em problema de classe binária

In [186]:
# Split the reduced frame into the three column groups defined in the
# previous cell.
cat_vars = df_90_drp[cat_nbin_cols]
bin_var = df_90_drp[cat_bin_cols]
num_vars = df_90_drp[num_cols]

# Collapse the target into two classes: every label other than 'Dropout'
# becomes 'Graduate_or_Enrolled'.
# The result is stored as `target_binary` rather than `target`, because
# `target` already holds the column list ['Target'] that the sample
# preprocessing cell passes to one_hot_encoding. Rebinding it to a Series
# here would break that cell when it is re-run in the same kernel.
target_binary = cat_vars['Target'].where(
    cat_vars['Target'] == 'Dropout', 'Graduate_or_Enrolled'
)
# 'Target' is the last column of cat_nbin_cols, so replacing it in place keeps
# the same column order as dropping it and appending the new labels.
cat_vars = cat_vars.assign(Target=target_binary)

# One-hot encode the multi-valued columns and the binarised target, then keep
# a single indicator for the target: the 'Graduate_or_Enrolled' column is
# dropped and 'Target_Dropout' is renamed to 'Dropout'.
cat_vars_ohe = (
    one_hot_encoding(cat_vars, cat_nbin_cols)
    .drop('Target_Graduate_or_Enrolled', axis=1)
    .rename(columns={'Target_Dropout': 'Dropout'})
)

### Rejunção

In [187]:
# Put the three column groups back together side by side: encoded
# categoricals first, then the binary flags, then the numerical features.
df_90_drp_ohe = pd.concat(
    [cat_vars_ohe, bin_var, num_vars],
    axis="columns",
)

# Persist the final preprocessed complement split without the index column.
df_90_drp_ohe.to_csv('../data/preprocessed/_90_drp_ohe.csv', index=False)