# Etapa de processamento e preparação dos dados 

### Carregando arquivos pré-processados

Imports

In [1]:
try:
  import dask.dataframe as dd
except ImportError:
  !pip install "dask[complete]" --upgrade
  !pip install dask_ml

  import os
  os.kill(os.getpid(), 9)

try:
  import fastparquet
except ModuleNotFoundError:
  !pip install fastparquet
  import fastparquet

import glob, string, os


Montando o Drive

In [2]:
if not os.path.exists('/content/drive'):
  from google.colab import drive
  drive.mount('/content/drive')

Identificando os arquivos já pré-processados

In [3]:
os.chdir("/content/drive/MyDrive/Colab Notebooks/IC_Data_Science_ENEM/Dados")
file_list = [file for file in glob.glob("PRE_PROCESSED*.pq")]

In [4]:
file_list

['PRE_PROCESSED_ENEM_2017.pq',
 'PRE_PROCESSED_ENEM_2018.pq',
 'PRE_PROCESSED_ENEM_2019.pq']

Separando as colunas que são comuns a todos os arquivos

In [5]:
list_temp = list()
for file in file_list:
  data_path = f'/content/drive/MyDrive/Colab Notebooks/IC_Data_Science_ENEM/Dados/{file}'
  df_temp = dd.read_parquet(data_path, sep=';', encoding='latin-1', assume_missing=True)
  list_temp.append(df_temp.columns.values.tolist())
  
list_to_read = sorted(set(list_temp[0]).intersection(*list_temp), key=list_temp[0].index)

Lendo todos os arquivos .parquet com apenas as colunas em comum para uma lista e depois concatená-la

In [6]:
data_path = [f'/content/drive/MyDrive/Colab Notebooks/IC_Data_Science_ENEM/Dados/{file}' for file in file_list]
list_df = [dd.read_parquet(path, columns=list_to_read) for path in data_path]

In [7]:
len(list_df)

3

In [8]:
df = dd.concat(list_df)

In [9]:
df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,CO_MUNICIPIO_NASCIMENTO,NO_MUNICIPIO_NASCIMENTO,CO_UF_NASCIMENTO,SG_UF_NASCIMENTO,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,IN_TREINEIRO,IN_SEM_RECURSO,CO_MUNICIPIO_PROVA,NO_MUNICIPIO_PROVA,CO_UF_PROVA,SG_UF_PROVA,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,NU_NOTA_MEDIA,TP_PRESENCA,IN_ATEND_ESPECIALIZADO,IN_ATEND_ESPECIFICO,TP_UF_MUDANCA,TP_MUNICIPIO_MUDANCA
0,170003336736,2017,3503208,Araraquara,35,SP,29,F,0,1,1,3503208.0,Araraquara,35.0,SP,1,10,1,0,0,3503208,Araraquara,35,SP,E,D,C,B,3.0,D,A,B,C,A,B,B,A,C,A,B,A,A,C,B,B,C,B,B,B,536.28,1,0,0,0,0
1,170003333545,2017,5002902,Cassilândia,50,MS,22,F,0,1,1,5002902.0,Cassilândia,50.0,MS,1,5,1,0,0,5002902,Cassilândia,50,MS,C,E,B,B,4.0,C,A,B,D,B,B,B,A,A,A,B,A,A,B,A,A,C,B,A,A,,0,0,0,0,0
2,170001663644,2017,3550308,São Paulo,35,SP,38,F,0,1,1,3550308.0,São Paulo,35.0,SP,1,10,1,0,0,3550308,São Paulo,35,SP,A,A,F,B,4.0,G,A,B,C,A,A,B,A,B,A,B,A,A,B,A,A,C,A,B,B,,0,0,0,0,0
3,170001663645,2017,4209300,Lages,42,SC,35,F,0,1,1,4209300.0,Lages,42.0,SC,1,6,1,0,0,4209300,Lages,42,SC,E,D,B,B,4.0,D,A,B,D,B,A,B,A,B,A,B,A,B,C,A,B,D,A,B,B,589.9,1,0,0,0,0
4,170001663646,2017,2704302,Maceió,27,AL,40,M,0,3,1,2704302.0,Maceió,27.0,AL,1,10,1,0,0,2704302,Maceió,27,AL,A,B,B,F,3.0,C,A,B,B,A,B,B,A,A,A,A,A,A,B,B,A,C,A,C,B,557.34,1,0,0,0,0


Verificando os dados

In [10]:
# tamanho do dataset
len(df)

16850882

In [11]:
# Somente a coluna de notas com valores nulos
df[df.columns[df.isna().any().compute()]]

Unnamed: 0_level_0,NU_NOTA_MEDIA
npartitions=168,Unnamed: 1_level_1
,float64
,...
...,...
,...
,...


### Preparação e Processamento de Dados

É necessário transformar os dados de colunas para que sejam entradas válidas nos algoritmos de Machine Learning.

Este conjunto de dados é recheado de variáveis categóricas que necessitam de tratamento especial. Assim, classificam-se as colunas (de acordo com a definição presente no arquivo de dicionário de variáveis, fornecido pelo Inep):

*   **Quantitativas**: NU_IDADE (discreta), NU_NOTA_MEDIA (contínua), Q005 (discreta), TP_ANO_CONCLUIU (discreta - representando a quantidade de anos entre a prova e a conclusão do ensino médio)

*   Categóricas **dicotômicas**: TP_SEXO,  IN_TREINEIRO, IN_SEM_RECURSO, TP_PRESENCA, IN_ATEND_ESPECIALIZADO, IN_ATEND_ESPECIFICO, TP_UF_MUDANCA, TP_MUNICIPIO_MUDANCA, Q018, Q020, Q021, Q023, Q025

*   Categóricas **nominais**: CO_UF_RESIDENCIA,  TP_ESTADO_CIVIL, TP_COR_RACA, TP_NACIONALIDADE, CO_UF_NASCIMENTO, TP_ST_CONCLUSAO, TP_ESCOLA, CO_UF_PROVA, Q003, Q004  

*   Categóricas **ordinais**: Q001, Q002, Q006, Q007, Q008, Q009, Q010, Q011, Q012, Q013, Q014, Q015, Q016, Q017, Q019, Q022, Q024

Dentre as variáveis categóricas dicotômicas, as que ainda não foram ajustadas são: Q018, Q020, Q021, Q023, Q025.

Nenhuma das variáveis categóricas nominais ou ordinais foram ajustadas ainda.

Excluindo colunas. Todas as colunas não mencionadas não serão utilizadas para fim de aprendizado de máquina.

In [12]:
columns_to_drop = ['CO_MUNICIPIO_NASCIMENTO', 'NO_MUNICIPIO_NASCIMENTO', 'CO_UF_NASCIMENTO', 'SG_UF_NASCIMENTO',
                   'CO_MUNICIPIO_RESIDENCIA', 'NO_MUNICIPIO_RESIDENCIA', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA',
                   'CO_MUNICIPIO_PROVA', 'NO_MUNICIPIO_PROVA', 'CO_UF_PROVA', 'NU_INSCRICAO']
df = df.drop(labels=columns_to_drop, axis=1, errors='ignore')

**Quantitativas**

In [13]:
dtypes = {
    'NU_IDADE': 'int8',  'NU_NOTA_MEDIA': 'float16',  'Q005': 'int8',  'TP_ANO_CONCLUIU': 'int8'
}
df = df.astype(dtypes)

**Dicotômicas**

In [14]:
dtypes = {
    'IN_TREINEIRO': 'int8', 'IN_SEM_RECURSO': 'int8', 
    'IN_ATEND_ESPECIALIZADO': 'int8',
    'IN_ATEND_ESPECIFICO': 'int8'
}
df = df.astype(dtypes)

In [15]:
columns = ['Q018', 'Q020', 'Q021', 'Q023', 'Q025']
df[columns] = df[columns].replace({'A': 0, 'B': 1})
df['TP_SEXO'] = df['TP_SEXO'].replace(to_replace=['M', 'F'], value=[0, 1])

df[columns] = df[columns].astype('int8')
df['TP_SEXO'] = df['TP_SEXO'].astype('int8')

In [16]:
df.head()

Unnamed: 0,NU_ANO,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,IN_TREINEIRO,IN_SEM_RECURSO,SG_UF_PROVA,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,NU_NOTA_MEDIA,TP_PRESENCA,IN_ATEND_ESPECIALIZADO,IN_ATEND_ESPECIFICO,TP_UF_MUDANCA,TP_MUNICIPIO_MUDANCA
0,2017,29,1,0,1,1,1,10,1,0,0,SP,E,D,C,B,3,D,A,B,C,A,B,B,A,C,A,B,A,0,C,1,1,C,1,B,1,536.5,1,0,0,0,0
1,2017,22,1,0,1,1,1,5,1,0,0,MS,C,E,B,B,4,C,A,B,D,B,B,B,A,A,A,B,A,0,B,0,0,C,1,A,0,,0,0,0,0,0
2,2017,38,1,0,1,1,1,10,1,0,0,SP,A,A,F,B,4,G,A,B,C,A,A,B,A,B,A,B,A,0,B,0,0,C,0,B,1,,0,0,0,0,0
3,2017,35,1,0,1,1,1,6,1,0,0,SC,E,D,B,B,4,D,A,B,D,B,A,B,A,B,A,B,A,1,C,0,1,D,0,B,1,590.0,1,0,0,0,0
4,2017,40,0,0,3,1,1,10,1,0,0,AL,A,B,B,F,3,C,A,B,B,A,B,B,A,A,A,A,A,0,B,1,0,C,0,C,1,557.5,1,0,0,0,0


**Categóricas Ordinais**

In [17]:
keys = list(string.ascii_uppercase[0:17])
values = list(range(0,17))
categories = dict(zip(keys, values))

#print(categories)

In [18]:
col_cat_ord = ['Q001', 'Q002', 'Q006', 'Q007', 'Q008', 'Q009', 'Q010', 'Q011', 'Q012', 'Q013', 'Q014',
               'Q015', 'Q016', 'Q017', 'Q019', 'Q022', 'Q024']
df[col_cat_ord] = df[col_cat_ord].replace(categories)
df[col_cat_ord] = df[col_cat_ord].astype('int8')

In [19]:
df.head()

Unnamed: 0,NU_ANO,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,IN_TREINEIRO,IN_SEM_RECURSO,SG_UF_PROVA,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,NU_NOTA_MEDIA,TP_PRESENCA,IN_ATEND_ESPECIALIZADO,IN_ATEND_ESPECIFICO,TP_UF_MUDANCA,TP_MUNICIPIO_MUDANCA
0,2017,29,1,0,1,1,1,10,1,0,0,SP,4,3,C,B,3,3,0,1,2,0,1,1,0,2,0,1,0,0,2,1,1,2,1,1,1,536.5,1,0,0,0,0
1,2017,22,1,0,1,1,1,5,1,0,0,MS,2,4,B,B,4,2,0,1,3,1,1,1,0,0,0,1,0,0,1,0,0,2,1,0,0,,0,0,0,0,0
2,2017,38,1,0,1,1,1,10,1,0,0,SP,0,0,F,B,4,6,0,1,2,0,0,1,0,1,0,1,0,0,1,0,0,2,0,1,1,,0,0,0,0,0
3,2017,35,1,0,1,1,1,6,1,0,0,SC,4,3,B,B,4,3,0,1,3,1,0,1,0,1,0,1,0,1,2,0,1,3,0,1,1,590.0,1,0,0,0,0
4,2017,40,0,0,3,1,1,10,1,0,0,AL,0,1,B,F,3,2,0,1,1,0,1,1,0,0,0,0,0,0,1,1,0,2,0,2,1,557.5,1,0,0,0,0


**Categóricas nominais**

In [20]:
from dask_ml.preprocessing import DummyEncoder

In [21]:
dtypes = {
    'TP_ESTADO_CIVIL': 'category', 'TP_COR_RACA': 'category', 'TP_NACIONALIDADE': 'category',
     'TP_ST_CONCLUSAO': 'category', 'TP_ESCOLA': 'category'
}
df = df.astype(dtypes)

In [22]:
%%time
col_cat_nom = ['TP_ESTADO_CIVIL', 'TP_COR_RACA', 'TP_NACIONALIDADE',
               'TP_ST_CONCLUSAO', 'TP_ESCOLA', 'SG_UF_PROVA', 'Q003', 'Q004']
df[col_cat_nom] = df[col_cat_nom].categorize()

CPU times: user 8min 38s, sys: 13.2 s, total: 8min 51s
Wall time: 7min 14s


In [23]:
## ERROR ##
enc = DummyEncoder()
trn = enc.fit_transform(df[col_cat_nom])

In [24]:
df = dd.concat([df, trn.categorize()], axis=1, ignore_unknown_divisions=True)
df = df.drop(columns=col_cat_nom, errors='ignore')
df['NU_NOTA_MEDIA'] = df['NU_NOTA_MEDIA'].fillna(0)

In [25]:
df.head()

Unnamed: 0,NU_ANO,NU_IDADE,TP_SEXO,TP_ANO_CONCLUIU,IN_TREINEIRO,IN_SEM_RECURSO,Q001,Q002,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,NU_NOTA_MEDIA,TP_PRESENCA,IN_ATEND_ESPECIALIZADO,IN_ATEND_ESPECIFICO,TP_UF_MUDANCA,TP_MUNICIPIO_MUDANCA,TP_ESTADO_CIVIL_0,TP_ESTADO_CIVIL_1,TP_ESTADO_CIVIL_2,TP_ESTADO_CIVIL_3,TP_ESTADO_CIVIL_4,...,TP_ESCOLA_4,SG_UF_PROVA_SP,SG_UF_PROVA_MS,SG_UF_PROVA_SC,SG_UF_PROVA_AL,SG_UF_PROVA_RS,SG_UF_PROVA_PR,SG_UF_PROVA_PE,SG_UF_PROVA_BA,SG_UF_PROVA_MT,SG_UF_PROVA_CE,SG_UF_PROVA_PI,SG_UF_PROVA_MG,SG_UF_PROVA_GO,SG_UF_PROVA_AP,SG_UF_PROVA_ES,SG_UF_PROVA_RN,SG_UF_PROVA_MA,SG_UF_PROVA_AM,SG_UF_PROVA_DF,SG_UF_PROVA_RJ,SG_UF_PROVA_SE,SG_UF_PROVA_PB,SG_UF_PROVA_TO,SG_UF_PROVA_AC,SG_UF_PROVA_PA,SG_UF_PROVA_RO,SG_UF_PROVA_RR,Q003_C,Q003_B,Q003_F,Q003_A,Q003_D,Q003_E,Q004_B,Q004_F,Q004_D,Q004_A,Q004_C,Q004_E
0,2017,29,1,10,0,0,4,3,3,3,0,1,2,0,1,1,0,2,0,1,0,0,2,1,1,2,1,1,1,536.5,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,2017,22,1,5,0,0,2,4,4,2,0,1,3,1,1,1,0,0,0,1,0,0,1,0,0,2,1,0,0,0.0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,2017,38,1,10,0,0,0,0,4,6,0,1,2,0,0,1,0,1,0,1,0,0,1,0,0,2,0,1,1,0.0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,2017,35,1,6,0,0,4,3,4,3,0,1,3,1,0,1,0,1,0,1,0,1,2,0,1,3,0,1,1,590.0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
4,2017,40,0,10,0,0,0,1,3,2,0,1,1,0,1,1,0,0,0,0,0,0,1,1,0,2,0,2,1,557.5,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [26]:
df.dtypes.unique()

array([dtype('int64'), dtype('int8'), dtype('float16'), dtype('uint8')],
      dtype=object)

### Salvando arquivo

In [27]:
%%time
save_path = '/content/drive/MyDrive/Colab Notebooks/IC_Data_Science_ENEM/Dados/PROCESSED_ENEM.pq'
dd.to_parquet(df, path=save_path, write_index=False)

CPU times: user 9min 40s, sys: 27.2 s, total: 10min 8s
Wall time: 9min 5s


In [28]:
test = dd.read_parquet(save_path)

In [29]:
test.head()

Unnamed: 0,NU_ANO,NU_IDADE,TP_SEXO,TP_ANO_CONCLUIU,IN_TREINEIRO,IN_SEM_RECURSO,Q001,Q002,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,NU_NOTA_MEDIA,TP_PRESENCA,IN_ATEND_ESPECIALIZADO,IN_ATEND_ESPECIFICO,TP_UF_MUDANCA,TP_MUNICIPIO_MUDANCA,TP_ESTADO_CIVIL_0,TP_ESTADO_CIVIL_1,TP_ESTADO_CIVIL_2,TP_ESTADO_CIVIL_3,TP_ESTADO_CIVIL_4,...,TP_ESCOLA_4,SG_UF_PROVA_SP,SG_UF_PROVA_MS,SG_UF_PROVA_SC,SG_UF_PROVA_AL,SG_UF_PROVA_RS,SG_UF_PROVA_PR,SG_UF_PROVA_PE,SG_UF_PROVA_BA,SG_UF_PROVA_MT,SG_UF_PROVA_CE,SG_UF_PROVA_PI,SG_UF_PROVA_MG,SG_UF_PROVA_GO,SG_UF_PROVA_AP,SG_UF_PROVA_ES,SG_UF_PROVA_RN,SG_UF_PROVA_MA,SG_UF_PROVA_AM,SG_UF_PROVA_DF,SG_UF_PROVA_RJ,SG_UF_PROVA_SE,SG_UF_PROVA_PB,SG_UF_PROVA_TO,SG_UF_PROVA_AC,SG_UF_PROVA_PA,SG_UF_PROVA_RO,SG_UF_PROVA_RR,Q003_C,Q003_B,Q003_F,Q003_A,Q003_D,Q003_E,Q004_B,Q004_F,Q004_D,Q004_A,Q004_C,Q004_E
0,2017,29,1,10,0,0,4,3,3,3,0,1,2,0,1,1,0,2,0,1,0,0,2,1,1,2,1,1,1,536.5,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,2017,22,1,5,0,0,2,4,4,2,0,1,3,1,1,1,0,0,0,1,0,0,1,0,0,2,1,0,0,0.0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,2017,38,1,10,0,0,0,0,4,6,0,1,2,0,0,1,0,1,0,1,0,0,1,0,0,2,0,1,1,0.0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,2017,35,1,6,0,0,4,3,4,3,0,1,3,1,0,1,0,1,0,1,0,1,2,0,1,3,0,1,1,590.0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
4,2017,40,0,10,0,0,0,1,3,2,0,1,1,0,1,1,0,0,0,0,0,0,1,1,0,2,0,2,1,557.5,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [30]:
len(test)

16850882