In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pandas as pd
from zipfile import ZipFile
from sklearn.model_selection import train_test_split

# Absolute path of the raw PENSE 2019 archive, resolved relative to the notebook folder
PENSE_DATA = Path('../data/raw/PENSE_2019/PENSE_2019.zip').resolve()

# Context managers make sure both the archive and the member stream are closed
# as soon as the CSV has been parsed, instead of leaking open file handles.
with ZipFile(PENSE_DATA) as zip_pense:
    # The first entry listed in the archive is the one that gets loaded
    path_unzip = zip_pense.namelist()[0]
    with zip_pense.open(path_unzip) as csv_file:
        df = pd.read_csv(csv_file, sep=",")

In [2]:
# Report the overall size of the dataset (rows, columns)
print("O dataset PENSE possui {} linhas e {} colunas (features).".format(*df.shape))

# Count the rows that have a null value in at least one column
num_nan_rows = df.isna().any(axis=1).sum()

# "{:.2f}" limits the percentage to two decimal places; the previous "{:02f}"
# only set a minimum width of 2 and therefore printed six decimals.
print("O numero de linhas que possuem pelo menos um dados faltante é {}, {:.2f}% do total.".format(num_nan_rows, 100*num_nan_rows/df.shape[0]))

O dataset PENSE possui 165838 linhas e 306 colunas (features).
O numero de linhas que possuem pelo menos um dados faltante é 6593, 3.975567% do total.


Como a quantidade de dados faltantes é pequena, vamos remover estes registros da base. Note-se que estamos contando apenas os valores nulos (NaN); o questionário ainda pode conter respostas pouco conclusivas, normalmente atribuídas ao código "9" = "Sem resposta".

In [3]:
# Drop every row that holds at least one null value, then report the new shape
df = df.dropna(axis="index", how="any")
print(df.shape)

(159245, 306)


In [4]:
# Preview the first ten rows of the cleaned frame
df.head(n=10)

Unnamed: 0,REGIAO,UF,MUNICIPIO_CAP,TIPO_MUNIC,SITUACAO,DEP_ADMIN,ESFERA,ESCOLA,TURMA,ALUNO,...,E01P65A,E01P66A,E01P26A,E01P27A,ESTRATO,IND_EXPANSAO,PESO_ALUNO_FREQ,PESO_INICIAL,POSEST,TOTAIS_POSEST
0,1,11,0,2.0,1.0,2.0,4.0,4,1,21,...,2.0,2.0,4.0,2.0,1101,1,4.361431,3.409091,1100000PRI1,1767
1,1,11,0,2.0,1.0,2.0,4.0,23,1,9,...,1.0,2.0,4.0,2.0,1101,1,3.83806,3.0,1100000PRI1,1767
2,1,11,0,2.0,1.0,2.0,4.0,2,1,3,...,2.0,1.0,4.0,2.0,1101,1,2.934987,2.294118,1100000PRI1,1767
3,1,11,0,2.0,1.0,2.0,4.0,18,1,19,...,2.0,2.0,4.0,2.0,1101,1,4.786546,3.741379,1100000PRI1,1767
4,1,11,0,2.0,2.0,2.0,4.0,8,1,2,...,1.0,1.0,4.0,2.0,1101,1,4.535889,3.545455,1100000PRI1,1767
5,1,11,0,2.0,1.0,2.0,4.0,22,1,2,...,1.0,2.0,4.0,2.0,1101,1,2.771932,2.166667,1100000PRI1,1767
6,1,11,0,2.0,1.0,2.0,4.0,3,1,18,...,2.0,1.0,4.0,2.0,1101,1,7.164378,5.6,1100000PRI1,1767
7,1,11,0,2.0,1.0,2.0,4.0,17,1,7,...,2.0,2.0,4.0,2.0,1101,1,2.771932,2.166667,1100000PRI1,1767
8,1,11,0,2.0,1.0,2.0,4.0,2,1,9,...,2.0,1.0,4.0,2.0,1101,1,2.934987,2.294118,1100000PRI1,1767
9,1,11,0,2.0,1.0,2.0,4.0,17,1,3,...,2.0,2.0,4.0,2.0,1101,1,2.771932,2.166667,1100000PRI1,1767


Repare-se que temos colunas no formato float64 que deveriam ser inteiras. Podemos convertê-las, pois sabemos que não há mais possibilidade de NaN.

In [5]:
# Let pandas pick the best-fitting dtype for every column (whole-number floats become Int64)
df = df.convert_dtypes(convert_integer=True)
# Show the resulting dtypes, then preview the first ten rows
print(df.dtypes)
df.head(n=10)

REGIAO                      Int64
UF                          Int64
MUNICIPIO_CAP               Int64
TIPO_MUNIC                  Int64
SITUACAO                    Int64
                        ...      
IND_EXPANSAO                Int64
PESO_ALUNO_FREQ           Float64
PESO_INICIAL              Float64
POSEST             string[python]
TOTAIS_POSEST               Int64
Length: 306, dtype: object


Unnamed: 0,REGIAO,UF,MUNICIPIO_CAP,TIPO_MUNIC,SITUACAO,DEP_ADMIN,ESFERA,ESCOLA,TURMA,ALUNO,...,E01P65A,E01P66A,E01P26A,E01P27A,ESTRATO,IND_EXPANSAO,PESO_ALUNO_FREQ,PESO_INICIAL,POSEST,TOTAIS_POSEST
0,1,11,0,2,1,2,4,4,1,21,...,2,2,4,2,1101,1,4.361431,3.409091,1100000PRI1,1767
1,1,11,0,2,1,2,4,23,1,9,...,1,2,4,2,1101,1,3.83806,3.0,1100000PRI1,1767
2,1,11,0,2,1,2,4,2,1,3,...,2,1,4,2,1101,1,2.934987,2.294118,1100000PRI1,1767
3,1,11,0,2,1,2,4,18,1,19,...,2,2,4,2,1101,1,4.786546,3.741379,1100000PRI1,1767
4,1,11,0,2,2,2,4,8,1,2,...,1,1,4,2,1101,1,4.535889,3.545455,1100000PRI1,1767
5,1,11,0,2,1,2,4,22,1,2,...,1,2,4,2,1101,1,2.771932,2.166667,1100000PRI1,1767
6,1,11,0,2,1,2,4,3,1,18,...,2,1,4,2,1101,1,7.164378,5.6,1100000PRI1,1767
7,1,11,0,2,1,2,4,17,1,7,...,2,2,4,2,1101,1,2.771932,2.166667,1100000PRI1,1767
8,1,11,0,2,1,2,4,2,1,9,...,2,1,4,2,1101,1,2.934987,2.294118,1100000PRI1,1767
9,1,11,0,2,1,2,4,17,1,3,...,2,2,4,2,1101,1,2.771932,2.166667,1100000PRI1,1767


Vamos embaralhar estes dados e salvá-los em processed/PENSE_2019/all.zip.

In [6]:
# Shuffle all rows with a fixed seed so the saved ordering is reproducible,
# and renumber the index from zero afterwards
df_all = df.sample(frac=1, random_state=42).reset_index(drop=True)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there)
all_path = Path('../data/processed/PENSE_2019/all.zip')
all_path.parent.mkdir(parents=True, exist_ok=True)

# Write the shuffled frame as a single CSV member inside a zip archive
df_all.to_csv(all_path, index=False, compression=dict(method='zip', archive_name='data.csv', compresslevel=9))

Dividindo a base em treino e teste

In [7]:
# Split the shuffled frame into two equally sized parts; the fixed seed keeps
# the partition reproducible
df_train, df_test = train_test_split(df_all, test_size=0.5, random_state=42)

# Both parts are written with the same zip settings, so define them once
zip_options = dict(method='zip', archive_name='data.csv', compresslevel=9)
for split_path, split_frame in (
    ('../data/processed/PENSE_2019/train.zip', df_train),
    ('../data/processed/PENSE_2019/test.zip', df_test),
):
    split_frame.to_csv(split_path, index=False, compression=zip_options)