In [3]:
import pandas as pd
import numpy as np
import zipfile
import requests
from io import BytesIO
import os

In [None]:
# CONSTANTS
# Remote zip archive handed to download_data.
URL_DOWNLOAD = "http://download.inep.gov.br/microdados/microdados_enem_2019.zip"
# Local folder the archive is extracted into; the paths below live under it.
EXTRACT_FOLDER_PATH = "./enem2019"
# Raw CSV that is read chunk by chunk.
ENEM_RAW_CSV_PATH = EXTRACT_FOLDER_PATH + "/microdados_enem_2019/DADOS/MICRODADOS_ENEM_2019.csv"
# Destination of the filtered (MG only) CSV.
ENEM_MG_CSV_PATH = EXTRACT_FOLDER_PATH + "/enem_mg/enem_mg.csv"

In [2]:
# Make sure the extraction folder exists before anything is written into it
# (exist_ok=True keeps this cell safe to re-run).
os.makedirs(name=EXTRACT_FOLDER_PATH, exist_ok=True)

In [None]:
def download_data(url: str, folder_path: str):
    """Download a zip archive from ``url`` and extract it into ``folder_path``.

    The whole response body is held in memory before extraction, so the
    archive must fit in RAM.

    Parameters
    ----------
    url : str
        Address of the zip archive to fetch.
    folder_path : str
        Directory the archive members are extracted into.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    response = requests.get(url)
    # Fail on HTTP errors here; otherwise an HTML error page would be handed
    # to ZipFile and surface as a misleading BadZipFile.
    response.raise_for_status()

    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        archive.extractall(folder_path)

In [49]:
def extract_enem_mg(csv_path: str, sep: str, decimal: str, encoding: str, chunksize: int):
    """Read a CSV in chunks and keep only the rows whose birth state is MG.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    sep : str
        Field delimiter passed to ``pd.read_csv``.
    decimal : str
        Decimal separator passed to ``pd.read_csv``.
    encoding : str
        File encoding passed to ``pd.read_csv``.
    chunksize : int
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        Rows where ``SG_UF_NASCIMENTO == 'MG'``, keeping the row index they
        had in the source file.
    """
    # Passing chunksize makes read_csv return an iterator of DataFrames, so
    # the full file is never loaded at once.
    chunk_reader = pd.read_csv(
        csv_path,
        sep=sep,
        decimal=decimal,
        encoding=encoding,
        chunksize=chunksize,
    )
    mg_chunks = [chunk[chunk['SG_UF_NASCIMENTO'] == 'MG'] for chunk in chunk_reader]
    return pd.concat(mg_chunks)

In [51]:
def generate_enem_mg_csv(df, csv_path: str, sep: str, decimal: str, encoding: str):
    """Write ``df`` to ``csv_path`` as a CSV file, creating its folder if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; its index is written as the first column.
    csv_path : str
        Destination file path.
    sep : str
        Field delimiter passed to ``DataFrame.to_csv``.
    decimal : str
        Decimal separator passed to ``DataFrame.to_csv``.
    encoding : str
        File encoding passed to ``DataFrame.to_csv``.
    """
    # to_csv does not create missing folders, so make sure the destination
    # directory exists first. A bare file name has no directory part.
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(csv_path, sep=sep, decimal=decimal, encoding=encoding)

In [None]:
# Downloading enem 2019 data.
# Skipped when the raw CSV is already on disk, so re-running the notebook
# does not fetch and extract the archive again.
if not os.path.exists(ENEM_RAW_CSV_PATH):
    download_data(
        URL_DOWNLOAD,
        EXTRACT_FOLDER_PATH
    )

In [50]:
# Extracting data from csv divided by chunks.
# The result is bound to a name so the following cells can use it on a fresh kernel.
enem_mg = extract_enem_mg(ENEM_RAW_CSV_PATH, sep=";", decimal=",", encoding="ISO-8859-1", chunksize=1000)
enem_mg.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
336,190001004963,2019,1501402,Belém,15,PA,28,F,1,1,...,A,A,A,A,A,A,B,A,B,B
925,190001005552,2019,1505536,Parauapebas,15,PA,18,M,1,3,...,B,A,B,B,B,B,A,A,B,B
1044,190001005671,2019,3170206,Uberlândia,31,MG,21,M,1,1,...,B,A,A,D,A,A,D,B,B,B
1166,190001005793,2019,1506807,Santarém,15,PA,17,F,1,3,...,B,A,A,E,A,B,A,A,B,B
1281,190001005908,2019,1501402,Belém,15,PA,18,M,1,3,...,B,A,A,C,A,B,E,B,C,B


In [53]:
# Persist the MG-born subset using the same separator, decimal mark and
# encoding that were used to read the raw file.
generate_enem_mg_csv(
    df=enem_mg,
    csv_path=ENEM_MG_CSV_PATH,
    sep=";",
    decimal=",",
    encoding='ISO-8859-1',
)