## 1.2 Extração do Texto das Provas do ENEM 2015 - 2020

* **Input**: arquivos PDFs das provas dos respectivos anos + Arquivos CSV com microdados dos respectivos anos

* **Output**: CSV com a extração dos dados mantendo as colunas originais

In [1]:

import pandas as pd
import numpy as np
import re
import pymupdf

from PyPDF2 import PdfReader

# Funções Utilizadas

In [2]:
# função que realiza a leitura dos microdados

def reading_data(path_to_data: str, year: str) -> pd.DataFrame:
    """Load the item microdata CSV and keep the rows of a single exam booklet.

    The file is read as a ';'-separated, latin-1 encoded CSV. Rows are kept
    when ``SG_AREA`` equals ``'CH'`` and ``TX_COR`` equals ``'azul'``
    (case-insensitive); among those, only the rows carrying the smallest
    ``CO_PROVA`` value are retained.

    Args:
        path_to_data: Path of the microdata CSV file.
        year: Value stored in the ``ANO`` column of every returned row.

    Returns:
        A new DataFrame with the columns ``CO_POSICAO``, ``TX_GABARITO``,
        ``NU_PARAM_A``, ``NU_PARAM_B``, ``NU_PARAM_C`` and ``ANO``. The
        original row index of the CSV is preserved.
    """
    kept_columns = ['CO_POSICAO', 'TX_GABARITO', 'NU_PARAM_A', 'NU_PARAM_B', 'NU_PARAM_C']

    raw = pd.read_csv(path_to_data, sep=';', encoding='latin-1')

    area_is_ch = raw['SG_AREA'] == 'CH'
    color_is_azul = raw['TX_COR'].str.lower() == 'azul'
    candidates = raw.loc[area_is_ch & color_is_azul]

    # Several exam codes can survive the filter above; keep only the lowest one.
    lowest_code = candidates['CO_PROVA'].min()
    items = candidates.loc[candidates['CO_PROVA'] == lowest_code, kept_columns].copy()

    items['ANO'] = year
    return items

NameError: name 'pd' is not defined

In [None]:
# Função que recebe o caminho do pdf, a página de início e a de fim e retorna as questões


def extract_questions(full_text: str, caps_lock: bool = False) -> list[str]:
    """Split a block of text into the bodies of the questions it contains.

    The text is cut at every occurrence of the question marker. A piece is
    kept only when its first two characters are numeric, and its first three
    characters (presumably the two-digit question number plus one separator)
    are dropped from the returned body.

    Args:
        full_text: Text to split.
        caps_lock: When true the marker is ``"QUESTÃO "``; otherwise
            ``"Questão "``.

    Returns:
        The question bodies, in the order they appear in ``full_text``.
    """
    if caps_lock:
        marker = "QUESTÃO "
    else:
        marker = "Questão "

    bodies = []
    for piece in full_text.split(marker):
        # Pieces that do not start with a number are not questions
        # (e.g. whatever text precedes the first marker).
        if not piece[:2].isnumeric():
            continue
        bodies.append(piece[3:])
    return bodies


def get_questions(pdf_path: str, start_page: int, end_page: int, caps_lock: bool = False) -> list[str]:
    """Collect the question bodies found on a range of pages of a PDF.

    Each page is processed on its own: its text is extracted and handed to
    ``extract_questions``, and the per-page results are concatenated.

    Args:
        pdf_path: Path of the PDF file.
        start_page: Index of the first page to read (used directly as an
            index into ``reader.pages``).
        end_page: Index at which reading stops; this page is NOT read.
        caps_lock: Forwarded to ``extract_questions`` to select the marker.

    Returns:
        All question bodies from the selected pages, in page order.
    """
    pdf = PdfReader(pdf_path)

    # A page without extractable text contributes an empty string.
    page_texts = (
        pdf.pages[index].extract_text() or ""
        for index in range(start_page, end_page)
    )

    return [
        body
        for text in page_texts
        for body in extract_questions(text, caps_lock)
    ]

In [None]:
# Função que retorna as questões sem as alternativas e as alternativas formatadas

def format_alt(alt: str) -> str:
    """Format a run of five alternatives (A to E) as ``"A: ...; B: ...; ..."``.

    The input is searched for five consecutive chunks, each starting with
    its letter followed by a whitespace character. For every chunk the
    letter is kept as the label, and the text is the chunk without its first
    two characters (letter and separator) and without its last character.

    Args:
        alt: Text containing the alternatives; ``.`` in the pattern does not
            match newlines, so callers are expected to strip them first.

    Returns:
        The five ``"<letter>: <text>"`` entries joined by ``"; "``, or an
        empty string when the five-alternative pattern is not found.
    """
    found = re.search(r"(A\s.+?)(B\s.+?)(C\s.+?)(D\s.+?)(E\s.+)", alt)
    if found is None:
        return ''

    entries = []
    for chunk in found.groups():
        letter = chunk[0]
        text = chunk[2:-1]
        entries.append(f"{letter}: {text}")
    return '; '.join(entries)

def get_alternatives(questions: list[str]) -> tuple[list[str], list[str]]:
    """Separate every raw question into its statement and its alternatives.

    A question is split at lines that start with ``A`` followed by a
    whitespace character. Everything from the LAST such line onwards is
    treated as the alternatives block; everything before it is the
    statement. Newlines are removed from both parts. The alternatives block
    is additionally cleaned (see the inline comments) and formatted with
    ``format_alt``.

    When no ``A`` line is found the whole question is kept as the statement
    instead of being replaced by an empty string; the alternatives are still
    looked for in the whole text.

    Args:
        questions: Raw question bodies, e.g. as returned by ``get_questions``.

    Returns:
        A tuple ``(statements, alternatives)`` of two lists with one entry
        per input question, in the same order. An alternatives entry is
        ``''`` when ``format_alt`` cannot find the five alternatives.
    """
    formatted_questions = []
    formatted_alternatives = []

    for question in questions:
        # The capturing group makes re.split keep the matched "A" line, so a
        # successful split yields [statement, a_line, rest, ...].
        parts = re.split(r"(\nA\s.*\n)", question)

        if len(parts) > 1:
            statement_parts = parts[:-2]
            alternative_parts = parts[-2:]
        else:
            # No "A" line: slicing with [:-2] would yield an empty statement
            # and silently drop the question text, so keep it whole.
            statement_parts = parts
            alternative_parts = parts

        alternatives_text = ''.join(alternative_parts).replace('\n', '')
        # Drop everything between the first and the last asterisk.
        alternatives_text = re.sub(r"\*.*\*", "", alternatives_text)
        # Drop everything from the first "CH" up to the last digit
        # (presumably page header/footer residue - TODO confirm).
        alternatives_text = re.sub(r"(CH).*\d", "", alternatives_text)
        alternatives = format_alt(alternatives_text)

        question_text = ''.join(statement_parts).replace('\n', '')

        formatted_questions.append(question_text)
        formatted_alternatives.append(alternatives)

    return formatted_questions, formatted_alternatives

In [None]:
# Função auxiliar para criar o dataset com as novas colunas

def merge_quests_alts(dataset: pd.DataFrame, questions: list[str], alts: list[str]) -> pd.DataFrame:
    """Attach the question texts and alternatives to ``dataset`` as new columns.

    The DataFrame is modified in place: ``questions`` is stored in the
    ``QUESTOES`` column and ``alts`` in the ``ALTERNATIVAS`` column.

    Args:
        dataset: DataFrame that receives the two columns.
        questions: Values for the ``QUESTOES`` column.
        alts: Values for the ``ALTERNATIVAS`` column.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    new_columns = (
        ('QUESTOES', questions),
        ('ALTERNATIVAS', alts),
    )
    for column_name, values in new_columns:
        dataset[column_name] = values

    return dataset

# Leitura dos Microdados

In [None]:
# Load the filtered 2015 microdata rows; the second argument is stored in the ANO column.
df_2015 = reading_data('../data/microdados/microdados_2015.csv', '2015')

In [None]:
# Load the filtered 2016 microdata rows. The file name must match the year
# stored in the ANO column, otherwise 2015 items end up labelled as 2016.
df_2016 = reading_data('../data/microdados/microdados_2016.csv', '2016')

In [None]:
# Load the filtered 2017 microdata rows. The file name must match the year
# stored in the ANO column, otherwise 2015 items end up labelled as 2017.
df_2017 = reading_data('../data/microdados/microdados_2017.csv', '2017')

# Leitura das Provas

In [None]:
# Path of the ENEM 2015 exam PDF
prova_2015 = '../data/provas/ENEM_2015.pdf'

# Path of the ENEM 2016 exam PDF
prova_2016 = '../data/provas/ENEM_2016.pdf'

# Path of the ENEM 2017 exam PDF
prova_2017 = '../data/provas/ENEM_2017.pdf'

In [None]:
# Extract the 2015 questions from page indices 1 to 14 (the end index 15 is
# exclusive), splitting on the upper-case "QUESTÃO " marker.
questoes_2015 = get_questions(prova_2015, 1, 15, True)
# Number of question bodies found.
len(questoes_2015)

In [None]:
# Extract the 2016 questions from page indices 1 to 14 (the end index 15 is
# exclusive), splitting on the upper-case "QUESTÃO " marker.
questoes_2016 = get_questions(prova_2016, 1, 15, True)
# Number of question bodies found.
len(questoes_2016)

In [None]:
# Extract the 2017 questions from page indices 19 to 31 (the end index 32 is
# exclusive), splitting on the upper-case "QUESTÃO " marker.
questoes_2017 = get_questions(prova_2017, 19, 32, True)
# Number of question bodies found.
len(questoes_2017)

## Alternativas


**2015**

In [None]:
# Replace the raw 2015 question bodies with their statements and collect the
# formatted alternatives.
questoes_2015, alts_2015 = get_alternatives(questoes_2015)
# Preview the first five formatted alternative strings.
alts_2015[:5]

**2016**

In [None]:
# Replace the raw 2016 question bodies with their statements and collect the
# formatted alternatives.
questoes_2016, alts_2016 = get_alternatives(questoes_2016)
# Preview the first five formatted alternative strings.
alts_2016[:5]

**2017**

In [None]:
# Replace the raw 2017 question bodies with their statements and collect the
# formatted alternatives.
questoes_2017, alts_2017 = get_alternatives(questoes_2017)
# Preview the first five formatted alternative strings.
alts_2017[:5]

# Montando o dataset final

In [None]:
# Attach questions and alternatives to each year's microdata, then stack the
# three yearly frames row-wise (row indices are kept as they are).
df_final = pd.concat(
    [
        merge_quests_alts(frame, statements, alternatives)
        for frame, statements, alternatives in (
            (df_2015, questoes_2015, alts_2015),
            (df_2016, questoes_2016, alts_2016),
            (df_2017, questoes_2017, alts_2017),
        )
    ],
    axis=0,
)

# Write the combined dataset to a CSV file in the working directory.
df_final.to_csv('df_15-17.csv')