# Selectividad Exams Parser

## Libraries import

In [8]:
# !sudo apt update
# !sudo apt install tesseract-ocr tesseract-ocr-spa poppler-utils

In [9]:
# !pip install pdf2image
# !pip install pytesseract

In [10]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path

## Functions

### Parse PDF pages into text with OCR

In [11]:
def parse_pages_ocr(file_path, lang='spa', skip_pages=1):
    """Rasterise a PDF and OCR each page (after the skipped ones) into text.

    Parameters
    ----------
    file_path : str or Path
        PDF file handed to ``convert_from_path``.
    lang : str, default 'spa'
        Language code forwarded to ``pytesseract.image_to_string``.
    skip_pages : int, default 1
        Number of leading pages dropped before OCR. The default keeps the
        previous behaviour of always ignoring the first page.

    Returns
    -------
    list of str
        One OCR text string per remaining page, in page order.
    """
    images = convert_from_path(file_path)[skip_pages:]
    # the page index was never used, so a plain comprehension is enough
    return [pytesseract.image_to_string(image, lang=lang) for image in images]

### Extract statement text from page text

In [12]:
def extract_statement(page, header_lines=4, stop_marker='RESOLUCIÓN'):
    """Return the statement text contained in one OCR'd page.

    The first ``header_lines`` lines of the page are discarded. The remaining
    lines are stripped and collected until a line starting with
    ``stop_marker`` is found; blank lines are dropped.

    Parameters
    ----------
    page : str
        Text of a single page.
    header_lines : int, default 4
        Number of leading lines to skip.
    stop_marker : str, default 'RESOLUCIÓN'
        Prefix of the line at which extraction stops (that line is excluded).

    Returns
    -------
    str
        The kept lines joined with newlines ('' if nothing is kept).
    """
    kept = []
    for raw_line in page.split('\n')[header_lines:]:
        # strip BEFORE testing: a marker with leading whitespace must still
        # end the statement, and whitespace-only lines must be dropped
        # instead of being stored as empty strings
        line = raw_line.strip()
        if line.startswith(stop_marker):
            break
        if line:
            kept.append(line)
    return '\n'.join(kept)

### Extract exam details from statement

In [13]:
def extract_exam_details(statement):
    """Parse the exam metadata found on the last non-blank line of a statement.

    The line is tokenised on whitespace, '.' and ',' are removed from every
    token, and the tokens are read positionally:
    subject, year, exam (two tokens when it starts with 'RESERVA'), and the
    remaining tokens as the exercise.

    Parameters
    ----------
    statement : str
        Statement text whose last non-blank line holds the exam details.

    Returns
    -------
    dict
        Keys 'subject', 'exam', 'exercise' (title-cased str) and 'year' (int).

    Raises
    ------
    ValueError
        If there is no non-blank line, fewer than three tokens, or the year
        token is not an integer.
    """
    lines = [line for line in statement.split('\n') if line.strip()]
    if not lines:
        raise ValueError('statement is empty: no exam details line found')
    details_line = lines[-1]

    # split() with no argument collapses runs of whitespace; split(' ') would
    # yield empty tokens and shift every positional field
    tokens = [tok.replace('.', '').replace(',', '').strip() for tok in details_line.split()]
    # a token made only of punctuation becomes '' after the cleanup
    tokens = [tok for tok in tokens if tok]
    if len(tokens) < 3:
        raise ValueError(f'cannot parse exam details from line: {details_line!r}')

    exam_dict = {
        'subject': tokens[0].title(),
        'year': int(tokens[1]),
    }

    # "RESERVA <n>" takes two tokens, any other exam name takes one
    if tokens[2].startswith('RESERVA'):
        exam_dict['exam'] = ' '.join(tokens[2:4]).title()
        exercise_start = 4
    else:
        exam_dict['exam'] = tokens[2].title()
        exercise_start = 3

    exam_dict['exercise'] = ' '.join(tokens[exercise_start:]).title()

    return exam_dict

## Process the pdf files

In [14]:
# Accumulator filled by process(): one entry per parsed PDF, keyed by
# '<subject> <year> <topic>', each holding the column lists for that file.
exercises_dict = dict()

In [15]:
def process(file, dict_ = exercises_dict):
    """OCR one PDF and store its parsed exercises in ``dict_``.

    Every page returned by ``parse_pages_ocr`` is turned into a statement and
    its exam details. If any page fails, or the file yields no pages, the
    problem is appended to 'errors.log', reported on stdout, and nothing is
    stored for this file.

    Parameters
    ----------
    file : pathlib.Path
        PDF file; the topic is the text after the last ' - ' in its stem.
    dict_ : dict, default ``exercises_dict``
        Mapping updated in place. The new key is
        '<subject> <year> <topic>', built from the LAST page's details.

    Returns
    -------
    None
    """

    def _report_error(message):
        # save the error to the log file, then echo it on the console
        with open('errors.log', 'a') as f:
            f.write(message + '.\n')
        print('\n' + '*' * 10)
        print(message)
        print('*' * 10 + '\n')

    topic = file.stem.split(' - ')[-1]

    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = parse_pages_ocr(file)

    details = None
    page_number = None
    try:
        # numbering starts at 2: the first PDF page is not part of `pages`
        for page_number, page in enumerate(pages, start=2):
            statement = extract_statement(page)
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long OCR run
        _report_error(f'Error in {file.name} at page {page_number}')
        return

    if details is None:
        # no pages: there is nothing to build the key from
        _report_error(f'Error in {file.name}: no pages to parse')
        return

    print(f'Success parsing file: {file.stem}')

    # generate the key and content for the exercises dictionary
    key = details['subject'] + ' ' + str(details['year']) + ' ' + topic
    dict_[key] = {
        'subject' : subjects,
        'year' : years,
        'topic' : [topic] * len(subjects),
        'exam' : exams,
        'exercise' : exercises,
        'statement' : statements
    }

## Save the parsed content into a dataframe

In [24]:
def create_content_dict():
    """Build an empty column container for the exercises DataFrame.

    Returns
    -------
    dict
        One fresh, independent empty list per column name, in the order
        subject, year, topic, exam, exercise, statement.
    """
    columns = ('subject', 'year', 'topic', 'exam', 'exercise', 'statement')
    return {column: [] for column in columns}

In [17]:
# Sub-folders of 'pdf_files', in sorted order. Plain files at that level
# are skipped because each entry is later iterated as a directory.
folders = sorted(path for path in Path('pdf_files').iterdir() if path.is_dir())

In [None]:
# Parse every PDF of each folder and write one CSV per folder.
for folder in folders:
    if not folder.is_dir():
        # stray files next to the folders cannot be iterated
        continue
    files = sorted(path for path in folder.iterdir() if path.suffix == '.pdf')

    # collect this folder's files in their own dict so the CSV holds exactly
    # this folder's exercises and nothing parsed for earlier folders
    folder_exercises = {}
    for file in files:
        process(file, dict_=folder_exercises)
    # keep the notebook-level accumulator populated as before
    exercises_dict.update(folder_exercises)

    # create the container ONCE per folder; re-creating it for every entry
    # would keep only the last parsed file
    content = create_content_dict()
    for parsed in folder_exercises.values():
        for key, value in parsed.items():
            content[key].extend(value)

    if not content['statement']:
        # nothing parsed for this folder: do not write an empty CSV
        continue

    df = pd.DataFrame(content)
    # correct wrong subjects
    df['subject'] = df['subject'].apply(lambda x: 'Química' if x.endswith('mica') else x)
    Path('./csv').mkdir(parents=True, exist_ok=True)
    df.to_csv(f'./csv/exercises_{folder.stem}.csv', index=False)

Extract values from the dictionary for each subject/year/topic file


Create the pandas dataframe

In [30]:
# Per-folder CSVs only, in a deterministic order. 'all_exercises.csv' is
# written into this same directory by the next cell, so it is excluded;
# otherwise a re-run would merge the combined file back into itself.
csv_files = sorted(
    path for path in Path('./csv').iterdir()
    if path.suffix == '.csv' and path.name != 'all_exercises.csv'
)

In [None]:
# Read every per-folder CSV and concatenate them in a single call.
# The combined output file is skipped in case it is present in csv_files.
frames = [pd.read_csv(file) for file in csv_files if file.name != 'all_exercises.csv']
df = pd.concat(frames, axis=0, ignore_index=True)
# written once, after the concat, so it also exists when there is one CSV
df.to_csv('./csv/all_exercises.csv', index=False)
df

Unnamed: 0,subject,year,topic,exam,exercise,statement
0,Química,2000,Ácido Base,Junio,Ejercicio 4 Opción A,Complete los siguientes equilibrios ácido-base...
1,Química,2000,Ácido Base,Junio,Ejercicio 6 Opción B,"a) Calcule los gramos del ácido acético CH,COO..."
2,Química,2000,Ácido Base,Reserva 1,Ejercicio 3 Opción A,La fenolftaleína es un indicador ácido-base qu...
3,Química,2000,Ácido Base,Reserva 1,Ejercicio 6 Opción B,"A 15 g de ácido acético (CH,COOH) se añade la ..."
4,Química,2000,Ácido Base,Reserva 2,Ejercicio 3 Opción A,a) Escriba el equilibrio de hidrólisis del ión...
...,...,...,...,...,...,...
133448,Química,2023,Solubilidad,Reserva 3,Ejercicio B5,Razone si las siguientes afirmaciones son verd...
133449,Química,2023,Solubilidad,Reserva 3,Ejercicio C2,"A una temperatura determinada, el producto de ..."
133450,Química,2023,Solubilidad,Reserva 4,Ejercicio C2,"A una temperatura determinada, la solubilidad ..."
133451,Química,2023,Solubilidad,Julio,Ejercicio C2,Basándose en las reacciones químicas correspon...


In [26]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

In [27]:
# Write the frame to 'all_exercises.csv' in the working directory, without the index column.
df.to_csv(path_or_buf='all_exercises.csv', index=False)