# Selectividad Exams Parser

## Libraries import

In [8]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os

## Files management

In [9]:
def find_files(directory, pattern):
    """Recursively search `directory` for entries whose name matches `pattern`.

    Returns the lazy iterator produced by `Path.rglob`; callers that need a
    stable order should sort the results themselves.
    """
    return Path(directory).rglob(pattern)

In [21]:
# every PDF under ./pdf_files whose file name starts with '2', sorted by path
file_paths = sorted(
    pdf
    for pdf in find_files(directory='./pdf_files', pattern='*.pdf')
    if pdf.name.startswith('2')
)

In [22]:
# number of PDF files selected for parsing
len(file_paths)

227

## Functions

### Convert pdf pages into images and OCR them into text

In [23]:
def parse_pages_ocr(file_path):
    """OCR every page of a PDF except the first and return the page texts.

    The PDF is rendered to images with `convert_from_path`; the first image
    is dropped and each remaining one is passed to Tesseract with the
    Spanish ('spa') language data. The result is a list with one string
    per processed page.
    """
    # the first rendered page is skipped
    page_images = convert_from_path(file_path)[1:]
    return [pytesseract.image_to_string(image, lang='spa') for image in page_images]

### Extract statement text from page text

In [24]:
def extract_statement(page):
    """Return the exercise statement contained in the OCR text of one page.

    The first four lines of the page are skipped (presumably a page header;
    confirm against the PDFs). The following lines are collected, stripped
    of surrounding whitespace, until a line starting with ``RESOLUCIÓN`` is
    found. Blank and whitespace-only lines are dropped, so the last line of
    the result is always the last line with content.

    Args:
        page: text of a single page, lines separated by ``'\\n'``.

    Returns:
        The statement lines joined with ``'\\n'``; ``''`` when nothing is left.
    """
    statement_lines = []
    # index 4 onwards, i.e. starting at the 5th line
    for raw_line in page.split('\n')[4:]:
        # strip first so an indented marker or a whitespace-only line is recognised
        line = raw_line.strip()
        # the resolution section starts here; OCR may lose the accent on the 'Ó'
        if line.startswith(('RESOLUCIÓN', 'RESOLUCION')):
            break
        if line:
            statement_lines.append(line)

    return '\n'.join(statement_lines)

### Extract exam details from statement

In [25]:
def extract_exam_details(statement):
    """Parse the exam metadata found on the last line of a statement.

    The last non-blank line is expected to look like
    ``QUÍMICA. 2000. RESERVA 4. EJERCICIO 6. OPCIÓN B``: subject, year,
    exam name (two tokens when it starts with ``RESERVA``) and then the
    exercise label.

    Args:
        statement: statement text, lines separated by ``'\\n'``.

    Returns:
        dict with keys ``subject``, ``year`` (int), ``exam`` and
        ``exercise``; the text values are title-cased.

    Raises:
        IndexError: if the statement is blank or the line has too few tokens.
        ValueError: if the second token is not an integer year.
    """
    # use the last line that has content; a trailing blank line would hide it
    content_lines = [line for line in statement.split('\n') if line.strip()]
    details_line = content_lines[-1]

    # split on any run of whitespace (repeated spaces would otherwise produce
    # empty tokens and shift every index), remove the dots that end each field
    # and discard tokens that consisted only of punctuation
    tokens = [token.replace('.', '') for token in details_line.split()]
    tokens = [token for token in tokens if token]

    exam_dict = {
        'subject': tokens[0].title(),
        'year': int(tokens[1]),
    }

    # 'RESERVA' exams carry a number, so the exam name spans two tokens
    if tokens[2].startswith('RESERVA'):
        exam_dict['exam'] = ' '.join(tokens[2:4]).title()
        exercise_start = 4
    else:
        exam_dict['exam'] = tokens[2].title()
        exercise_start = 3

    # everything after the exam name is the exercise label
    exam_dict['exercise'] = ' '.join(tokens[exercise_start:]).title()

    return exam_dict

## Process the pdf files

In [None]:
# parsed results, keyed by '<subject> <year> <topic>'; one entry per parsed file
exercises_dict = {}
for file in file_paths:
    # the topic is the part of the file name after ' - '
    # (e.g. '2000 - Cantidad Química' -> 'Cantidad Química')
    topic = file.stem.split(' - ')[-1]

    # parallel lists, one item per parsed page
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = parse_pages_ocr(file)

    # without pages there are no exam details to build the key from, and
    # `details` would be undefined or left over from the previous file
    if not pages:
        print(f'Error in {file.name}: no pages to parse')
        continue

    try:
        for index, page in enumerate(pages):
            statement = extract_statement(page)
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
    # catch Exception rather than using a bare except, so that interrupting
    # the (long) OCR run from the keyboard still works
    except Exception as error:
        # a single unparsable page discards the whole file; report the cause
        print(f'Error in {file.name}: page {index+1} ({type(error).__name__}: {error})')
        continue

    print(f'Success parsing file: {file.stem}')

    # generate the key and content for the exercises dictionary;
    # subject and year are taken from the last page of the file
    key = details['subject'] + ' ' + str(details['year']) + ' ' + topic
    exercises_dict[key] = {
        'subject' : subjects,
        'year' : years,
        'topic' : [topic] * len(subjects),
        'exam' : exams,
        'exercise' : exercises,
        'statement' : statements
    }

Success parsing file: 2000 - Cantidad Química
Success parsing file: 2001 - Cantidad Química
Success parsing file: 2002 - Cantidad Química
Success parsing file: 2003 - Cantidad Química
Success parsing file: 2004 - Cantidad Química
Success parsing file: 2005 - Cantidad Química
Success parsing file: 2006 - Cantidad Química
Success parsing file: 2007 - Cantidad Química
Success parsing file: 2008 - Cantidad Química
Success parsing file: 2009 - Cantidad Química
Success parsing file: 2010 - Cantidad Química
Success parsing file: 2011 - Cantidad Química
Success parsing file: 2012 - Cantidad Química
Success parsing file: 2013 - Cantidad Química
Success parsing file: 2014 - Cantidad Química
Success parsing file: 2015 - Cantidad Química
Success parsing file: 2000 - Conf. Electrónica
Success parsing file: 2001 - Conf. Electrónica
Success parsing file: 2002 - Conf. Electrónica
Success parsing file: 2003 - Conf. Electrónica
Success parsing file: 2004 - Conf. Electrónica
Success parsing file: 2005 - 

In [15]:
# keys under which the parsed files were stored ('<subject> <year> <topic>')
exercises_dict.keys()

dict_keys(['Química 2000 Redox', 'Química 2001 Redox'])

## Save the parsed content into a dataframe

Create a dictionary to store the information

In [16]:
# the first processed file provides the column names
first_processed_file = list(exercises_dict)[0]
# start every column with its own empty list
content = {}
for key in exercises_dict[first_processed_file]:
    content[key] = []
content

{'subject': [],
 'year': [],
 'topic': [],
 'exam': [],
 'exercise': [],
 'statement': []}

Extract values from the dictionary for each subject/year/topic file


In [17]:
# every entry of exercises_dict maps column names to lists of values;
# append those lists to the matching columns of `content`
for file_columns in exercises_dict.values():
    for column, values in file_columns.items():
        content[column] += values

Create the pandas dataframe

In [18]:
# build the dataframe; each key of `content` becomes a column
df = pd.DataFrame(content)

In [19]:
# show 5 randomly chosen rows to inspect the parsed data
df.sample(5)

Unnamed: 0,subject,year,topic,exam,exercise,statement
13,Química,2001,Redox,RESERVA 4,Ejercicio 3 Opción A,"Se construye una pila, en condiciones estándar..."
5,Química,2000,Redox,RESERVA 4,Ejercicio 6 Opción B,"Al hacer la electrolisis del cloruro de sodio,..."
14,Química,2001,Redox,RESERVA 4,Ejercicio 6 Opción B,"En medio ácido sulfúrico, el permanganato de p..."
11,Química,2001,Redox,RESERVA 3,Ejercicio 3 Opción A,Dadas las siguientes reacciones (sin ajustar):...
16,Química,2001,Redox,Septiembre,Ejercicio 3 Opción B,"Sabiendo que:\nZa(s) | Zn”(1M) || H*(1M) | H,(..."


## Extract information

## Write to a JSON file


In [134]:
# dump the complete parsed results (one entry per parsed file); the bare
# `exercises` list only holds the exercise labels of the last file in the loop
with open('exercises.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes accented characters as-is instead of \u escapes
    json.dump(exercises_dict, f, indent=4, ensure_ascii=False)

## Export the dataframe to csv (one file per subject)

In [None]:
# write one csv per distinct subject, without the index column
for subject in df['subject'].unique():
    subject_rows = df.loc[df['subject'] == subject]
    subject_rows.to_csv(f'exercises_{subject}.csv', index=False)