# Selectividad Exams Parser

## Libraries import

In [165]:
import fitz
import pandas as pd
import numpy as np
from pathlib import Path
import json

## Files management

In [166]:
def find_files(directory, pattern):
    path = Path(directory)
    return path.rglob(pattern)

In [167]:
file_paths = sorted( [i for i in find_files(directory='./pdf_files', pattern='*.pdf') 
              if i.name.startswith('2')
              ] )

### Get a sample of the files
Check if the extraction of the pdf paths was successful

In [168]:
print(*np.random.choice(file_paths, 5, replace=False), sep = '\n')

pdf_files/QUÍMICA/Conf. Electrónica/2005 - Conf. Electrónica.pdf
pdf_files/QUÍMICA/Cantidad Química/2004 - Cantidad Química.pdf
pdf_files/QUÍMICA/REDOX/2020 - Reacciones Redox.pdf
pdf_files/QUÍMICA/Solubilidad/2021 - Solubilidad.pdf
pdf_files/QUÍMICA/Reactividad Orgánica/2016 - Reactividad Orgánica.pdf


## Parsing function

In [169]:
def extract_info(file_path):
    """
    Extract relevant information from a single PDF file.

    Parameters
    ----------
    file_path : str
        Path to the PDF file

    Returns
    -------
    dict
        A dictionary containing the extracted information. The keys are:
            - subject
            - topic
            - year
            - exam_name
            - exercise
            - statement
            - page
            - exam_details
    """

    # define keys for the content dictionary
    content = {
        'subject': [],
        'topic' : [],
        'year': [],
        'page': [],
        'exam_details': [],
        'exam_name': [],
        'exercise': [],
        'statement': [],
    }

    # open the pdf file to be parsed
    doc = fitz.open(file_path)

    # process individual pages
    for page in doc[1:]:
        
        # get content of the page
        text = page.get_text("text")
        
        ### clean the content of page
        # replacements
        replacements = {'www.emestrada.org' : '',
                        'R  E  S  O  L  U  C  I  Ó  N': 'Resolucion',
                        '\n \n \n' : ''}

        for subs, rep in replacements.items():
            text =  text.replace(subs, rep)
        
        # remove whitespaces and create list of rows
        text = text.strip().replace('  ', ' ').split('\n')
        ###

        # extract exam details
        exam_details = text[-1]

        ### parse the text of the page
        # extract text of the page
        text = text[:-1]

        # extract only exercise statement
        times_a_appeared = 0
        for index, row in enumerate(text):
            if 'a)' in row:
                times_a_appeared += 1
                # it's the second occurrence of a)
                if times_a_appeared == 2:
                    text = text[index:]
                    # remove any text before "a)"
                    text[0] = 'a)' + text[0].split('a)')[1]
                    break
        
        # remove relative masses part
        for index, row in enumerate(text):
            if 'dato' in row.lower() :
                if not row.lower().startswith('dato'):
                    dato_index = row.lower().find('dato')
                    text[index] = row[:dato_index]
                    text = text[:index+1]
                # row starts with "Dato:"
                else:
                    text = text[:index]
                break
        
        # remove the lines starting with just a number
        text = [i for i in text if len(i) > 1]
        
        # join the lines of the statement
        joined_text = ''
        for row in text:

            if ' b) ' in row:
                row = row.split('b)')
                joined_text += row[0] + '\n' + 'b) ' + row[1]
            elif row.startswith('b'):
                joined_text += '\n' + row
            else:
                joined_text += row

        # finally remove double whitespases
        statement = joined_text.replace('  ', ' ')
        ### end of parsing text

        ### extract: year, exam name and exercise

        # create empty values for exam details
        # that will be filled with proper values
        # if the exam details can be parsed
        subject, year, exam_name, exercise = ['-'] * 4

        # parse the exam details
        try:
            exam = exam_details.split(' ')
            # extract subject
            subject = exam[0].replace('.', '').title()
            # extract year
            year = exam[1].replace('.', '')
            # get exam
            if exam[2] == 'RESERVA':
                exam_name = exam[2].title() +  ' ' + ' '.join(exam[3]).replace('.', '').strip()
            else:
                exam_name = exam[2].title().replace('.', '').strip()
            # get exercise
            if exam[2] == 'RESERVA':
                exercise = ' '.join(exam[4:]).title().replace('.', '').strip()
            else:
                exercise = ' '.join(exam[3:]).title().replace('.', '').strip()
        
        # if there were errors, generate error message and log it
        except:
            error_message = f'Failed parsing exercise details: {file_path.name} - page {page.number + 1}'
            print(error_message)
            # add to error log
            with open('errors.txt', 'a') as f:
                f.write(error_message + '\n')      
        
        ### end of parsing exam details

        # update the content dict
        content['subject'].append(subject)
        content['topic'].append(file_path.stem.split(' - ')[-1])
        content['year'].append(year)
        content['exam_name'].append(exam_name)
        content['exercise'].append(exercise)
        content['statement'].append(statement)
        content['page'].append(page.number + 1)
        content['exam_details'].append(exam_details)
        
    return content

## Check exam string

Function to check exam details strings for exercises whose information couldn't be parsed correctly.

In [170]:
def check_exam_string(exam_details):
    page_dict = {}
    exam = exam_details.split(' ')

    # extract subject
    page_dict['subject'] = (exam[0]
                            .replace('.', '')
                            .title()
    )
    page_dict['year'] = exam[1].replace('.', '')
    # get exam
    if exam[2] == 'RESERVA':
        page_dict['exam'] = exam[2].title() +  ' ' + ' '.join(exam[3])
    else:
        page_dict['exam'] = exam[2].title()
    # get exercise
    if exam[2] == 'RESERVA':
        page_dict['exercise'] = ' '.join(exam[4:]).title()
    else:
        page_dict['exercise'] = ' '.join(exam[3:]).title()

    return page_dict

In [171]:
check_exam_string('QUIMICA. 2005. RESERVA 1. EJERCICIO 5. OPCIÓN A')

{'subject': 'Quimica',
 'year': '2005',
 'exam': 'Reserva 1 .',
 'exercise': 'Ejercicio 5. Opción A'}

## Extract information

In [172]:
exercises = {}

for file in file_paths:
    exam = file.stem
    exercises[exam] = extract_info(file)

Failed parsing exercise details: 2000 - Enlace Químico.pdf - page 3
Failed parsing exercise details: 2004 - Enlace Químico.pdf - page 6
Failed parsing exercise details: 2005 - Enlace Químico.pdf - page 4
Failed parsing exercise details: 2006 - Enlace Químico.pdf - page 2
Failed parsing exercise details: 2006 - Enlace Químico.pdf - page 3
Failed parsing exercise details: 2006 - Enlace Químico.pdf - page 4
Failed parsing exercise details: 2006 - Enlace Químico.pdf - page 6
Failed parsing exercise details: 2010 - Enlace Químico.pdf - page 2
Failed parsing exercise details: 2011 - Enlace Químico.pdf - page 4
Failed parsing exercise details: 2013 - Enlace Químico.pdf - page 4
Failed parsing exercise details: 2021 - Enlace Químico.pdf - page 4
Failed parsing exercise details: 2021 - Enlace Químico.pdf - page 5
Failed parsing exercise details: 2001 - Equilibrio Químico.pdf - page 10
Failed parsing exercise details: 2018 - Equilibrio Químico.pdf - page 6
Failed parsing exercise details: 2022 -

## Write to a JSON file


In [173]:
with open('exercises.json', 'w', encoding='utf-8') as f:
    json.dump(exercises, f, indent=4, ensure_ascii=False)

## Conversion to pandas DataFrame

### Get columns

In [174]:
first_exercise = exercises[ list( exercises.keys() )[0] ]
columns = list(first_exercise.keys())
columns

['subject',
 'topic',
 'year',
 'page',
 'exam_details',
 'exam_name',
 'exercise',
 'statement']

### Create dictionary
Dictionary keys: subject, topic, year, exam_name, exercise, statement, page, exam_details

In [175]:
# initialize the dictionary
content_dict = dict.fromkeys(columns)
for key in content_dict.keys():
    content_dict[key] = []

In [176]:
for exercise, content in exercises.items():
    for key, value in content.items():
        content_dict[key].extend(value)

### Create the DataFrame

In [177]:
exercises_df = pd.DataFrame(content_dict)
# replace '-' with NaN
exercises_df.replace('-', np.nan, inplace=True)

In [178]:
exercises_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subject       1464 non-null   object
 1   topic         1464 non-null   object
 2   year          1447 non-null   object
 3   page          1464 non-null   int64 
 4   exam_details  1464 non-null   object
 5   exam_name     1444 non-null   object
 6   exercise      1444 non-null   object
 7   statement     1464 non-null   object
dtypes: int64(1), object(7)
memory usage: 91.6+ KB


In [179]:
exercises_df.sample(10)

Unnamed: 0,subject,topic,year,page,exam_details,exam_name,exercise,statement
546,Química,Equilibrio Químico,2011,3,QUÍMICA. 2011. JUNIO. EJERCICIO 6. OPCIÓN B,Junio,Ejercicio 6 Opción B,a) La concentración de cada especie en el equi...
1226,Química,Ácido Base,2000,8,QUÍMICA. 2000. RESERVA 3. EJERCICIO 3. OPCIÓN B,Reserva 3,Ejercicio 3 Opción B,a) ¿Cuál es la concentración de iones OH?. \n...
323,Química,Enlace Químico,2002,2,QUÍMICA. 2002. RESERVA 1. EJERCICIO 5. OPCIÓN A,Reserva 1,Ejercicio 5 Opción A,a) Represente el ciclo de Born-Haber para el f...
894,Química,Reactividad Orgánica,2004,2,QUÍMICA. 2004. JUNIO. EJERCICIO 4. OPCIÓN B,Junio,Ejercicio 4 Opción B,a) Isomería de función. \nb) Isomería de posic...
1258,Química,Ácido Base,2003,11,QUÍMICA. 2003. SEPTIEMBRE. EJERCICIO 4. OPCIÓN B,Septiembre,Ejercicio 4 Opción B,a) Escriba sus reacciones de disociación en ag...
250,Química,Conf. Electrónica,2017,6,QUÍMICA. 2017. RESERVA 3. EJERCICIO 2. OPCIÓN B,Reserva 3,Ejercicio 2 Opción B,a) Escriba las configuraciones electrónicas de...
1122,Química,Termoquímica,2003,4,QUÍMICA. 2003. RESERVA 2. EJERCICIO 3. OPCIÓN B,Reserva 2,Ejercicio 3 Opción B,"a) Disolución de nitrato de potasio, KNO , en ..."
1113,Química,Termoquímica,2001,8,QUÍMICA. 2001. SEPTIEMBRE. EJERCICIO 5. OPCIÓN B,Septiembre,Ejercicio 5 Opción B,"a) ¿Cuál será el calor, a presión constante de..."
1088,Química,Solubilidad,2022,5,QUÍMICA. 2022. RESERVA 1. EJERCICIO C2,Reserva 1,Ejercicio C2,a) La solubilidad en mg/L del AgCl en agua. \n...
172,Química,Conf. Electrónica,2004,5,QUÍMICA. 2004. RESERVA 3. EJERCICIO 2. OPCIÓN A,Reserva 3,Ejercicio 2 Opción A,a) Escriba la configuración electrónica de cad...


In [188]:
exercises_df[~exercises_df.subject.isin(['Química', 'Quimica'])][['year', 'page']]

Unnamed: 0,year,page
313,F,3
338,,6
341,,4
343,,2
344,,3
345,,4
347,,6
363,F,2
370,,4
380,,4


## Export `exercises` to csv

In [181]:
for subject in exercises_df['subject'].unique():
    exercises_df[exercises_df['subject'] == subject].to_csv(f'exercises_{subject}.csv', index=False)