# Selectividad Exams Parser

## Libraries import

In [127]:
import fitz
import pandas as pd
import numpy as np
from pathlib import Path
import json

## Files management

In [141]:
def find_files(directory, pattern):
    path = Path(directory)
    return path.rglob(pattern)

In [142]:
file_paths = sorted( [i for i in find_files(directory='./pdf_files', pattern='.pdf') 
              if i.name.startswith('2')
              ] )

In [144]:
file_paths

[]

## Parsing function

In [106]:
def extract_info(file_path):
    """
    Extract relevant information from a single PDF file.

    Parameters
    ----------
    file_path : str
        Path to the PDF file

    Returns
    -------
    dict
        A dictionary containing the extracted information. The keys are:
            - subject
            - topic
            - year
            - exam_name
            - exercise
            - statement
            - page
            - exam_details
    """

    # define keys for the content dictionary
    content = {
        'subject': [],
        'topic' : [],
        'year': [],
        'page': [],
        'exam_details': [],
        'exam_name': [],
        'exercise': [],
        'statement': [],
    }

    # open the pdf file to be parsed
    doc = fitz.open(file_path)

    # process individual pages
    for page in doc[1:]:
        
        # get content of the page
        text = page.get_text("text")
        
        ### clean the content of page
        # replacements
        replacements = {'www.emestrada.org' : '',
                        'R  E  S  O  L  U  C  I  Ó  N': 'Resolucion',
                        '\n \n \n' : ''}

        for subs, rep in replacements.items():
            text =  text.replace(subs, rep)
        
        # remove whitespaces and create list of rows
        text = text.strip().replace('  ', ' ').split('\n')
        ###

        # extract exam details
        exam_details = text[-1]

        ### parse the text of the page
        # extract text of the page
        text = text[:-1]

        # extract only exercise statement
        times_a_appeared = 0
        for index, row in enumerate(text):
            if 'a)' in row:
                times_a_appeared += 1
                # it's the second occurrence of a)
                if times_a_appeared == 2:
                    text = text[index:]
                    # remove any text before "a)"
                    text[0] = 'a)' + text[0].split('a)')[1]
                    break
        
        # remove relative masses part
        for index, row in enumerate(text):
            if 'dato' in row.lower() :
                if not row.lower().startswith('dato'):
                    dato_index = row.lower().find('dato')
                    text[index] = row[:dato_index]
                    text = text[:index+1]
                # row starts with "Dato:"
                else:
                    text = text[:index]
                break
        
        # remove the lines starting with just a number
        text = [i for i in text if len(i) > 1]
        
        # join the lines of the statement
        joined_text = ''
        for row in text:

            if ' b) ' in row:
                row = row.split('b)')
                joined_text += row[0] + '\n' + 'b) ' + row[1]
            elif row.startswith('b'):
                joined_text += '\n' + row
            else:
                joined_text += row

        # finally remove double whitespases
        statement = joined_text.replace('  ', ' ')
        ### end of parsing text

        ### extract: year, exam name and exercise

        # create empty values for exam details
        # that will be filled with proper values
        # if the exam details can be parsed
        subject, year, exam_name, exercise = ['-'] * 4

        # parse the exam details
        try:
            exam = exam_details.split(' ')
            # extract subject
            subject = exam[0].replace('.', '').title()
            # extract year
            year = exam[1].replace('.', '')
            # get exam
            if exam[2] == 'RESERVA':
                exam_name = exam[2].title() +  ' ' + ' '.join(exam[3]).replace('.', '').strip()
            else:
                exam_name = exam[2].title().replace('.', '').strip()
            # get exercise
            if exam[2] == 'RESERVA':
                exercise = ' '.join(exam[4:]).title().replace('.', '').strip()
            else:
                exercise = ' '.join(exam[3:]).title().replace('.', '').strip()
        
        # if there were errors, generate error message and log it
        except:
            error_message = f'Failed parsing exercise details: {file_path.name} - page {page.number + 1}'
            print(error_message)
            # add to error log
            with open('errors.txt', 'a') as f:
                f.write(error_message + '\n')      
        
        ### end of parsing exam details

        # update the content dict
        content['subject'].append(subject)
        content['topic'].append(file_path.stem.split(' - ')[-1])
        content['year'].append(year)
        content['exam_name'].append(exam_name)
        content['exercise'].append(exercise)
        content['statement'].append(statement)
        content['page'].append(page.number + 1)
        content['exam_details'].append(exam_details)
        
    return content

## Check exam string

In [4]:
def check_exam_string(exam_details):
    page_dict = {}
    exam = exam_details.split(' ')

    # extract subject
    page_dict['subject'] = (exam[0]
                            .replace('.', '')
                            .title()
    )
    page_dict['year'] = exam[1].replace('.', '')
    # get exam
    if exam[2] == 'RESERVA':
        page_dict['exam'] = exam[2].title() +  ' ' + ' '.join(exam[3])
    else:
        page_dict['exam'] = exam[2].title()
    # get exercise
    if exam[2] == 'RESERVA':
        page_dict['exercise'] = ' '.join(exam[4:]).title()
    else:
        page_dict['exercise'] = ' '.join(exam[3:]).title()

    return page_dict

In [5]:
check_exam_string('QUIMICA. 2005. RESERVA 1. EJERCICIO 5. OPCIÓN A')

{'subject': 'Quimica',
 'year': '2005',
 'exam': 'Reserva 1 .',
 'exercise': 'Ejercicio 5. Opción A'}

## Extract information

In [133]:
exercises = {}

for file in file_paths:
    exam = file.stem
    exercises[exam] = extract_info(file)

In [136]:
exercises

{}

## Write to a JSON file


In [134]:
with open('exercises.json', 'w', encoding='utf-8') as f:
    json.dump(exercises, f, indent=4, ensure_ascii=False)

## Conversion to pandas DataFrame

### Get columns

In [135]:
first_exercise = exercises[ list( exercises.keys() )[0] ]
columns = list(first_exercise.keys())
columns

IndexError: list index out of range

### Create dictionary
Dictionary keys: subject, topic, year, exam_name, exercise, statement, page, exam_details

In [118]:
# initialize the dictionary
content_dict = dict.fromkeys(columns)
for key in content_dict.keys():
    content_dict[key] = []

In [119]:
for exercise, content in exercises.items():
    for key, value in content.items():
        content_dict[key].extend(value)

### Create the DataFrame

In [128]:
exercises_df = pd.DataFrame(content_dict)
# replace '-' with NaN
exercises_df.replace('-', np.nan, inplace=True)

In [129]:
exercises_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1840 entries, 0 to 1839
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subject       1840 non-null   object
 1   topic         1840 non-null   object
 2   year          1812 non-null   object
 3   page          1840 non-null   int64 
 4   exam_details  1840 non-null   object
 5   exam_name     1809 non-null   object
 6   exercise      1809 non-null   object
 7   statement     1840 non-null   object
dtypes: int64(1), object(7)
memory usage: 115.1+ KB


## Export `exercises` to csv

In [None]:
for subject in exercises_df['subject'].unique():
    exercises_df[exercises_df['subject'] == subject].to_csv(f'exercises_{subject}.csv', index=False)