# Selectividad Exams Parser

## Libraries import

In [None]:
# !sudo apt update
# !sudo apt install tesseract-ocr tesseract-ocr-spa poppler-utils

In [None]:
# !pip install pdf2image
# !pip install pytesseract

In [None]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path

## Files management

In [None]:
def find_files(directory, pattern):
    """Recursively search ``directory`` for paths matching the glob ``pattern``.

    Returns the lazy iterator produced by ``Path.rglob``; results are
    neither materialised nor sorted here.
    """
    return Path(directory).rglob(pattern)

In [None]:
# Collect every PDF under ./pdf_files (recursively) whose file name starts
# with '2', sorted by path
file_paths = sorted(
    pdf_path
    for pdf_path in find_files(directory='./pdf_files', pattern='*.pdf')
    if pdf_path.name.startswith('2')
)

In [None]:
# number of PDF files selected for parsing
len(file_paths)

## Functions

### Convert PDF pages to images and OCR them into text

In [None]:
def parse_pages_ocr(file_path):
    """Run Spanish OCR on every page of a PDF except the first one.

    The PDF at ``file_path`` is converted with ``convert_from_path``, the
    first resulting image is dropped, and each remaining image is passed to
    ``pytesseract.image_to_string`` with ``lang='spa'``.

    Returns a list with one recognised-text string per processed page,
    in page order.
    """
    page_images = convert_from_path(file_path)
    return [
        pytesseract.image_to_string(image, lang='spa')
        for image in page_images[1:]
    ]

### Extract statement text from page text

In [None]:
def extract_statement(page):
    """Extract the exercise statement from the OCR text of one page.

    The first four lines of ``page`` are skipped. The remaining lines are
    collected, stripped of surrounding whitespace, until a line that starts
    with ``RESOLUCIÓN`` is reached. Blank and whitespace-only lines are
    dropped.

    Args:
        page: Text of a single page, with lines separated by ``'\\n'``.

    Returns:
        The statement lines joined with ``'\\n'``; an empty string when the
        page has no statement lines.
    """
    statement_lines = []
    # start from the 5th line (the first four are skipped)
    for raw_line in page.split('\n')[4:]:
        # strip BEFORE testing: a whitespace-only line (or a stray '\r')
        # would otherwise pass the emptiness check and be stored as a blank
        # line, and an indented RESOLUCIÓN marker would be missed
        line = raw_line.strip()
        # stop at RESOLUCIÓN
        if line.startswith('RESOLUCIÓN'):
            break
        if line:
            statement_lines.append(line)

    return '\n'.join(statement_lines)

### Extract exam details from statement

In [None]:
def extract_exam_details(statement):
    """Parse the exam details found on the last line of a statement.

    The last non-blank line is split on whitespace and dots are removed from
    every token. The tokens are then read positionally: subject, year, exam,
    and everything left over as the exercise. When the exam token starts
    with ``RESERVA`` the following token is treated as part of the exam name.

    Args:
        statement: Statement text with lines separated by ``'\\n'``.

    Returns:
        A dict with keys ``'subject'``, ``'year'`` (int), ``'exam'`` and
        ``'exercise'``; the string values are title-cased.

    Raises:
        IndexError: If the statement has no non-blank line or the details
            line has fewer than three tokens.
        ValueError: If the year token is not an integer.
    """
    # get last non-blank line, which holds the exam details
    lines = [line for line in statement.split('\n') if line.strip()]
    # split on any run of whitespace: split(' ') would yield empty tokens for
    # repeated spaces and shift every positional field
    tokens = [token.replace('.', '') for token in lines[-1].split()]
    # a token made only of dots is empty after the replacement; drop it
    tokens = [token for token in tokens if token]

    # exam dictionary
    exam_dict = {}
    exam_dict['subject'] = tokens[0].title()
    exam_dict['year'] = int(tokens[1])

    # 'RESERVA' exams take one extra token as part of the exam name
    exercise_start = 4 if tokens[2].startswith('RESERVA') else 3
    exam_dict['exam'] = ' '.join(tokens[2:exercise_start]).title()
    exam_dict['exercise'] = ' '.join(tokens[exercise_start:]).title()

    return exam_dict

## Process the pdf files

In [None]:
# Parse every selected PDF and store its exercises in exercises_dict, keyed by
# '<subject> <year> <topic>'. Each value is a dict of equal-length lists with
# one entry per parsed page.
exercises_dict = {}
for file in file_paths:
    # the topic is the part of the file name after the last ' - '
    topic = file.stem.split(' - ')[-1]

    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = parse_pages_ocr(file)

    # parse_pages_ocr drops the first PDF page, so the first parsed page is
    # page 2 of the document; initialised so the error handler can always
    # report a page number
    page_number = 2
    try:
        for page_number, page in enumerate(pages, start=2):
            statement = extract_statement(page)
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
    except Exception:
        # `except Exception` instead of a bare `except` so that
        # KeyboardInterrupt / SystemExit still stop the run
        # save errors to log file
        with open('errors.log', 'a') as f:
            string = f'Error in {file.name}: page {page_number}\n'
            f.write(string)
            print(string)

        continue

    if not subjects:
        # no page was parsed: there are no details to build a key from, and
        # falling through would reuse the details of a previous file
        with open('errors.log', 'a') as f:
            string = f'Error in {file.name}: no pages to parse\n'
            f.write(string)
            print(string)

        continue

    print(f'Success parsing file: {file.stem}')

    # generate the key (from the last parsed page) and content for the
    # exercises dictionary
    key = subjects[-1] + ' ' + str(years[-1]) + ' ' + topic
    exercises_dict[key] = {
        'subject' : subjects,
        'year' : years,
        'topic' : [topic] * len(subjects),
        'exam' : exams,
        'exercise' : exercises,
        'statement' : statements
    }

## Save the parsed content into a dataframe

Create a dictionary to store the information

In [None]:
# Build the accumulator: one empty list per column name, with the column
# names taken from the first entry stored in exercises_dict
first_processed_file = list(exercises_dict)[0]
content = {column: [] for column in exercises_dict[first_processed_file]}
content

Extract values from the dictionary for each subject/year/topic file


In [None]:
# every entry of exercises_dict is a dict of column name -> list of values;
# append each list to the matching column of the accumulator
for parsed_file in exercises_dict.values():
    for column, values in parsed_file.items():
        content[column].extend(values)

Create the pandas dataframe

In [None]:
# one column per key of content, one row per collected exercise
df = pd.DataFrame(content)

In [None]:
# (rows, columns) of the assembled dataframe
df.shape

In [None]:
# show 5 randomly chosen rows as a quick sanity check
df.sample(5)

In [None]:
# normalise subject names: any value ending in 'mica' becomes 'Química'
# (presumably to repair OCR misreads of the first letters; confirm)
df['subject'] = df['subject'].apply(
    lambda name: name if not name.endswith('mica') else 'Química'
)

## Export `exercises` to csv

In [None]:
# write one CSV file per distinct subject, without the index column
subject_names = df['subject'].unique()
for subject in subject_names:
    subject_rows = df[df['subject'] == subject]
    subject_rows.to_csv(f'exercises_{subject}.csv', index=False)