# Selectividad Exams Parser

## Libraries import

In [3]:
# !sudo apt update
# !sudo apt install tesseract-ocr tesseract-ocr-spa poppler-utils

In [4]:
# !pip install pdf2image
# !pip install pytesseract

In [5]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path

## Functions

### Parse pdf pages into images

In [7]:
def parse_pages_ocr(file_path, lang='spa'):
    """Run OCR over every page of a PDF except the first one.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of the PDF file to read.
    lang : str, optional
        Language code handed to ``pytesseract.image_to_string``. Defaults to
        ``'spa'``, the value that used to be hard-coded.

    Returns
    -------
    list of str
        Recognised text of each page, in page order. The first page of the
        document is dropped, so element 0 corresponds to page 2 of the PDF.
    """
    # render all pages to images and discard the first one
    page_images = convert_from_path(file_path)[1:]
    return [pytesseract.image_to_string(image, lang=lang) for image in page_images]

### Extract statement text from page text

In [8]:
def extract_statement(page, header_lines=4):
    """Extract the exercise statement from the OCR text of one page.

    Parameters
    ----------
    page : str
        Text of a single page, with lines separated by ``'\\n'``.
    header_lines : int, optional
        Number of leading lines to skip before the statement starts.
        Defaults to 4, the value that used to be hard-coded.

    Returns
    -------
    str
        The stripped, non-blank lines found after the header and before the
        first line starting with ``'RESOLUCIÓN'``, joined with ``'\\n'``.
        An empty string is returned when no such lines exist.
    """
    statement_lines = []
    for raw_line in page.split('\n')[header_lines:]:
        # strip first: the line may carry stray leading/trailing whitespace,
        # which must neither hide the RESOLUCIÓN marker nor let a
        # whitespace-only line through as a blank entry
        line = raw_line.strip()
        if line.startswith('RESOLUCIÓN'):
            break
        if line:
            statement_lines.append(line)
    return '\n'.join(statement_lines)

### Extract exam details from statement

In [9]:
def extract_exam_details(statement):
    """Parse the exam metadata found on the last line of a statement.

    The last non-blank line is split into words; dots and commas are removed
    from every word. The words are then read positionally: subject, year,
    exam name (two words when it starts with ``'RESERVA'``, one otherwise)
    and the remaining words as the exercise label.

    Parameters
    ----------
    statement : str
        Statement text whose last non-blank line holds the exam details.

    Returns
    -------
    dict
        Keys ``'subject'``, ``'year'``, ``'exam'`` and ``'exercise'``. The
        year is an ``int``; the other values are title-cased strings.

    Raises
    ------
    IndexError
        If the statement has no non-blank line or the details line has fewer
        than three words.
    ValueError
        If the second word cannot be converted to an integer year.
    """
    # ignore whitespace-only lines so they are never taken as the details line
    lines = [line for line in statement.split('\n') if line.strip()]
    details_line = lines[-1]

    # split() without arguments collapses runs of whitespace, so a double
    # space does not shift the positional fields
    words = [word.replace('.', '').replace(',', '').strip() for word in details_line.split()]
    # a word made only of punctuation becomes empty once cleaned: drop it
    tokens = [word for word in words if word]

    exam_dict = {
        'subject': tokens[0].title(),
        'year': int(tokens[1]),
    }

    # the exam name takes two words when it starts with RESERVA
    if tokens[2].startswith('RESERVA'):
        exam_dict['exam'] = ' '.join(tokens[2:4]).title()
        exercise_start = 4
    else:
        exam_dict['exam'] = tokens[2].title()
        exercise_start = 3

    exam_dict['exercise'] = ' '.join(tokens[exercise_start:]).title()

    return exam_dict

## Process the pdf files

In [10]:
# registry of parsed files: `process` stores one entry per subject/year/topic key
exercises_dict = dict()

In [11]:
def process(file, dict_=None):
    """Parse one exam PDF and store its exercises in ``dict_``.

    Every page returned by ``parse_pages_ocr`` is turned into a statement and
    its exam details. When all pages parse, one entry is written to ``dict_``
    under the key ``'<subject> <year> <topic>'`` (subject and year come from
    the last page). If any page fails, the error is appended to
    ``errors.log``, reported on screen, and nothing is stored for this file.

    Parameters
    ----------
    file : pathlib.Path
        PDF file to parse. The topic is the part of the file stem after the
        last ``' - '``.
    dict_ : dict, optional
        Dictionary that receives the parsed content. When omitted, the
        module-level ``exercises_dict`` is used.

    Returns
    -------
    None
    """
    # resolve the default at call time: a default bound in the signature
    # would keep pointing at the dictionary that existed when the function
    # was defined, even after `exercises_dict` is re-created in a later cell
    if dict_ is None:
        dict_ = exercises_dict

    topic = file.stem.split(' - ')[-1]

    pages = parse_pages_ocr(file)
    if not pages:
        # nothing to parse; without this guard `details` below would be undefined
        print(f'No pages to parse in file: {file.stem}')
        return

    # lists to store the values
    subjects, years, exams, exercises, statements = [], [], [], [], []

    # numbering starts at 2, matching the page offset used in the error report
    for page_number, page in enumerate(pages, start=2):
        try:
            statement = extract_statement(page)
            details = extract_exam_details(statement)
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel is not recorded as a parsing error
            message = f'Error in {file.name} at page {page_number}'
            # save errors to log file
            with open('errors.log', 'a') as log_file:
                log_file.write(f'{message}.\n')
            print('\n' + '*' * 10)
            print(message)
            print('*' * 10 + '\n')
            return

        subjects.append(details['subject'])
        years.append(details['year'])
        exams.append(details['exam'])
        exercises.append(details['exercise'])
        statements.append(statement)

    print(f'Success parsing file: {file.stem}')

    # generate the key and content for the exercises dictionary
    key = details['subject'] + ' ' + str(details['year']) + ' ' + topic
    dict_[key] = {
        'subject': subjects,
        'year': years,
        'topic': [topic] * len(subjects),
        'exam': exams,
        'exercise': exercises,
        'statement': statements,
    }

## Save the parsed content into a dataframe

In [16]:
# column names of the final table
keys = ['subject', 'year', 'topic', 'exam', 'exercise', 'statement']
# one independent, initially empty list per column
content = {column: [] for column in keys}
content

{'subject': [],
 'year': [],
 'topic': [],
 'exam': [],
 'exercise': [],
 'statement': []}

In [14]:
# keep directories only: every entry is later iterated with Path.iterdir,
# which raises NotADirectoryError on a stray regular file
folders = sorted(entry for entry in Path('pdf_files').iterdir() if entry.is_dir())

In [17]:
# start from an empty registry before the files are processed
exercises_dict = dict()

In [None]:
# OCR every PDF of every folder; the parsed content accumulates in
# `exercises_dict`. The per-column lists in `content` are deliberately not
# filled here: extending them after each file re-appended every previously
# parsed file again and duplicated rows. The flattening cell further down
# does that once, after all files are processed.
for folder in folders:
    pdf_files = sorted(path for path in folder.iterdir() if path.suffix == '.pdf')
    for file in pdf_files:
        process(file, dict_=exercises_dict)

Success parsing file: 2000 - Cantidad Química
Success parsing file: 2001 - Cantidad Química
Success parsing file: 2002 - Cantidad Química
Success parsing file: 2003 - Cantidad Química
Success parsing file: 2004 - Cantidad Química
Success parsing file: 2005 - Cantidad Química
Success parsing file: 2006 - Cantidad Química
Success parsing file: 2007 - Cantidad Química
Success parsing file: 2008 - Cantidad Química
Success parsing file: 2009 - Cantidad Química
Success parsing file: 2010 - Cantidad Química
Success parsing file: 2011 - Cantidad Química
Success parsing file: 2012 - Cantidad Química
Success parsing file: 2013 - Cantidad Química
Success parsing file: 2014 - Cantidad Química
Success parsing file: 2015 - Cantidad Química
Success parsing file: 2000 - Conf. Electrónica
Success parsing file: 2001 - Conf. Electrónica
Success parsing file: 2002 - Conf. Electrónica
Success parsing file: 2003 - Conf. Electrónica
Success parsing file: 2004 - Conf. Electrónica
Success parsing file: 2005 - 

Extract values from the dictionary for each subject/year/topic file


In [None]:
# rebuild the column lists from scratch so that running this cell again, or
# running it after `content` was already filled, never duplicates rows
content = {column: [] for column in content}
# every entry of exercises_dict maps column names to lists of values
for parsed_file in exercises_dict.values():
    for column, values in parsed_file.items():
        content[column].extend(values)

Create the pandas dataframe

In [None]:
# one column per key of `content`, one row per parsed exercise
df = pd.DataFrame(data=content)

In [None]:
# (number of rows, number of columns) of the assembled dataframe
df.shape

In [None]:
# preview up to five random rows; capping at len(df) avoids the ValueError
# raised when sampling more rows than exist, and the fixed seed keeps the
# displayed rows stable across re-runs
df.sample(min(5, len(df)), random_state=0)

In [None]:
# any subject ending in 'mica' is rewritten to the canonical 'Química';
# every other value is kept unchanged
df['subject'] = df['subject'].map(
    lambda name: 'Química' if name.endswith('mica') else name
)

## Export `exercises` to csv

In [None]:
# write one CSV file per distinct subject, without the index column
subject_column = df['subject']
for subject_name in subject_column.unique():
    subject_rows = df[subject_column == subject_name]
    subject_rows.to_csv(f'exercises_{subject_name}.csv', index=False)