# Selectividad Exams Parser

## Libraries import

### Required packages for OCR processing:
**Ubuntu based:**
- tesseract-ocr
- tesseract-ocr-spa
- poppler-utils

**Archlinux based:**
- tesseract
- tesseract-data-spa

In [None]:
# !pip install pdf2image
# !pip install pytesseract

In [1]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import os
import requests

## Functions

### Telegram bot

In [2]:
# Replace with your bot's token and chat ID
from bot_credentials import *

def send_telegram_message(message, timeout=10):
    """Send a text message through the Telegram Bot API ``sendMessage`` method.

    Relies on the module-level names ``BOT_TOKEN`` and ``CHAT_ID``
    (expected to be provided by the ``bot_credentials`` star import).

    Args:
        message: Text of the message to send.
        timeout: Seconds to wait for the HTTP request. ``requests`` applies
            no timeout by default, so without it a stalled connection would
            block the whole parsing run indefinitely.

    Returns:
        The decoded JSON body of the HTTP response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    params = {"chat_id": CHAT_ID, "text": message}
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

### Convert PDF pages to images and OCR them into text

In [3]:
def pdf_to_text(file_path):
    """Run OCR over a PDF and return the text of every page but the first.

    The PDF is rendered to images with ``convert_from_path``; the first
    rendered page is discarded and each remaining image is passed to
    Tesseract with the Spanish language model.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list with one OCR text string per remaining page, in page order.
    """
    # drop the first page before running OCR
    rendered_pages = convert_from_path(file_path)[1:]
    return [
        pytesseract.image_to_string(image, lang='spa')
        for image in rendered_pages
    ]

### Extract statement text from page text

In [4]:
def extract_statement(page):
    """Extract the exercise statement from the OCR text of one page.

    The first four lines of the page are skipped. The remaining lines are
    collected, stripped of surrounding whitespace, until a line starting
    with ``RESOLUCIÓN`` is found; that line and everything after it are
    dropped. Blank and whitespace-only lines are discarded.

    Args:
        page: OCR text of a single page.

    Returns:
        The kept lines joined with newlines; an empty string when nothing
        is left (for example when the page has four lines or fewer).
    """
    kept_lines = []
    # page.split('\n')[4:] starts at the fifth line of the page
    for raw_line in page.split('\n')[4:]:
        # strip first so that OCR indentation or a trailing '\r' can neither
        # hide the RESOLUCIÓN marker nor leave empty entries in the result
        line = raw_line.strip()
        if line.startswith('RESOLUCIÓN'):
            break
        if line:
            kept_lines.append(line)

    return '\n'.join(kept_lines)

### Extract exam details from statement

In [5]:
def extract_exam_details(statement):
    """Parse the exam metadata found on the last line of a statement.

    The last non-blank line is split on whitespace, dots and commas are
    removed from every token, and the tokens are read positionally:
    subject, year, exam (two tokens when it starts with ``RESERVA``,
    otherwise one), and the rest as the exercise label.

    Args:
        statement: Statement text whose last non-blank line holds the
            exam details.

    Returns:
        A dict with the keys ``subject``, ``year`` (int), ``exam`` and
        ``exercise``; the string values are title-cased.

    Raises:
        ValueError: If the statement has no non-blank line, the details
            line has fewer than three tokens, or the year is not an integer.
    """
    # ignore blank and whitespace-only lines so a trailing OCR artefact is
    # not mistaken for the details line
    lines = [line for line in statement.split('\n') if line.strip()]
    if not lines:
        raise ValueError('statement is empty: no exam details line found')

    # split() collapses runs of whitespace; tokens left empty once the
    # punctuation is removed are dropped so the positions below stay aligned
    tokens = [token.replace('.', '').replace(',', '').strip()
              for token in lines[-1].split()]
    tokens = [token for token in tokens if token]
    if len(tokens) < 3:
        raise ValueError(f'incomplete exam details line: {lines[-1]!r}')

    exam_dict = {
        'subject': tokens[0].title(),
        'year': int(tokens[1]),
    }

    # "RESERVA <n>" takes two tokens, any other exam name takes one
    if tokens[2].startswith('RESERVA'):
        exam_dict['exam'] = ' '.join(tokens[2:4]).title()
        exercise_start = 4
    else:
        exam_dict['exam'] = tokens[2].title()
        exercise_start = 3

    exam_dict['exercise'] = ' '.join(tokens[exercise_start:]).title()

    return exam_dict

In [6]:
import time
def what_time():
    """Return the current local time and a printable version of it.

    Returns:
        A tuple ``(now, text)`` where ``now`` is the ``time.struct_time``
        from ``time.localtime()`` and ``text`` is that same moment
        formatted as ``"HH:MM:SS on YYYY-MM-DD "`` (with a trailing space).
    """
    current = time.localtime()
    return current, time.strftime("%H:%M:%S on %Y-%m-%d ", current)

def elapsed_time_minutes(start, end):
    """Return the number of minutes between two local-time structs.

    Args:
        start: Earlier moment, as accepted by ``time.mktime``.
        end: Later moment, as accepted by ``time.mktime``.

    Returns:
        ``end - start`` in minutes, as a float (negative when ``end``
        precedes ``start``).
    """
    elapsed_seconds = time.mktime(end) - time.mktime(start)
    return elapsed_seconds / 60

## Process the pdf files

In [7]:
def _report_parse_error(message):
    """Record a parsing failure in errors.log, on stdout and via Telegram."""
    # explicit encoding: file names may contain non-ASCII characters
    with open('errors.log', 'a', encoding='utf-8') as log_file:
        log_file.write(f'{message}.\n')

    print('\n' + '*' * 10)
    print(message)
    print('*' * 10 + '\n')

    send_telegram_message(message)


def process_file(file):
    """Parse every exercise page of one PDF into column-oriented lists.

    The topic is taken from the part of the file stem after the last
    ``' - '``. Each page returned by ``pdf_to_text`` is turned into a
    statement and its exam details. If any page fails to parse, the
    failure is reported and the whole file is discarded.

    Args:
        file: ``pathlib.Path`` of the PDF file.

    Returns:
        A dict with a single entry, keyed ``"<subject> <year> <topic>"``
        (subject and year taken from the last page), whose value maps
        ``subject``, ``year``, ``topic``, ``exam``, ``exercise`` and
        ``statement`` to equally long lists. ``None`` when a page could
        not be parsed or the PDF yielded no pages.
    """
    topic = file.stem.split(' - ')[-1]

    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = pdf_to_text(file)

    if not pages:
        # without any page there are no details to build the key from
        _report_parse_error(f'Error in {file.name}: no pages to parse')
        return None

    try:
        # pdf_to_text drops the first page, so the first entry is page 2
        for page_number, page in enumerate(pages, start=2):
            statement = extract_statement(page)
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
    except Exception:
        # save errors to the log file; Exception (not a bare except) so
        # that KeyboardInterrupt/SystemExit still stop the run
        _report_parse_error(f'Error in {file.name} at page {page_number}')
        return None

    print(f'Success parsing file: {file.stem}')

    # generate the key and content for the exercises dictionary
    key = f'{subjects[-1]} {years[-1]} {topic}'
    return {
        key: {
            'subject': subjects,
            'year': years,
            'topic': [topic] * len(subjects),
            'exam': exams,
            'exercise': exercises,
            'statement': statements,
        }
    }

## Save the parsed content into a dataframe

In [8]:
def create_content_dict():
    """Build an empty column container for the parsed exercises.

    Returns:
        A dict mapping each of ``subject``, ``year``, ``topic``, ``exam``,
        ``exercise`` and ``statement`` to its own new empty list.
    """
    columns = ('subject', 'year', 'topic', 'exam', 'exercise', 'statement')
    # one independent list per column
    return {column: [] for column in columns}

In [9]:
# every entry directly inside pdf_files/, in sorted order
folders = sorted(Path('pdf_files').iterdir())

In [10]:
def process_folder(folder):
    """Parse every PDF in a folder and save the result as one CSV file.

    Each ``.pdf`` file directly inside ``folder`` is parsed with
    ``process_file``. Files that fail are skipped (best effort), but the
    failure is recorded instead of being silently discarded. The collected
    rows are written to ``./csv/exercises_<folder stem>.csv``.

    Args:
        folder: ``pathlib.Path`` of the directory holding the PDF files.

    Returns:
        None. No CSV is written when nothing could be parsed.
    """
    files = sorted(path for path in folder.iterdir() if path.suffix == '.pdf')

    # collect one frame per parsed entry and concatenate once at the end;
    # concatenating inside the loop copies all previous rows every time
    frames = []
    for file in files:
        try:
            exercises_dict = process_file(file)
            # process_file returns None when it could not parse the file
            if not exercises_dict:
                continue
            # each entry holds a dictionary of equally long column lists
            frames.extend(pd.DataFrame(columns)
                          for columns in exercises_dict.values())
        except Exception as error:
            # keep going with the remaining files, but leave a trace
            with open('errors.log', 'a', encoding='utf-8') as log_file:
                log_file.write(f'Error processing {file.name}: {error!r}\n')
            print(f'Error processing {file.name}: {error!r}')

    if not frames:
        # an empty dataframe has no "subject" column to correct or save
        print(f'No exercises parsed in folder: {folder.stem}')
        return

    df = pd.concat(frames, axis=0)

    # correct wrong subjects
    df['subject'] = df['subject'].apply(lambda x: 'Química'
                                        if x.endswith('mica') else x)

    # make sure the output directory exists before writing
    Path('./csv').mkdir(parents=True, exist_ok=True)
    df.to_csv(f'./csv/exercises_{folder.stem}.csv', index=False)

In [11]:
# start, start_formatted = what_time()
# send_telegram_message(f'Parsing process started at: {start_formatted}')

# for folder in folders:
#     try:
#         process_folder(folder)
#         print(f'Success processing folder: {folder.stem}')
#         send_telegram_message(f'Success processing folder: {folder.stem}')
#     except Exception as e:
#         send_telegram_message(f'Error processing folder: {folder.stem}' \
#                               f'\n{e}')
# end, end_formatted = what_time()

# send_telegram_message(f'Parsing process finished at: {end_formatted}'\
#                       f'\nTime elapsed in minutes: {elapsed_time_minutes(start, end)}')
# send_telegram_message('✅ Your files have been processed!')


In [None]:
# # Create a pool of worker processes (adjust number as needed)
# with Pool(processes=os.cpu_count()) as pool:
#     pool.map(process_folder, folders)

Create the pandas dataframe

In [42]:
# per-folder CSV files only: the combined "all*" outputs are excluded so
# re-running the notebook does not read its own result back in.
# Sorted because iterdir() yields entries in arbitrary order, which would
# make the row order of the combined dataframe depend on the filesystem.
csv_files = sorted(path for path in Path('./csv').iterdir()
                   if path.suffix == '.csv' and not path.stem.startswith('all'))

In [43]:
# read every per-folder CSV and stack them with a single concat; growing
# the dataframe inside a loop copies all previous rows on each iteration.
# pd.concat raises ValueError if csv_files is empty.
df = pd.concat([pd.read_csv(path) for path in csv_files], axis=0)

df.sample(10)

Unnamed: 0,subject,year,topic,exam,exercise,statement
116,Química,2012,Ácido Base,Reserva 3,Ejercicio 4 Opción B,Clasifique según la teoría de Brónsted —Lowry ...
64,Química,2012,Enlace Químico,Reserva 4,Ejercicio 3 Opción B,"En las siguientes moléculas, H,S ; N, y CH,OH:..."
1,Química,2000,Reacciones Redox,Reserva 1,Ejercicio 5 Opción B,Dada la reacción redox en disolución acuosa:\n...
163,Química,2017,Ácido Base,Reserva 4,Ejercicio 5 Opción B,250 mL de una disolución acuosa contiene 3 g d...
51,Química,2019,Solubilidad,Reserva 2,Ejercicio 3 Opción B,Se dispone de una disolución acuosa saturada d...
72,Química,2007,Formulación,Junio,Ejercicio 1 Opción A,Formule o nombre los compuestos siguientes: a)...
29,Química,2004,Conf. Electrónica,Septiembre,Ejercicio 2 Opcion A,"Considere la serie de elementos: Li, Na, K, Rb..."
165,Química,2024,Conf. Electrónica,Junio,Ejercicio B1,"Dados los iones F - y 0”, justifique la veraci..."
202,Química,2021,Ácido Base,Reserva 4,Ejercicio C3,a) ¿Qué masa de NaOH hay que añadir a 500 mL d...
30,Química,2005,Reactividad Orgánica,Reserva 3,Ejercicio 4 Opcióon A,"Considere las siguientes moléculas:\nCH,CHOHCH..."


In [45]:
# (rows, columns) of the combined dataframe
df.shape

(1682, 6)

In [46]:
# distinct subject names present in the combined dataframe
df['subject'].unique()

array(['Química'], dtype=object)

In [51]:
# distinct exercise labels; per the execution counters this cell was run
# after the clean-up cells that follow it in the notebook
df['exercise'].unique()

array(['Ejercicio 3, Opción B', 'Ejercicio 5, Opción A',
       'Ejercicio 4, Opción B', 'Ejercicio 6, Opción B',
       'Ejercicio 6, Opción A', 'Ejercicio 4, Opción A',
       'Ejercicio 3, Opción A', 'Ejercicio 5, Opción B', 'B2', 'C1',
       'Ejercicio C1', 'Ejercicio B2', 'B6', 'Ejercicio B6',
       'Ejercicio 2, Opción A', 'Ejercicio 2, Opción B', 'B5', 'C3',
       'Ejercicio B4', 'Ejercicio C3', 'Ejercicio B5', 'B1', 'B4',
       'Ejercicio B1', 'Ejercicio C2', 'Ejercicio C4', 'B3',
       'Ejercicio B3', 'Ejercicio 1, Opción A', 'Ejercicio 1, Opción B',
       'Ejercicio A1', 'Ejercicio A2', 'C4', 'C2', 'Ejercicio Bs'],
      dtype=object)

In [48]:
# normalise the misspellings of "Opción" found in the exercise labels.
# The .str accessor leaves missing values untouched, whereas calling
# str.replace through apply() raises AttributeError on NaN (which is how
# an empty exercise label comes back from read_csv).
df['exercise'] = (
    df['exercise']
    .str.replace('Opcióon', 'Opción', regex=False)
    .str.replace('Opcion', 'Opción', regex=False)
)

In [49]:
def add_comma(string):
    """Insert a comma after the second word of a four-word label.

    A label made of exactly four space-separated words and containing no
    comma, such as ``"Ejercicio 4 Opción B"``, becomes
    ``"Ejercicio 4, Opción B"``. Any other string is returned unchanged.

    Args:
        string: Exercise label.

    Returns:
        The label with the comma added, or the original string.
    """
    words = string.split(' ')
    if len(words) != 4 or ',' in string:
        return string

    first, second, third, fourth = words
    return f'{first} {second}, {third} {fourth}'

In [50]:
# give every exercise label the "Ejercicio N, Opción X" comma format
df['exercise'] = df['exercise'].apply(add_comma)

In [52]:
# the combined file is named after the first subject found in the data
first_subject = df['subject'].unique()[0]
df.to_csv(f'./csv/all_exercises_{first_subject}.csv', index=False)