# Selectividad Exams Parser

## Libraries import

### Required packages for OCR processing:
**Ubuntu based:**
- tesseract-ocr
- tesseract-ocr-spa
- poppler-utils

**Archlinux based:**
- tesseract
- tesseract-data-spa

In [3]:
# !pip install pdf2image
# !pip install pytesseract

In [4]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import os
import requests
from datetime import datetime

## Functions

### Telegram bot

Function to generate messages to a telegram bot

In [5]:
# Replace with your bot's token and chat ID
from bot_credentials import *

def send_telegram_message(message):
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    params = {"chat_id": CHAT_ID, "text": message}
    response = requests.get(url, params=params)
    return response.json()

### *pdf_to_text*

Extract the text from pdf files, page by page

In [6]:
def pdf_to_text(file_path):
    """
    Processes a single PDF file to extract exam information and statements.

    Args:
        file (Path): A Path object representing the PDF file to be processed.
    
    Returns:
        dict: A dictionary containing the extracted information with the following structure:
            {
                "key": {
                    "subject": list of subjects,
                    "year": list of years,
                    "topic": list of topics,
                    "exam": list of exams,
                    "exercise": list of exercises,
                    "statement": list of statements
                }
            }

    Workflow:
        1. Extracts the topic from the file name.
        2. Converts the PDF file into text, page by page.
        3. For each page:
            - Extracts the statement using the `extract_statement` function.
            - Extracts exam details using the `extract_exam_details` function.
            - Handles errors by logging them using `generate_error_log`.
        4. Aggregates the extracted information into lists for subjects, years, topics, exams, exercises, and statements.
        5. Returns a dictionary with the extracted data.

    Notes:
        - Logs errors to a file if there are issues with extracting statements or exam details.
        - Skips pages with errors and continues processing the remaining pages.
    """
    images = convert_from_path(file_path)[1:]
    # parse each page
    pages = []
    for i, page in enumerate(images):
        pages.append(pytesseract.image_to_string(page, lang='spa'))
    return pages

### *extract_statement*
Extract the statement from a page in text format

In [7]:
def extract_statement(page):
    # split the content into lines
    text = page.split('\n')

    # list to store the processed lines
    final_text = []
    # start from the 4th line
    for line in text[4:]:
        # stop at RESOLUCION
        if line.startswith('RESOLUCIÓN') or line.startswith('RES OLUCION'):
            break
        # add the line to final_text
        else:
            if not line == '':
                final_text.append(line.strip())
    # join the list into a single string
    final_text = '\n'.join(final_text)

    return final_text

### *extract_exam_details*
Extract the exam details, like subject, year, exam and exercise number.

In [8]:
def extract_exam_details(statement):
    # get last line with exam details
    lines = [i for i in statement.split('\n') if i != '']
    exam_details = lines[-1].lower()

    # strip whitespaces
    exam_details = (exam_details.replace('.', '')            
                    .replace(',', '')
                    .replace('(', '')
                    .replace(')', '').strip()
    )
    
    # split the words by ' '
    exam_details = exam_details.split(' ')

    # MATES CCSS
    # SOCIALES II. 2017 JUNIO. EJERCICIO 2. OPCIÓN A
    # 
    if exam_details[0].startswith('sociales'):
        exam_details[0] = 'MATES_CCSS'
        # remove the second element in the list
        del exam_details[1]
    
       
    # exam dictionary
    details = ['subject', 'year', 'exam', 'exercise']
    exam_dict = dict.fromkeys(details)
    exam_dict = {k:None for k in details}

    ## fill dictionary with values
    
    # subject
    exam_dict['subject'] = exam_details[0]
    
    # year
    # SOCIALES II. PONENCIA 2009. EJERCICIO 2
    if exam_details[1] == 'ponencia':
        # switch ponencia and year
        exam_details[1], exam_details[2] = exam_details[2], exam_details[1]

    exam_dict['year'] = int(exam_details[1])

    # parse the exam string
    if exam_details[2].startswith('reserva'):
        exam_dict['exam'] = ' '.join(exam_details[2:4]).title()
        exam_index_start = 4
    else:
        exam_dict['exam'] = exam_details[2].title()
        exam_index_start = 3
    
    exam_dict['exercise'] = ' '.join(exam_details[exam_index_start:]).title()

    
    return exam_dict

### *time*
Function to measure the time taken to process the files

In [9]:
import time
def what_time():
    now = time.localtime()
    time_formated = time.strftime("%H:%M:%S on %Y-%m-%d ", now)
    return now, time_formated

def elapsed_time_minutes(start, end):
    start_seconds = time.mktime(start)
    end_seconds = time.mktime(end)
    return (end_seconds - start_seconds) / 60

### *generate_error_log*
generate a error log and add it to errors_timestamp.log file

In [10]:
def generate_error_log(file, page, error_type, statement = None, details = None):
    log = f'Error processing {error_type}, in {file.name} at page {page+2}\n'
    
    # add timestamp to log file
    current_datetime = datetime.now()
    formatted_dt = current_datetime.strftime("%d-%m-%Y")
    with open(f'./error_logs/errors_{formatted_dt}.log', 'a') as f:
        f.write(log)
        f.write('*' * 10 + '\n')
        if statement:
            f.write(f'{statement}\n')
        elif details:
            f.write(f'{details}\n')
        f.write('*' * 10 + '\n')
    
    # print log to console without the new line
    print(log[:-2])

### *add_row*
Manually add a row to the processed exercises

In [11]:
def add_row(df, statement, exam_details):
    
    row = extract_exam_details(exam_details)
    row['statement'] = statement

    return pd.concat([df, pd.DataFrame(row, columns = df.columns, index = [0])], axis = 0)

### *process_file*
Process a pdf files, extracting its information: statement and exam information 

In [12]:
def process_file(file):
    exercises_dict = {}
    topic = file.stem.split(' - ')[-1]   
    
    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = pdf_to_text(file)

    # parse each page in the pdf file

    for index, page in enumerate(pages):

        # extract statement from page
        try:
            statement = extract_statement(page)
        # error with extracting statement
        except:
            generate_error_log(file = file, page = index, error_type= 'statement', statement=page)
            continue

        # extract exam details
        try:
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
        # error with extracting details
        except:
            # pages ending with www.emestrada.org are pages with solution to exercise
            if not statement.endswith('www.emestrada.org'):
                generate_error_log(file = file, page = index, error_type= 'details', details = statement)
            continue
        
    print(f'Success parsing file: {file.stem}')

    # generate the key and content for the exercises dictionary
    key = details['subject'] + ' ' + str(details['year']) + ' ' + topic
    exercises_dict[key] = {
        'subject' : subjects,
        'year' : years,
        'topic' : [topic] * len(subjects),
        'exam' : exams,
        'exercise' : exercises,
        'statement' : statements
    }

    return exercises_dict

### *create_content_dict*
Creates a python dictionary with the exam details and its statement

In [13]:
def create_content_dict():
    # create the dict from keys
    keys = ['subject', 'year', 'topic', 'exam', 'exercise', 'statement']
    #content = dict.fromkeys(exercises_dict[first_processed_file].keys())
    content = dict.fromkeys(keys)
    # fill the dict with empty lists
    for key in content.keys():
        content[key] = []

    return content

In [14]:
def process_folder(folder):
    df = pd.DataFrame()
    
    # get the list of pdf files within the folder
    files = sorted( [i for i in Path.iterdir(folder)
                        if i.suffix == '.pdf'] )
    
    # process each pdf file to extract its information
    for file in files:
        try:
            exercises_dict = process_file(file)
            # each year contains a dictionary with keys containing lists of values
            for year, dict_ in exercises_dict.items():
                df = pd.concat([df, pd.DataFrame(dict_)], axis = 0)

        except:
            pass
        
    # correct wrong subjects
    #df.subject = df.subject.apply(lambda x: 'Química'
    #                                if x.endswith('mica') else x)
    
    # export the output of the processed pdf files to a csv file
    df.to_csv(f'./csv/exercises_{folder.stem}.csv', index=False)

## Process pdf files

In [30]:
folders = sorted([i for i in Path('./pdf_files/').iterdir() if i.is_dir()])

In [34]:
folders

[PosixPath('pdf_files/FISICA'),
 PosixPath('pdf_files/MATES_CCSS'),
 PosixPath('pdf_files/QUIMICA')]

### Get random files to test parsing

In [27]:
from numpy.random import choice

In [None]:
files = []

for f in folders:
    files.extend( list(f.glob("**/*.pdf")) )

print(*choice(files,5), sep='\n')

pdf_files/QUIMICA/Ácido Base/2004 - Ácido Base.pdf
pdf_files/FISICA/Óptica geométrica/2021 - Óptica geométrica.pdf
pdf_files/QUIMICA/Enlace Químico/2009 - Enlace Químico.pdf
pdf_files/QUIMICA/Reactividad Orgánica/2002 - Reactividad Orgánica.pdf
pdf_files/QUIMICA/Ácido Base/2006 - Ácido Base.pdf


In [41]:
choice(files,5)

array([PosixPath('pdf_files/MATES_CCSS/Matrices y Determinantes/2015 - Matrices Y Determinantes.pdf'),
       PosixPath('pdf_files/MATES_CCSS/Funciones CCSS/2002 - Funciones CCSS.pdf'),
       PosixPath('pdf_files/MATES_CCSS/Probabilidad/2023 - Probabilidad.pdf'),
       PosixPath('pdf_files/QUIMICA/Cantidad Química/2002 - Cantidad Química.pdf'),
       PosixPath('pdf_files/FISICA/Movimiento Ondulatorio/2016 - Ondas.pdf')],
      dtype=object)

In [None]:
sample_files = []
for f in files:
    sample_files.extend(choice(files, 2))

sample_files

In [None]:
start, start_formatted = what_time()
send_telegram_message(f'Parsing process started at: {start_formatted}')

for folder in folders:
    try:
        process_folder(folder)
        print(f'Success processing folder: {folder.stem}')
        send_telegram_message(f'Success processing folder: {folder.stem}')
    except Exception as e:
        send_telegram_message(f'Error processing folder: {folder.stem}' \
                              f'\n{e}')
end, end_formatted = what_time()

send_telegram_message(f'Parsing process finished at: {end_formatted}'\
                      f'\nTime elapsed in minutes: {elapsed_time_minutes(start, end)}')
send_telegram_message('✅ Your files have been processed!')


Error processing details, in 2016 - Campo eléctrico y magnético.pdf at page 1
Success parsing file: 2016 - Campo eléctrico y magnético
Success parsing file: 2017 - Campo eléctrico y magnético
Success parsing file: 2018 - Campo eléctrico y magnético
Success parsing file: 2019 - Campo eléctrico y magnético
Success parsing file: 2020 - Campo eléctrico y magnético
Success parsing file: 2021 - Campo eléctrico y magnético
Success parsing file: 2022 - Campo eléctrico y magnético
Success parsing file: 2023 - Campo eléctrico y magnético
Success parsing file: 2024 - Campo eléctrico y magnético
Success processing folder: Campo eléctrico y magnético
Success parsing file: 2016 - Campo gravitatorio


## Check processing of files

1. Create path to file to process
2. Use `pdf_to_text` to process the pdf file. Store output in `pages`
3. Extract statement using `extract_statement`

In [53]:
file = Path('pdf_files/MATES_CCSS/Contraste de Hipótesis/2016 - Contraste De Hipótesis.pdf')
pages = pdf_to_text(file)

In [54]:
statement = extract_statement(pages[0])
statement.split('\n')[-1]

'SOCIALES II. 2016 RESERVA 1. EJERCICIO 4 OPCIÓN B'

In [55]:
details = extract_exam_details(statement)
details

{'subject': 'MATES_CCSS',
 'year': 2016,
 'exam': 'Reserva 1',
 'exercise': 'Ejercicio 4 Opción B'}

In [29]:
statement.split('\n')[-1]

'SOCIALES II. PONENCIA 2009. EJERCICIO 1'

## Manual Addition of exercises

#### Load csv into a dataframe

In [None]:
df = pdf.read_csv('./csv/exercises_Funciones CCSS.csv')

In [None]:
statement = '''a) Calcule los valores de a y b para que la función f(x)= 2-x , sea
ax”?-3x+1 si x>1
derivable en el punto de abscisa x =1
b) Para a=1 y b=2, estudie su monotonía y determine las ecuaciones de sus asíntotas, si
existen.'''

# SOCIALES II. 2017 JUNIO. EJERCICIO 2. OPCIÓN A
exam_details = 'SOCIALES II. 2016 JUNIO. EJERCICIO 2. OPCIÓN A'

In [None]:
extract_exam_details(exam_details)

{'subject': 'Mates CCSS',
 'year': 2006,
 'exam': 'Junio',
 'exercise': 'Ejercicio 3 Opción A'}

In [19]:
df = add_row(df, statement, exam_details)

NameError: name 'df' is not defined

In [None]:
df.to_csv(f'./csv/', index=False)

## Crete combined csv file for subject

Create the pandas dataframe

In [None]:
csv_files = [i for i in Path.iterdir(Path('./csv'))
             if i.suffix == '.csv' and not i.stem.startswith('all')]

In [None]:
csv_files

In [None]:
df = pd.read_csv(csv_files[0])

In [None]:
# df = pd.read_csv(csv_files[0])
# for file in csv_files[1:]:
#     df = pd.concat([df, pd.read_csv(file)], axis = 0)
#     #df.to_csv('./csv/all_exercises.csv', index=False)

# df.sample(10)

In [None]:
df.isna().sum()

In [None]:
df[~df.year.isna()]

In [None]:
df.shape

In [None]:
df.subject.unique()

In [None]:
df.exercise.unique()

In [None]:
df.exercise = df.exercise.apply(lambda x: x.replace('Opcióon', 'Opción').replace('Opcion', 'Opción'))

In [None]:
def add_comma(string):
    if len(string.split(' ')) == 4 and ',' not in string:
        words = string.split(' ')
        return f'{words[0]} {words[1]}, {words[2]} {words[3]}'
    else:
        return string

In [None]:
df.exercise = df.exercise.apply(add_comma)

In [None]:
df.to_csv(f'./csv/all_exercises_{df.subject.unique()[0]}.csv', index=False)