# Selectividad Exams Parser

## Libraries import

In [1]:
# !sudo apt update
# !sudo apt install tesseract-ocr tesseract-ocr-spa poppler-utils

In [2]:
# !pip install pdf2image
# !pip install pytesseract

In [3]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import os
import requests

## Functions

### Telegram bot

In [4]:
# Replace with your bot's token and chat ID
from bot_credentials import *

def send_telegram_message(message):
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    params = {"chat_id": CHAT_ID, "text": message}
    response = requests.get(url, params=params)
    return response.json()

### Parse pdf pages into images

In [5]:
def pdf_to_text(file_path):
    images = convert_from_path(file_path)[1:]
    # parse each page
    pages = []
    for i, page in enumerate(images):
        pages.append(pytesseract.image_to_string(page, lang='spa'))
    return pages

### Extract statement text from page text

In [6]:
def extract_statement(page):
    # split the content into lines
    text = page.split('\n')
    # list to store the processed lines
    final_text = []
    # start from the 4th line
    for line in text[4:]:
        # stop at RESOLUCION
        if line.startswith('RESOLUCIÓN'):
            break
        # add the line to final_text
        else:
            if not line == '':
                final_text.append(line.strip())
    # join the list into a single string
    final_text = '\n'.join(final_text)

    return final_text

### Extract exam datails from statement

In [7]:
def extract_exam_details(statement):
    # get last line with exam details
    lines = [i for i in statement.split('\n') if i != '']
    exam_details = lines[-1]
    # split the words by ' '
    exam_details = exam_details.split(' ')
    # strip whitespaces
    exam_details = [i.replace('.', '')
                    .replace(',', '').strip()
                    for i in exam_details]

    # exam dictionary
    exam_dict = {}

    # fill dictionary with values
    exam_dict['subject'] = exam_details[0].title()
    exam_dict['year'] = int(exam_details[1])

    # parse the exam string
    if exam_details[2].startswith('RESERVA'):
        exam_dict['exam'] = ' '.join(exam_details[2:4]).title()
        exam_index_start = 4
    else:
        exam_dict['exam'] = exam_details[2].title()
        exam_index_start = 3
    
    exam_dict['exercise'] = ' '.join(exam_details[exam_index_start:]).title()
    
    return exam_dict

## Process the pdf files

In [8]:
def process_file(file):
    exercises_dict = {}
    topic = file.stem.split(' - ')[-1]   
    
    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []
    pages = pdf_to_text(file)

    try:
        for index, page in enumerate(pages):
            statement = extract_statement(page)
            details = extract_exam_details(statement)
            subjects.append(details['subject'])
            years.append(details['year'])
            exams.append(details['exam'])
            exercises.append(details['exercise'])
            statements.append(statement)
        
        print(f'Success parsing file: {file.stem}')
    except:
        # safe errors to log file
        with open('errors.log', 'a') as f:
            string = f'Error in {file.name} at page {index+2}.\n'
            f.write(string)
            
            print('\n' + '*' * 10)
            print(f'Error in {file.name} at page {index+2}')
            print('*' * 10 + '\n')

            send_telegram_message(f'Error in {file.name} at page {index+2}')
            return

    # generate the key and content for the exercises dictionary
    key = details['subject'] + ' ' + str(details['year']) + ' ' + topic
    exercises_dict[key] = {
        'subject' : subjects,
        'year' : years,
        'topic' : [topic] * len(subjects),
        'exam' : exams,
        'exercise' : exercises,
        'statement' : statements
    }

    return exercises_dict

## Save the parsed content into a dataframe

In [9]:
def create_content_dict():
    # create the dict from keys
    keys = ['subject', 'year', 'topic', 'exam', 'exercise', 'statement']
    #content = dict.fromkeys(exercises_dict[first_processed_file].keys())
    content = dict.fromkeys(keys)
    # fill the dict with empty lists
    for key in content.keys():
        content[key] = []

    return content

In [10]:
folders = sorted( [i for i in Path.iterdir(Path('pdf_files'))] )

In [11]:
def process_folder(folder):
    df = pd.DataFrame()
    files = sorted( [i for i in Path.iterdir(folder)
                        if i.suffix == '.pdf'] )
    for file in files:
        try:
            exercises_dict = process_file(file)
            # each year contains a dictionary with keys containing lists of values
            for year, dict_ in exercises_dict.items():
                df = pd.concat([df, pd.DataFrame(dict_)], axis = 0)

        except:
            pass
        
    # correct wrong subjects
    df.subject = df.subject.apply(lambda x: 'Química'
                                    if x.endswith('mica') else x)
    df.to_csv(f'./csv/exercises_{folder.stem}.csv', index=False)

In [15]:
import time
def what_time():
    now = time.localtime()
    time_formated = time.strftime("%Y-%m-%d %H:%M:%S", now)
    return time, time_formated

In [None]:
start, start_formatted = what_time()
send_telegram_message(f'Parsing process started at: {start_formatted}')

for folder in folders:
    try:
        process_folder(folder)
        print(f'Success processing folder: {folder.stem}')
        send_telegram_message(f'Success processing folder: {folder.stem}')
    except Exception as e:
        send_telegram_message(f'Error processing folder: {folder.stem}' \
                              f'\n{e}')
end, end_formatted = what_time()

send_telegram_message(f'Parsing process finished at: {end_formatted}'\
                      f'\nTime elapsed in minutes: {(end - start) / 60}')
send_telegram_message('✅ Your files have been processed!')


Success parsing file: 2000 - Ácido Base
Success parsing file: 2001 - Ácido Base
Success parsing file: 2002 - Ácido Base
Success parsing file: 2003 - Ácido Base
Success parsing file: 2004 - Ácido Base
Success parsing file: 2005 - Ácido Base
Success parsing file: 2006 - Ácido Base
Success parsing file: 2007 - Ácido Base
Success parsing file: 2008 - Ácido Base

**********
Error in 2009 - Ácido Base.pdf at page 3
**********

Success parsing file: 2010 - Ácido Base
Success parsing file: 2011 - Ácido Base
Success parsing file: 2012 - Ácido Base
Success parsing file: 2013 - Ácido Base
Success parsing file: 2014 - Ácido Base
Success parsing file: 2015 - Ácido Base
Success parsing file: 2016 - Ácido Base
Success parsing file: 2017 - Ácido Base
Success parsing file: 2018 - Ácido Base
Success parsing file: 2019 - Ácido Base
Success parsing file: 2020 - Ácido Base
Success parsing file: 2021 - Ácido Base
Success parsing file: 2022 - Ácido Base
Success parsing file: 2023 - Ácido Base
Success parsing

OSError: Cannot save file into a non-existent directory: 'csv'

In [None]:
# # Create a pool of worker processes (adjust number as needed)
# with Pool(processes=os.cpu_count()) as pool:
#     pool.map(process_folder, folders)

Create the pandas dataframe

In [None]:
csv_files = [i for i in Path.iterdir(Path('./csv')) if i.suffix == '.csv']

In [None]:
df = pd.read_csv(csv_files[0])
for file in csv_files[1:]:
    df = pd.concat([df, pd.read_csv(file)], axis = 0)
    df.to_csv('./csv/all_exercises.csv', index=False)

df.sample(10)

In [None]:
df.to_csv('all_exercises.csv', index=False)