# Selectividad Exams Parser

## Libraries import

### Required packages for OCR processing:
**Ubuntu based:**
- tesseract-ocr
- tesseract-ocr-spa
- poppler-utils

**Archlinux based:**
- tesseract
- tesseract-data-spa

In [1]:
# !pip install pdf2image
# !pip install pytesseract

In [2]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import os
import requests
from datetime import datetime

## Functions

### Telegram bot

Function to send messages through a Telegram bot

In [3]:
# Replace with your bot's token and chat ID
from bot_credentials import *

def send_telegram_message(message, timeout=10):
    """
    Send a text message to a Telegram chat through the Bot API.

    Calls the `sendMessage` endpoint using the module-level `BOT_TOKEN` and
    `CHAT_ID` names (expected to be provided by the `bot_credentials` import).

    Args:
        message (str): Text of the message to send.
        timeout (float): Seconds to wait for the Telegram API before giving up.

    Returns:
        dict: The decoded JSON body of the API response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    params = {"chat_id": CHAT_ID, "text": message}
    # without a timeout `requests` can wait forever on a stalled connection,
    # blocking the whole parsing run because of a notification
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

### *pdf_to_text*

Extract the text from pdf files, page by page

In [4]:
def pdf_to_text(file_path):
    """
    Run OCR over a PDF file and return the recognised text of each page.

    The PDF is rendered to images with `convert_from_path` and every image is
    passed to `pytesseract.image_to_string` using the Spanish language data.

    Args:
        file_path (str | Path): Location of the PDF file to read.

    Returns:
        list[str]: One string per processed page, in document order.

    Notes:
        - The first page of the PDF is skipped (presumably a cover page;
          confirm against the source documents), so index 0 of the result
          corresponds to page 2 of the PDF.
    """
    # drop the first page before running OCR
    page_images = convert_from_path(file_path)[1:]
    # parse each page
    return [pytesseract.image_to_string(image, lang='spa')
            for image in page_images]

### *extract_statement*
Extract the statement from a page in text format

In [5]:
def extract_statement(page):
    """
    Extract the exercise statement from the OCR text of a single page.

    The first four lines of the page are discarded. The remaining non-blank
    lines are collected, stripped of surrounding whitespace, until a line
    whose first word is one of the known subject names is found; that line
    (the exam details line) is kept as the last line of the result.

    Args:
        page (str): Text of one page, with lines separated by new lines.

    Returns:
        str: The collected lines joined with new lines. If no exam details
        line is found, every non-blank line after the fourth is returned.
    """
    # first words that identify the exam details line; several spellings of
    # the same subject are listed (presumably to absorb OCR misreads)
    subject_markers = ('SOCIALES', 'FISICA', 'MATES',
                       'QUÍMICA', 'QUIMICA', 'QUIÍMICA')

    # list to store the processed lines
    collected = []
    # start from the 5th line (the first four are skipped)
    for raw_line in page.split('\n')[4:]:
        line = raw_line.strip()

        # ignore empty and whitespace-only lines
        if not line:
            continue
        collected.append(line)

        # check the first word of the line, ignoring dots; the line is
        # stripped first so leading whitespace cannot hide the subject name
        words = line.replace('.', '').split()
        first_word = words[0].upper() if words else ''

        # stop at exam details line
        if first_word in subject_markers:
            break

    # join the list into a single string
    return '\n'.join(collected)

In [6]:
'a' in ['a', 'b']

True

### *extract_exam_details*
Extract the exam details, like subject, year, exam and exercise number.

In [7]:
def extract_exam_details(statement):
    """
    Parse the exam details line found at the end of a statement.

    The last non-blank line of `statement` is expected to look like
    ``SOCIALES II. 2017 JUNIO. EJERCICIO 2. OPCIÓN A`` or
    ``SOCIALES II. PONENCIA 2009. EJERCICIO 2``. The line is lower-cased,
    the characters ``. , ( )`` are removed and the rest is split on
    whitespace into fields.

    Args:
        statement (str): Statement text whose last non-blank line holds the
            exam details (a single details line is also accepted).

    Returns:
        dict: With keys ``subject`` (str), ``year`` (int), ``exam`` (str,
        title-cased) and ``exercise`` (str, title-cased). A subject starting
        with 'sociales' is reported as ``'MATES_CCSS'``; any other subject is
        returned lower-cased as read.

    Raises:
        IndexError: If there is no non-blank line or it has too few fields.
        ValueError: If the year field is not an integer.
    """
    # get last non-blank line with exam details
    lines = [line for line in statement.split('\n') if line.strip()]
    exam_details = lines[-1].lower()

    # remove punctuation, then split on any run of whitespace so repeated
    # spaces do not produce empty fields
    exam_details = exam_details.translate(str.maketrans('', '', '.,()')).split()

    # MATES CCSS
    # SOCIALES II. 2017 JUNIO. EJERCICIO 2. OPCIÓN A
    if exam_details[0].startswith('sociales'):
        exam_details[0] = 'MATES_CCSS'
        # remove the second element in the list (the 'II' in the example)
        del exam_details[1]

    # subject
    subject = exam_details[0]

    # year
    # SOCIALES II. PONENCIA 2009. EJERCICIO 2
    if exam_details[1] == 'ponencia':
        # switch ponencia and year
        exam_details[1], exam_details[2] = exam_details[2], exam_details[1]
    year = int(exam_details[1])

    # exam: 'reserva' is followed by its number (e.g. 'Reserva 4'), so it
    # takes two fields instead of one
    if exam_details[2].startswith('reserva'):
        exam_index_start = 4
    else:
        exam_index_start = 3
    exam = ' '.join(exam_details[2:exam_index_start]).title()

    # exercise: everything after the exam name
    exercise = ' '.join(exam_details[exam_index_start:]).title()

    return {
        'subject': subject,
        'year': year,
        'exam': exam,
        'exercise': exercise,
    }

### *time*
Functions to measure the time taken to process the files

In [8]:
import time
def what_time():
    """
    Return the current local time both as a struct and as readable text.

    Returns:
        tuple: ``(now, text)`` where ``now`` is the `time.struct_time` given
        by `time.localtime()` and ``text`` is that same instant formatted as
        ``HH:MM:SS on YYYY-MM-DD `` (note the trailing space).
    """
    current = time.localtime()
    return current, time.strftime("%H:%M:%S on %Y-%m-%d ", current)

def elapsed_time_minutes(start, end):
    """
    Compute the minutes elapsed between two local-time structs.

    Args:
        start (time.struct_time): Initial instant, in local time.
        end (time.struct_time): Final instant, in local time.

    Returns:
        float: ``end - start`` expressed in minutes (negative when `end`
        precedes `start`).
    """
    # convert both instants to seconds since the epoch
    begin_ts, finish_ts = time.mktime(start), time.mktime(end)
    seconds_elapsed = finish_ts - begin_ts
    return seconds_elapsed / 60

### *generate_error_log*
Generate an error log entry and append it to the `errors_<date>.log` file

In [9]:
def generate_error_log(file, page, error_type, statement = None, details = None):
    """
    Append an error entry to today's log file and echo it to the console.

    The entry is written to ``./error_logs/errors_<dd-mm-YYYY>.log``; the
    folder is created when it does not exist yet.

    Args:
        file (Path): File being processed; only its `name` is reported.
        page (int): Zero-based index of the page in the list of processed
            pages. It is reported as ``page + 2`` (the index is zero-based
            and `pdf_to_text` drops the first page of the PDF).
        error_type (str): Short label of what failed (e.g. 'statement').
        statement (str, optional): Text written between the separators.
        details (str, optional): Text written between the separators only
            when `statement` is not given.
    """
    log = f'Error processing {error_type}, in {file.name} at page {page+2}\n'

    # one log file per day, named after the current date
    formatted_dt = datetime.now().strftime("%d-%m-%Y")
    log_dir = Path('./error_logs')
    # create the folder on first use instead of failing with FileNotFoundError
    log_dir.mkdir(parents=True, exist_ok=True)

    separator = '*' * 10 + '\n'
    # explicit encoding: the logged text is Spanish and contains accents
    with open(log_dir / f'errors_{formatted_dt}.log', 'a', encoding='utf-8') as f:
        f.write(log)
        f.write(separator)
        if statement:
            f.write(f'{statement}\n')
        elif details:
            f.write(f'{details}\n')
        f.write(separator)

    # print log to console without the trailing new line (and nothing else:
    # slicing two characters off would also cut the page number)
    print(log.rstrip('\n'))

### *add_row*
Manually add a row to the processed exercises

In [10]:
def add_row(df, statement, exam_details):
    """
    Return a copy of `df` with one manually described exercise appended.

    Args:
        df (pd.DataFrame): Table of processed exercises.
        statement (str): Statement text of the exercise to add.
        exam_details (str): Exam details line, parsed with
            `extract_exam_details`.

    Returns:
        pd.DataFrame: `df` followed by the new row (labelled with index 0).
        The new row uses the columns of `df`; columns that are not provided
        by the parsed details or the statement are left empty.
    """
    # parsed details plus the statement make up the new record
    new_record = extract_exam_details(exam_details)
    new_record['statement'] = statement

    new_frame = pd.DataFrame(new_record, columns = df.columns, index = [0])
    return pd.concat([df, new_frame], axis = 0)

### *process_file*
Process a PDF file, extracting its information: statements and exam details

In [None]:
def process_file(file: str) -> dict:
    """
    Extract every exercise (exam details and statement) from one PDF file.

    The topic is taken from the file name (the text after the last ' - ').
    Each page returned by `pdf_to_text` is parsed with `extract_statement`
    and `extract_exam_details`; pages that cannot be parsed are logged with
    `generate_error_log` and skipped. Pages whose statement ends with
    'www.emestrada.org' are skipped without logging (they hold the solution
    to an exercise).

    Args:
        file (str | Path): Path to the PDF file.

    Returns:
        dict: A single entry ``{key: content}`` where ``key`` is
        ``'<subject> <year> <topic>'`` (taken from the last page parsed
        successfully) and ``content`` maps 'subject', 'year', 'topic',
        'exam', 'exercise' and 'statement' to lists of equal length.
        An empty dict is returned when no exercise could be parsed.
    """
    file = Path(file)
    topic = file.stem.split(' - ')[-1]

    # lists to store the values
    subjects = []
    years = []
    exams = []
    exercises = []
    statements = []

    # details of the last page parsed successfully; stays None if none was
    last_details = None

    output = f'Success parsing file: {file.stem}'

    # parse each page in the pdf file
    for index, page in enumerate(pdf_to_text(file)):

        # extract statement from page
        try:
            statement = extract_statement(page)
        except Exception:
            # no statement is available here, so the raw page is logged
            generate_error_log(file = file, page = index, error_type= 'statement', statement=page)
            output = f'File {file.stem} parsed but with errors'
            continue

        # extract exam details
        try:
            details = extract_exam_details(statement)
        except Exception:
            # pages ending with www.emestrada.org are pages with solution to exercise
            if not statement.endswith('www.emestrada.org'):
                generate_error_log(file = file, page = index, error_type= 'details', details = statement)
                output = f'File {file.stem} parsed but with errors'
            continue

        subjects.append(details['subject'])
        years.append(details['year'])
        exams.append(details['exam'])
        exercises.append(details['exercise'])
        statements.append(statement)
        last_details = details

    print(output)

    # nothing could be parsed: return an empty result instead of failing
    if last_details is None:
        return {}

    # generate the key and content for the exercises dictionary
    key = last_details['subject'] + ' ' + str(last_details['year']) + ' ' + topic
    return {
        key: {
            'subject' : subjects,
            'year' : years,
            'topic' : [topic] * len(subjects),
            'exam' : exams,
            'exercise' : exercises,
            'statement' : statements
        }
    }

### *create_content_dict*
Creates a python dictionary with the exam details and its statement

In [12]:
def create_content_dict():
    """
    Build an empty container for the parsed exercises.

    Returns:
        dict: Maps 'subject', 'year', 'topic', 'exam', 'exercise' and
        'statement' (in that order) to a new, independent empty list each.
    """
    columns = ('subject', 'year', 'topic', 'exam', 'exercise', 'statement')
    # one fresh list per key, so appending to one does not affect the others
    return {column: [] for column in columns}

### *process_folder*
Processes all files within a folder

In [13]:
def process_folder(folder:str, csv_file:str = None, return_df: bool = False) -> pd.DataFrame:
    """
    Process every PDF file in a folder and export the result to a csv file.

    Args:
        folder (str | Path): Folder holding the PDF files (not recursive).
        csv_file (str, optional): Name, without extension, of the csv file
            written to ``./csv/``. Defaults to ``exercises_<folder name>``.
        return_df (bool): Whether to also return the resulting DataFrame.

    Returns:
        pd.DataFrame | None: The exercises of all files when `return_df` is
        True, otherwise None. The csv file is written in both cases.

    Notes:
        - A file that fails to process is reported on the console and
          skipped; the remaining files are still processed.
    """
    # get the list of pdf files within the folder
    folder_Path = Path(folder)
    files = sorted(i for i in folder_Path.iterdir() if i.suffix == '.pdf')

    # process each pdf file to extract its information
    frames = []
    for file in files:
        try:
            # tolerate a None result the same way as an empty dict
            exercises_dict = process_file(file) or {}
            # each key contains a dictionary with lists of values
            file_frames = [pd.DataFrame(content)
                           for content in exercises_dict.values()]
        except Exception as error:
            # keep going with the other files, but do not hide the failure
            print(f'Error processing file {file.name}: {error}')
            continue
        frames.extend(file_frames)

    # concatenate once instead of growing the DataFrame file by file
    df = pd.concat(frames, axis = 0) if frames else pd.DataFrame()

    # correct wrong subjects
    #df.subject = df.subject.apply(lambda x: 'Química'
    #                                if x.endswith('mica') else x)

    # export the output of the processed pdf files to a csv file
    if csv_file:  # if a file name is provided
        output_file = f'./csv/{csv_file}.csv'
    else:
        output_file = f'./csv/exercises_{folder_Path.stem}.csv'

    # make sure the output folder exists before writing
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file, index=False)

    if return_df:
        return df

## Process pdf files

In [14]:
# subject folders to process; the folder holding the test samples is left out
# by name, so a subject is not dropped by mistake when that folder is missing
folders = sorted(i for i in Path('./pdf_files/').iterdir()
                 if i.is_dir() and i.name != 'sample_files')
folders

[PosixPath('pdf_files/FISICA'),
 PosixPath('pdf_files/MATES_CCSS'),
 PosixPath('pdf_files/QUIMICA')]

In [None]:
def process_all_files_in_folder(folder:str) -> pd.DataFrame:
    """
    Process every subfolder of a subject folder, reporting through Telegram.

    Each subfolder is processed with `process_folder` (which also writes one
    csv file per subfolder). Start, per-folder result and finish messages
    are sent with `send_telegram_message`.

    Args:
        folder (str | Path): Subject folder whose subfolders hold PDF files.

    Returns:
        pd.DataFrame: The exercises of all subfolders that were processed
        without error, concatenated.
    """
    start, start_formatted = what_time()
    send_telegram_message(f'Parsing process started at: {start_formatted}')

    subject_df = pd.DataFrame()
    folder = Path(folder)
    # only directories can be processed as folders
    subfolders = sorted(i for i in folder.iterdir() if i.is_dir())
    for subfolder in subfolders:
        try:

            topic_df = process_folder(subfolder, return_df = True)
            print(f'Success processing folder: {subfolder.stem}')
            # report the subfolder just processed, not the parent folder
            send_telegram_message(f'Success processing folder: {subfolder.stem}')
            subject_df = pd.concat([subject_df, topic_df], axis = 0)

        except Exception as e:
            send_telegram_message(f'Error processing folder: {subfolder.stem}' \
                                f'\n{e}')

    end, end_formatted = what_time()

    send_telegram_message(f'Parsing process finished at: {end_formatted}'\
                        f'\nTime elapsed in minutes: {elapsed_time_minutes(start, end)}')
    send_telegram_message('✅ Your files have been processed!')

    # return last, so the finish notifications above are actually sent
    return subject_df

In [None]:
QUIMICA_df = process_all_files_in_folder('./pdf_files/QUIMICA')

Success parsing file: 2000
Success parsing file: 2001
Success parsing file: 2002
Success parsing file: 2004
Success parsing file: 2006
Success parsing file: 2007
Success parsing file: 2008
Success parsing file: 2009
Success parsing file: 2010
Success parsing file: 2011
Success parsing file: 2013
Success parsing file: QUIM T5 2012
Success parsing file: QUIM T5 2014
Success parsing file: QUIM T5 2015
Success parsing file: QUIM T5 2017
Success parsing file: QUIM T5 2018
Success processing folder: CONTIENE CINÉTICA
Success parsing file: 2000 - Ácido Base
Success parsing file: 2001 - Ácido Base
Success parsing file: 2002 - Ácido Base
Success parsing file: 2003 - Ácido Base
Success parsing file: 2004 - Ácido Base
Success parsing file: 2005 - Ácido Base
Success parsing file: 2006 - Ácido Base
Success parsing file: 2007 - Ácido Base
Success parsing file: 2008 - Ácido Base
Success parsing file: 2009 - Ácido Base
Success parsing file: 2010 - Ácido Base
Success parsing file: 2011 - Ácido Base
Suc

In [None]:
QUIMICA_df

In [30]:
MATES_CCSS_df.loc[MATES_CCSS_df.exam == 'Tunto', 'exam'] = 'Junio'

In [31]:
for col in MATES_CCSS_df.columns[:-1]:
    print(f'{col}: {MATES_CCSS_df[col].unique()}')

subject: ['MATES_CCSS']
year: [2001 2002 2003 2004 2005 2006 2007 2009 2010 2011 2012 2013 2014 2015
 2016 2008 2017 2018 2019 2020 2021 2022 2023 2024]
topic: ['Sistemas de ecuaciones lineales' 'Contraste De Hipótesis' 'Probabilidad'
 'Inferencia Estadística' 'Funciones CCSS' 'Programación Lineal'
 'Matrices Y Determinantes']
exam: ['Reserva 4' 'Septiembre' 'Junio' 'Reserva 2' 'Reserva 3' 'Reserva 1'
 'Ponencia' 'Julio' 'Modelo']
exercise: ['Ejercicio 1 Opción B' 'Ejercicio 1 Opción A' 'Ejercicio 1 Opcion A'
 'Ejercicio 1' 'Ejercicio 2' 'Ejercicio 3' 'Ejercicio 4' 'Ejercicio 5'
 'Ejercicio 6' 'Ejercicio 7' 'Ejercicio 8' 'Ejercicio 9' 'Ejercicio 10'
 'Ejercicio 11' 'Ejercicio 12' 'Ejercicio 13' 'Ejercicio 14'
 'Ejercicio 15' 'Ejercicio 16' 'Ejercicio 17' 'Ejercicio 18'
 'Ejercicio 19' 'Ejercicio 20' 'Ejercicio 21' 'Ejercicio 22'
 'Ejercicio 23' 'Ejercicio 24' 'Ejercicio 25' 'Ejercicio 26'
 'Ejercicio 27' 'Ejercicio 4 Opción B' 'Ejercicio 4 Opcióon A'
 'Ejercicio 4 Opción A' 'Ejercicio 

In [32]:
MATES_CCSS_df.to_csv('./csv/MATES_CCSS.csv', index=False)

## Testing

### Get random files to test parsing

In [None]:
from numpy.random import choice, seed

In [None]:
def sample_topics(files_per_folder:int = 5) -> list:
    """
    Randomly pick PDF files from every topic folder of every subject.

    Subjects are the folders inside ``./pdf_files/`` (the ``sample_files``
    folder is left out); topics are the folders inside each subject. The
    random seed is fixed to 42 on every call.

    Args:
        files_per_folder (int): Files to pick from each topic folder. When a
            folder holds fewer PDF files, all of them are picked.

    Returns:
        list: Paths of the sampled files.
    """
    seed(42)
    # subject folders, leaving out the samples folder by name rather than by
    # position so no subject is dropped when that folder does not exist
    subjects = sorted(i for i in Path('./pdf_files/').iterdir()
                      if i.is_dir() and i.name != 'sample_files')

    sample_files = []
    for s in subjects:
        for subfolder in sorted(s.iterdir()):
            if not subfolder.is_dir():
                continue
            print(f'Processing {subfolder.stem}')
            # sorted so the fixed seed gives the same sample on any filesystem
            files = sorted(subfolder.glob('**/*.pdf'))
            if not files:
                continue
            # `choice` without replacement raises ValueError when asked for
            # more items than available
            how_many = min(files_per_folder, len(files))
            sample_files.extend(choice(files, how_many, replace = False))

    print(f'{len(sample_files)} files sampled')

    return sample_files
    

In [None]:
def sample_subjects(files_per_folder:int = 3) -> list:
    """
    Randomly pick PDF files from each folder listed in the global `folders`.

    PDF files are searched recursively inside every folder. The random seed
    is fixed to 42 on every call.

    Args:
        files_per_folder (int): Files to pick from each folder. When a
            folder holds fewer PDF files, all of them are picked.

    Returns:
        list: Sorted paths of the sampled files.
    """
    sample_files = []
    seed(42)

    for f in folders:
        # sorted so the fixed seed gives the same sample on any filesystem
        files = sorted(f.glob("**/*.pdf"))
        if not files:
            continue
        # `choice` without replacement raises ValueError when asked for
        # more items than available
        how_many = min(files_per_folder, len(files))
        sample_files.extend(choice(files, how_many, replace=False))

    print(f'Files selected: {len(sample_files)}')

    return sorted(sample_files)

In [None]:
sample_files = sample_subjects(5)

In [None]:
sample_files = sample_topics(2)

### Save the sample files into folder

In [None]:
import shutil

In [None]:
os.makedirs('pdf_files/sample_files', exist_ok=True)

In [None]:
for file in sample_files:
    shutil.copy(file, f'pdf_files/sample_files/')

### Process the sample files 

In [None]:
# return_df=True is required: by default process_folder only writes the csv
# file and returns None
df = process_folder('pdf_files/sample_files', return_df=True)

In [None]:
df

In [None]:
for col in df.columns[:-1]:
    print(f'{col}: {df[col].unique()}')

In [None]:
pages = pdf_to_text('pdf_files/folder/2003 - Conf. Electrónica.pdf')

In [None]:
st = extract_statement(pages[0])
print(st)

In [None]:
extract_exam_details(st)

In [None]:
df = pd.read_csv('csv/exercises_sample_files.csv')

## Check processing of files

1. Create path to file to process
2. Use `pdf_to_text` to process the pdf file. Store output in `pages`
3. Extract statement using `extract_statement`

In [None]:
file = Path('pdf_files/MATES_CCSS/Contraste de Hipótesis/2016 - Contraste De Hipótesis.pdf')
pages = pdf_to_text(file)

In [None]:
statement = extract_statement(pages[0])
statement.split('\n')[-1]

In [None]:
details = extract_exam_details(statement)
details

In [None]:
statement.split('\n')[-1]

## Manual Addition of exercises

#### Load csv into a dataframe

In [None]:
df = pd.read_csv('./csv/exercises_Funciones CCSS.csv')

In [34]:
statement = '''Se consideran las matrices:
7 -6 —2
A=|1 3 1 4
-5 0
a) Resuelva la ecuación matricial A' — X - A=3-1,.
b) ¿Existe algún valor del parámetro a para el que se verifique C' - D = B ?. En caso afirmativo,
calcule dicho valor.'''


# SOCIALES II. 2017 JUNIO. EJERCICIO 2. OPCIÓN A
exam_details = 'SOCIALES II. 2022 RESERVA 4. EJERCICIO A2'

In [35]:
extract_exam_details(exam_details)

{'subject': 'MATES_CCSS',
 'year': 2022,
 'exam': 'Reserva 4',
 'exercise': 'Ejercicio A2'}

In [36]:
df = add_row(MATES_CCSS_df, statement, exam_details)

In [39]:
# assign the result back: an in-place fillna on a column accessed from the
# DataFrame is chained assignment and may not update `df`
# NOTE(review): existing rows use 'Matrices Y Determinantes' (capital Y);
# confirm which spelling is wanted so the topic is not split in two
df['topic'] = df['topic'].fillna('Matrices y Determinantes')

In [41]:
df.to_csv(f'/home/daniel/git_code/emestrada/csv/MATES CCSS/MATES_CCSS.csv', index=False)

## Create combined csv file for subject

Create the pandas dataframe

In [None]:
# sorted, so that csv_files[0] does not depend on the filesystem order
csv_files = sorted(i for i in Path('./csv').iterdir()
                   if i.suffix == '.csv' and not i.stem.startswith('all'))

In [None]:
csv_files

In [None]:
df = pd.read_csv(csv_files[0])

In [None]:
# df = pd.read_csv(csv_files[0])
# for file in csv_files[1:]:
#     df = pd.concat([df, pd.read_csv(file)], axis = 0)
#     #df.to_csv('./csv/all_exercises.csv', index=False)

# df.sample(10)

In [None]:
df.isna().sum()

In [None]:
df[~df.year.isna()]

In [None]:
df.shape

In [None]:
df.subject.unique()

In [None]:
df.exercise.unique()

In [None]:
# normalise the OCR variants of 'Opción'; the vectorised string methods leave
# missing values untouched instead of failing on them
df['exercise'] = (df['exercise']
                  .str.replace('Opcióon', 'Opción', regex=False)
                  .str.replace('Opcion', 'Opción', regex=False))

In [None]:
def add_comma(string):
    """
    Insert a comma after the second word of a four-word string.

    Args:
        string (str): Text to format, e.g. 'Ejercicio 1 Opción A'.

    Returns:
        str: 'w1 w2, w3 w4' when `string` has exactly four parts separated
        by single spaces and contains no comma; otherwise `string` unchanged.
    """
    parts = string.split(' ')
    # leave untouched anything that already has a comma or is not 4 parts
    if ',' in string or len(parts) != 4:
        return string
    first, second, third, fourth = parts
    return f'{first} {second}, {third} {fourth}'

In [None]:
df.exercise = df.exercise.apply(add_comma)

In [None]:
df.to_csv(f'./csv/all_exercises_{df.subject.unique()[0]}.csv', index=False)