In [4]:
import fitz
import pandas as pd
from pathlib import Path
import csv

## Files management

In [5]:
def find_files(directory, pattern):
    path = Path(directory)
    return path.rglob(pattern)

# Example usage
directory = "./pdf_files"
pattern = "*.pdf"

file_paths = [i for i in find_files(directory, pattern) 
              if i.name.startswith('2')
              ]
print(*file_paths, sep='\n')

pdf_files/2005 - Reacciones Redox.pdf
pdf_files/2008 - Reacciones Redox.pdf
pdf_files/2022 - Reacciones Redox.pdf
pdf_files/2000 - Reacciones Redox.pdf
pdf_files/2003 - Reacciones Redox.pdf
pdf_files/2004 - Reacciones Redox.pdf
pdf_files/2002 - Reacciones Redox.pdf
pdf_files/2016 - Reacciones Redox.pdf
pdf_files/2021 - Reacciones Redox.pdf
pdf_files/2024 - Reacciones Redox.pdf
pdf_files/2007 - Reacciones Redox.pdf
pdf_files/2023 - Reacciones Redox.pdf
pdf_files/2015 - Reacciones Redox.pdf
pdf_files/2013 - Reacciones Redox.pdf
pdf_files/2011 - Reacciones Redox.pdf
pdf_files/2001 - Reacciones Redox.pdf
pdf_files/2014 - Reacciones Redox.pdf
pdf_files/2019 - Reacciones Redox.pdf
pdf_files/2017 - Reacciones Redox.pdf
pdf_files/2018 - Reacciones Redox.pdf
pdf_files/2020 - Reacciones Redox.pdf
pdf_files/2009 - Reacciones Redox.pdf
pdf_files/2012 - Reacciones Redox.pdf
pdf_files/2010 - Reacciones Redox.pdf
pdf_files/2006 - Reacciones Redox.pdf


In [7]:
def save_dict_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write the header
        writer.writerow(data.keys())
        
        # Write the values
        writer.writerow(data.values())

In [28]:
def extract_info(file_path):
    
    content = {
        'subject': [],
        'year': [],
        'exam': [],
        'exercise': [],
        'statement': [],
        'page': [],
        'exam_details': []
    }
    # open the pdf file
    doc = fitz.open(file_path)

    # process individual pages
    for page in doc[1:]:
        
        # get content of the page
        text = page.get_text("text")
        
        # clean the content of page
        
        # replacements
        replacements = {'www.emestrada.org' : '',
                        'R  E  S  O  L  U  C  I  Ó  N': 'Resolucion',
                        '\n \n \n' : ''}

        for subs, rep in replacements.items():
            text =  text.replace(subs, rep)
        
        # remove whitespaces and create list of rows
        text = text.strip().replace('  ', ' ').split('\n')

        # extract exam details
        exam_details = text[-1]
        text = text[:-1]

        # extract only exercise statement
        times_a_appeared = 0
        for index, row in enumerate(text):
            if 'a)' in row:
                times_a_appeared += 1
                # it's the second occurrence of a)
                if times_a_appeared == 2:
                    text = text[index:]
                    break
        
        # clean relative masses
        for index, row in enumerate(text):
            if ('Masas atómicas relativas' in row) or ('Dato:' in row) :
                text = text[:index]
                break
        
        # remove the lines starting with just a number
        text = [i for i in text if len(i) > 1]
        
        # join the lines of the statement
        joined_text = ''
        for row in text:

            if row.startswith('b'):
                joined_text += '\n' + row
            else:
                joined_text += row
        statement = joined_text

        # extract the year and exam
        try:
            exam = exam_details.split(' ')
            # extract subject
            subject = exam[0].replace('.', '').title()
            # extract year
            year = exam[1].replace('.', '')
            # get exam
            if exam[2] == 'RESERVA':
                exam = exam[2].title() +  ' ' + ' '.join(exam[3])
            else:
                exam = exam[2].title()
            # get exercise
            if exam[2] == 'RESERVA':
                exercise = ' '.join(exam[4:]).title()
            else:
                exercise = ' '.join(exam[3:]).title()
        except:
            print(f'Problems with exercise data in {file_path.name}' \
                  f' on page {page.number + 1}.')
            print(exam_details)
            continue
                   
        
        page =page.number + 1

        # update the content dict
        content['subject'].append(subject)
        content['year'].append(year)
        content['exam'].append(exam)
        content['exercise'].append(exercise)
        content['statement'].append(statement)
        content['page'].append(page)
        content['exam_details'].append(exam_details)
        
        # content[f'{file_path.name}_page_{page.number + 1}'] = page_dict

    return content

### Check exam string

In [25]:
def check_exam_string(exam_details):
    page_dict = {}
    exam = exam_details.split(' ')

    # extract subject
    page_dict['subject'] = (exam[0]
                            .replace('.', '')
                            .title()
    )
    page_dict['year'] = exam[1].replace('.', '')
    # get exam
    if exam[2] == 'RESERVA':
        page_dict['exam'] = exam[2].title() +  ' ' + ' '.join(exam[3])
    else:
        page_dict['exam'] = exam[2].title()
    # get exercise
    if exam[2] == 'RESERVA':
        page_dict['exercise'] = ' '.join(exam[4:]).title()
    else:
        page_dict['exercise'] = ' '.join(exam[3:]).title()

    return page_dict

In [30]:
extract_info(file_paths[0])

{'subject': ['Química',
  'Quimica',
  'Quimica',
  'Quimica',
  'Quimica',
  'Quimica',
  'Quimica',
  'Química'],
 'year': ['2005', '2005', '2005', '2005', '2005', '2005', '2005', '2005'],
 'exam': ['Junio.',
  'Reserva 1 .',
  'Reserva 2',
  'Reserva 3',
  'Reserva 3',
  'Reserva 4',
  'Reserva 4',
  'Septiembre.'],
 'exercise': ['I O .',
  'E R V A   1   .',
  'E R V A   2',
  'E R V A   3',
  'E R V A   3',
  'E R V A   4',
  'E R V A   4',
  'T I E M B R E .'],
 'statement': ["a) Ajuste por el método del ión-electrón esta reacción en sus formas iónica y molecular. \nb) Calcule la masa de cobre que se necesita para obtener 5 litros de NO medidos a 750 mm de Hg y 40ºC Datos: R = 0’082atm L Kmol. Masa atómica: Cu63'5",
  'a) Ajuste la reacción iónica por el método del ión-electrón. \nb) Calcule la molaridad de una disolución de KMnO , sabiendo que a partir de 50 mL de la misma se pueden obtener 0’34 moles deFe \uf02b. ',
  'normales. Calcule: a) El tiempo que ha durado la electrolis

In [33]:
content = {}
topic = 'Redox'
run = 1
for file in file_paths:
    if run == 1:
        content = extract_info(file)
        run += 1
    else:
        new_content = extract_info(file)
    try:
        for key in content.keys():
            content[key].extend(new_content[key])
    except:
        print(f'Problems with {file.name}')

Problems with 2005 - Reacciones Redox.pdf
Problems with exercise data in 2020 - Reacciones Redox.pdf on page 2.



In [None]:
def save_dict_to_csv(data, filename):
    
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write the header
        writer.writerow(data.keys())
        
        # Write the values
        writer.writerow(data.values())

In [221]:
# export to csv
save_dict_to_csv(content.values(), f'{topic}.csv')

AttributeError: 'dict_values' object has no attribute 'keys'