### Import libraries

In [1]:
# Import libraries
import pdftotext
import re
import pandas as pd
from glob import glob
from collections import OrderedDict 
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Extract text from PDF

In [2]:
# Function to extract text from PDF file
def extract_text(path):
    
    # Read PDF file
    with open(path, "rb") as f:
        # Load PDF file
        pdf = pdftotext.PDF(f)
        pdf_text = ""
        # Iterate over all the pages
        for page in pdf:
            # Concatenate strings from pages
            pdf_text += page
    return pdf_text

In [3]:
# Create a list with all files in the directory with 'pdf' extension
pdfs = glob('../assets/pdfs/*.pdf')

texts = []
for pdf in pdfs:

    # Apply function to extract text from PDF files
    pdf_text = extract_text(pdf)
    
    # Append text to list
    texts.append(pdf_text)

### Clean text

In [4]:
texts_cleanned = []
for text in texts:
    
    # Remove new lines
    text_cleanned = re.sub("\n", " ", text)

    # Remove multiple spaces
    text_cleanned = re.sub(' +', ' ', text_cleanned)

    # Replace tokens in uppercase including accented characters with title() method
    text_cleanned = re.sub(r'\b[A-ZÀ-ÿ]{2,}\b', lambda x: x.group().title(), text_cleanned)

    # Append 'text_cleanned' to list
    texts_cleanned.append(text_cleanned)

### Summarize texts

In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert2bert_shared-spanish-finetuned-summarization")

# Load model for Spanish summarization
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/bert2bert_shared-spanish-finetuned-summarization")

# Create instance for summarization task
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


In [7]:
summaries = []
for text in tqdm(texts):

    # Apply summarizer function to text
    summary = summarizer(text, truncation=True, max_length=512)

    # Append summary to list
    summaries.append(summary)

100%|██████████| 20/20 [07:53<00:00, 23.68s/it]


In [18]:
summaries_cleanned = []
for i in range(len(summaries)):
    summaries_cleanned.append(list(summaries[i][0].values())[0])

# Create dataframe from list
df = pd.DataFrame(summaries_cleanned, columns=['resumen'])

# Add new column
df['texto'] = texts_cleanned

df.to_csv('../data/processed/pdf_texto.csv')

### Extract information from text

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mrm8488/spanish-t5-small-sqac-for-qa")

# Load model for Spanish Question-Answering
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/spanish-t5-small-sqac-for-qa")

# Create instance for text2text-generation task
get_answer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

In [6]:
records = []
for text in tqdm(texts_cleanned):

    # Define question to answer
    question = '¿Cuál es el nombre del expediente?'

    # Apply question answering function to text
    record = get_answer(f'question: {question}  context: {text}', truncation=True, max_length=512)
   
    # Append record to list
    records.append(record)

  0%|          | 0/20 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 20/20 [22:32<00:00, 67.63s/it] 


In [None]:
records

[[{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}],
 [{'generated_text': 'eduardo rafael riggi'}]]

In [None]:
dates = []
for text in tqdm(texts_cleanned):

    # Define question to answer
    question = '¿Cuál es la fecha de la firma?'

    # Apply question answering function to text
    date = get_answer(f'question: {question}  context: {text}', truncation=True, max_length=512)

    # Append date to list
    dates.append(date)

In [None]:
# Function to extract date from anwser
def clean_date(data):
    # Filter first value from dictionary
    date_filtered = list(data[0].values())[0]
    # Find all matching dates
    date_cleanned = re.findall(r'(\d+/\d+/\d+)', date_filtered)
    date = date_cleanned[0]
    return date

