### Installation

In [None]:
!pip install pdftotext
!pip install spacy
!pip install stanza
!pip install transformers
!python3 -m spacy download es_core_news_md

### Import libraries

In [1]:
# Import libraries
import pdftotext
import re
import spacy
import stanza 
import pandas as pd
from collections import OrderedDict 
# Download Spanish models
stanza.download('es')
import es_core_news_md

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 23.6MB/s]                    
2023-01-25 11:46:17 INFO: Downloading default packages for language: es (Spanish) ...
2023-01-25 11:46:19 INFO: File exists: /home/fxr/stanza_resources/es/default.zip
2023-01-25 11:46:24 INFO: Finished downloading models and saved to /home/fxr/stanza_resources.


### Load NER model

In [2]:
# Create an instance for stanza's Named Entity Recognition model
nlp_st = stanza.Pipeline(lang='es', processors='tokenize, ner')
# Create an instance for spacy's Named Entity Recognition model
nlp_sp = es_core_news_md.load()

2023-01-25 11:46:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 11.8MB/s]                    
2023-01-25 11:46:25 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| ner       | conll02 |

2023-01-25 11:46:25 INFO: Use device: cpu
2023-01-25 11:46:25 INFO: Loading: tokenize
2023-01-25 11:46:25 INFO: Loading: mwt
2023-01-25 11:46:25 INFO: Loading: ner
2023-01-25 11:46:25 INFO: Done loading processors!


### Extract text from PDF

In [3]:
# Function to extract text from PDF file
def extract_text(path):
    # Read PDF file
    with open(path, "rb") as f:
        pdf = pdftotext.PDF(f)
        pdf_text = ""
        # Iterate over all the pages
        for page in pdf:
            pdf_text += page
    return pdf_text

In [4]:
# Apply extract_text function to PDF document
pdf_text = extract_text("../data/raw/fallo1.pdf")

### Clean text

In [5]:
# Extract the second line from the document
subtitle = re.findall(r'^.*\n(.*)\n', pdf_text)

# Remove new lines
subtitle_cleanned = re.sub("\n", " ", subtitle[0])

# Remove multiple spaces
subtitle_cleanned = re.sub(' +', ' ', subtitle_cleanned)

# Replace tokens in uppercase including accented characters with title() method
institucion = re.sub(r'\b[A-ZÀ-ÿ]{2,}\b', lambda x: x.group().title(), subtitle_cleanned)

institucion

' Camara Federal De La Plata - Sala De Feria'

In [6]:
# Remove new lines
text_cleanned = re.sub("\n", " ", pdf_text)

# Remove multiple spaces
text_cleanned = re.sub(' +', ' ', text_cleanned)

# Replace tokens in uppercase including accented characters with title() method
text_cleanned = re.sub(r'\b[A-ZÀ-ÿ]{2,}\b', lambda x: x.group().title(), text_cleanned)

### Extract entitites from text

In [7]:
# Apply spacy NER model to text
doc_sp = nlp_sp(text_cleanned)

# Apply stanza NER model to text
doc_st = nlp_st(text_cleanned)

In [8]:
# Extract people from text
per = [ent.text for sent in doc_st.sentences for ent in sent.ents if ent.type == "PER"]
unique_per = list(OrderedDict.fromkeys(per))
unique_per

['Carlos Alberto Vallefin',
 'Juez De Camara',
 'Roberto Agustin Lemos Arias',
 'Maria Virginia Filipas',
 'Secretaria De Feria',
 'Ii',
 'Regístrese',
 'Carlos Alberto Vallefín',
 'Roberto Agustín Lemos Arias',
 'María Virginia Filipas']

### Summarize a text

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert2bert_shared-spanish-finetuned-summarization")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/bert2bert_shared-spanish-finetuned-summarization")

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

resumen = summarizer(pdf_text, truncation=True, max_length=512)

resumen

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


[{'summary_text': 'Las asociaciones civiles Nuevo Ambiente y Banco de Bosques denuncian que el Gobierno de Luján no puede autorizar el recurso de la empresa'}]

### Extract information about text

In [10]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("mrm8488/spanish-t5-small-sqac-for-qa")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/spanish-t5-small-sqac-for-qa")

get_answer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

question = '¿Cuál es el nombre del expediente?'

expediente = get_answer(f'question: {question}  context: {text_cleanned}', truncation=True, max_length=512)

expediente


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'generated_text': 'flp 49311/2022/1/rh1 “recurso de queja no 1 – asociación civil nuevo ambiente y otro c/ e.v.a.s.a y otro s/ amparo ambiental'}]

In [11]:
question = '¿Cuál es la fecha de la firma?'

fecha_firma = get_answer(f'question: {question}  context: {text_cleanned}', truncation=True, max_length=512)

fecha_firma

[{'generated_text': 'fecha de firma: 09/01/2023'}]

In [12]:
# Function to extract date from anwser
def clean_date(data):
    # Filter first value from dictionary
    date_filtered = list(data[0].values())[0]
    # Find all matching dates
    date_cleanned = re.findall(r'(\d+/\d+/\d+)', date_filtered)
    date = date_cleanned[0]
    return date

fecha = clean_date(fecha_firma)

fecha

'09/01/2023'

In [13]:
question = '¿Cuál es el delito?'

delito = get_answer(f'question: {question}  context: {text_cleanned}', truncation=True, max_length=512)

delito


[{'generated_text': 'privación de la causa'}]

In [14]:
question = '¿Qué se resuelve?'

resolucion = get_answer(f'question: {question}  context: {text_cleanned}', truncation=True, max_length=512)

resolucion

[{'generated_text': 'la acumulación de la causa flp 49311/2022/2/rh2 a esta causa'}]

In [15]:
filter_ids = re.findall(r'(#+\d+\S+)', text_cleanned)
id = list(OrderedDict.fromkeys(filter_ids))
id

['#37413263#354963292#20230109132436969']

### Create a dataframe

In [16]:
data = [expediente, resumen, delito, resolucion]
columns = ['expediente', 'resumen', 'delito', 'resolucion']

values = []
for i in range(len(data)):
    values.append(list(data[i][0].values())[0])


In [20]:
df = pd.DataFrame(columns=[columns], data=[values])

df

Unnamed: 0,expediente,resumen,delito,resolucion
0,flp 49311/2022/1/rh1 “recurso de queja no 1 – ...,Las asociaciones civiles Nuevo Ambiente y Banc...,privación de la causa,la acumulación de la causa flp 49311/2022/2/rh...


In [21]:
# Add new column
df.insert(1, "fecha", [fecha])

# Convert fecha column to datetime format
df['fecha'] = df['fecha'].astype('datetime64[ns]')

df

Unnamed: 0,expediente,fecha,resumen,delito,resolucion
0,flp 49311/2022/1/rh1 “recurso de queja no 1 – ...,2023-09-01,Las asociaciones civiles Nuevo Ambiente y Banc...,privación de la causa,la acumulación de la causa flp 49311/2022/2/rh...


In [22]:
# Add new column
df.insert(2, "id", [id[0]])

# Add new column
df.insert(3, "institucion", [institucion])

# Add new column
df.insert(7, "texto", text_cleanned)

df

Unnamed: 0,expediente,fecha,id,institucion,resumen,delito,resolucion,texto
0,flp 49311/2022/1/rh1 “recurso de queja no 1 – ...,2023-09-01,#37413263#354963292#20230109132436969,Camara Federal De La Plata - Sala De Feria,Las asociaciones civiles Nuevo Ambiente y Banc...,privación de la causa,la acumulación de la causa flp 49311/2022/2/rh...,Poder Judicial de la Nación Camara Federal De...
