In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import openai

from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the OpenAI API key from the OPENAI_KEY environment variable.
# os.getenv returns None when the variable is not set, so a missing key is not
# reported here.
openai.api_key = os.getenv('OPENAI_KEY')

# First experiment

In [None]:
from utils.constants import STAGE_DIR

# Read the staged sentences, one list entry per line (readlines keeps the
# trailing newlines). The encoding is stated explicitly so the accented Spanish
# characters decode the same way regardless of the platform's default locale.
with open(STAGE_DIR / '01_sentences.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

In [None]:
# Five Spanish legal sentences, separated by blank lines, used as the input
# corpus that is embedded into the prompt below.
sentences = '''
Los trabajadores tienen como derechos básicos, con el contenido y alcance que para cada uno de los mismos disponga su específica normativa, los de Negociación colectiva

Los trabajadores menores de dieciocho años no podrán realizar trabajos nocturnos ni aquellas actividades o puestos de trabajo respecto a los que se establezcan limitaciones a su contratación conforme a lo dispuesto en la Ley 31/1995, de 8 de noviembre, de Prevención de Riesgos Laborales, y en las normas reglamentarias aplicables

Si el representante legal de una persona de capacidad limitada la autoriza expresa o tácitamente para realizar un trabajo, queda esta también autorizada para ejercitar los derechos y cumplir los deberes que se derivan de su contrato y para su cesación

Podrán contratar la prestación de su trabajo Los extranjeros, de acuerdo con lo dispuesto en la legislación específica sobre la materia

El contrato de trabajo se podrá celebrar por escrito o de palabra
'''

# Instruction prompt for the first experiment: it describes the extraction task
# (subject entity, object entity, event trigger), gives two worked examples,
# specifies a list-of-dictionaries output, and embeds `sentences` between
# triple backticks at the end.
prompt = f'''
Given a large set of sentences in spanish from the legal domain, written between triple backticks, your objective is to develop a spanish event extraction task. The steps to achieve it are the following:
1. Identify each sentence in the corpus separated for new lines.
2. In each sentence detect a subject entity, an object entity and an event trigger, usually in the form of a verb. It can happen that a sentence relates more than one object entity with the same subject and event trigger. The sentences can contain entities and phrases that don't correspond to any classification. Also, the object can be separated from the subject and event trigger by these complements. The definitions of each category are the following:
    * event trigger: It refers to the action that is enforced by the legal text. It can be in a negative form, e.g. "no pueden".
    * subject entity: It refers to the entity that most follow, do the action of the event trigger.
    * object entity: It refers to the entity that is the receptor of the action. In the legal domain, it can be, for example, a right, a beneficiary from the action, an institution, a non-right, etc. 
    
    Example 1:
    Input: Cuando la relación laboral sea de duración superior a cuatro semanas, el empresario deberá informar por escrito, en los términos y plazos que se establezcan reglamentariamente, al trabajador sobre los elementos esenciales del contrato y las principales condiciones de ejecución de la prestación laboral, siempre que tales elementos y condiciones no figuren en el contrato de trabajo formalizado por escrito.
    Output:
        * subject entity: "empresario"
        * object entity: "trabajador"
        * event trigger: "deberá informar por escrito"
        
    Example 2:
    Input: La denegación de la solicitud deberá ser notificada por el empresario al trabajador por escrito y de manera motivada.
    Output:
        * subject entity: "empresario"
        * object entity: "trabajador"
        * event trigger: "deberá ser notificada"

3. The output of the task should be a list of dictionaries, each dictionary contains the following keys:
    - sentence: the sentence
    - subject: the subject entity
    - object: the object entity
    - event: the event trigger
Note that it can be repeated sentences because of the different subject-event-object combinations.

The set of sentences in spanish to use is the following:

```
{sentences}
```
'''

In [None]:
# get_completion is a project helper that is otherwise only imported further
# down the notebook; importing it here lets this cell run on a fresh kernel
# (Restart & Run All) instead of failing with a NameError.
from utils.functions import get_completion

response = get_completion(prompt)

In [None]:
print(response)

# Experiments with `PlanTL-GOB-ES/RoBERTalex`

In [None]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, FillMaskPipeline
from pprint import pprint
# Sanity check: wrap RoBERTalex in a fill-mask pipeline and list the candidate
# tokens it proposes for a trivial masked greeting.
roberta_checkpoint = 'PlanTL-GOB-ES/RoBERTalex'
tokenizer_hf = AutoTokenizer.from_pretrained(roberta_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(roberta_checkpoint)
model.eval()
pipeline = FillMaskPipeline(model, tokenizer_hf)

text = "¡Hola <mask>!"
res_hf = pipeline(text)
candidate_tokens = [candidate['token_str'] for candidate in res_hf]
pprint(candidate_tokens)

In [None]:
# Reuse the fill-mask pipeline built in the previous cell instead of loading the
# same tokenizer and model from scratch again; only the masked input changes.
text = "El contrato de trabajo se podrá <mask>"
res_hf = pipeline(text)

# Show just the proposed tokens; the full result list stays in `res_hf`.
pprint([r['token_str'] for r in res_hf])

In [None]:
print(res_hf)

In [None]:
# Greedy left-to-right generation with the masked LM: at every step keep the
# highest-scoring completion of the trailing <mask> and append a fresh <mask>.
# The fill-mask pipeline built in the earlier RoBERTalex cells is reused here:
# loading the tokenizer and model is loop-invariant, so it is not repeated on
# each of the ten steps.
seed = "El contrato de trabajo se podrá <mask>"
for step in range(10):
    res_hf = pipeline(seed)
    max_score_sequence = max(res_hf, key=lambda x: x['score'])['sequence']
    seed = max_score_sequence + ' <mask>'

print(seed)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
# Imported under an alias: earlier cells bind the notebook-global name
# `pipeline` to a FillMaskPipeline instance, and importing the factory function
# under that same name would silently replace it.
from transformers import pipeline as hf_pipeline

# Spanish GPT-2 checkpoint used for free-text generation.
tokenizer = AutoTokenizer.from_pretrained("PlanTL-GOB-ES/gpt2-large-bne")
model = AutoModelForCausalLM.from_pretrained("PlanTL-GOB-ES/gpt2-large-bne")
generator = hf_pipeline('text-generation', tokenizer=tokenizer, model=model)
# Seed before sampling so the five returned sequences are reproducible.
set_seed(42)
generator("Los  ", num_return_sequences=5)

# Zero-shot experiment

In [84]:
from utils.constants import STAGE_DIR

# Read the whole evaluation file as one string. The encoding is stated
# explicitly so the accented Spanish characters decode the same way regardless
# of the platform's default locale.
with open(STAGE_DIR / 'previous-work' / 'EVAL_DATA.txt', encoding='utf-8') as f:
    sentences = f.read()

In [85]:
import re

# Cut the raw file contents at every "<digits><TAB>" marker. Whatever precedes
# the first marker ends up at index 0 of the resulting list.
# NOTE(review): presumably each example starts with a numeric id followed by a
# tab — confirm against EVAL_DATA.txt; the pattern is not anchored to line start.
sentences_re = re.split(r'\d+\t', sentences)

In [87]:
def _tagged_span(tag, text):
    """Return the text enclosed by the first ``<tag>...</tag>`` pair in ``text``.

    The capture is non-greedy, so when the same tag occurs more than once the
    first annotated span is returned rather than everything between the first
    opening tag and the last closing tag. Returns None when the tag is absent.
    """
    found = re.search(rf'<{tag}>(.+?)</{tag}>', text)
    return found.group(1) if found else None


def _header_value(label, text):
    """Return the text that follows ``label:`` and precedes a whitespace + ``(``.

    The capture is greedy within a line. Returns None when no such header is
    present in ``text``.
    """
    found = re.search(rf'{label}:\s(.+)\s\(', text)
    return found.group(1) if found else None


# Build one gold-annotation record per example. Index 0 of `sentences_re` holds
# whatever preceded the first split marker, so it is skipped.
evaluation_data = []
for raw_sentence in sentences_re[1:]:
    # The annotated sentence is the first line of the chunk; the header values
    # are searched for in the whole chunk.
    sentence = raw_sentence.split('\n')[0]
    evaluation_data.append({
        # Plain sentence: every opening/closing tag removed.
        'text': re.sub(r'</?\w+>', '', sentence),
        'subject': _tagged_span('e1', sentence),
        'object': _tagged_span('e2', sentence),
        'event': _tagged_span('rel', sentence),
        'complement': _tagged_span('comp', sentence),
        'relationSignature': _header_value('RelationSignature', raw_sentence),
        'relationType': _header_value('RelationType', raw_sentence),
    })

In [88]:
evaluation_data

[{'text': 'Los trabajos realizados a título de amistad, benevolencia o buena vecindad.',
  'subject': 'vecindad',
  'object': 'trabajos',
  'event': 'realizados',
  'complement': None,
  'relationSignature': 'LegalConcept-LegalConcept',
  'relationType': 'Norelation'},
 {'text': 'Las situaciones de incapacidad temporal, nacimiento, adopción, guarda con fines de adopción, acogimiento, riesgo durante el embarazo, riesgo durante la lactancia y violencia de género interrumpirán el cómputo de la duración del contrato.',
  'subject': 'contrato',
  'object': None,
  'event': 'interrumpirán',
  'complement': None,
  'relationSignature': 'LegalConcept-LegalConcept',
  'relationType': 'Norelation'},
 {'text': 'Si el trabajador tuviera asignadas condiciones o retribuciones especiales en virtud de contraprestaciones establecidas en la parte no válida del contrato, el órgano de la jurisdicción social que a instancia de parte declare la nulidad hará el debido pronunciamiento sobre la subsistencia o 

In [89]:
# Join the cleaned sentence texts with blank lines so they can be embedded in
# the prompt.
# NOTE(review): this rebinds `sentences`, which the loading cell set to the raw
# file contents; re-running the split cell after this one would operate on the
# joined text instead.
sentences = "\n\n".join(sentence_data['text'] for sentence_data in evaluation_data)

In [63]:
# Zero-shot instruction prompt: role definitions, entity labels, relation
# classes and the expected list-of-dictionaries output, followed by `sentences`
# between triple backticks.
# NOTE(review): the prompt never asks for JSON explicitly, yet the response is
# later parsed with json.loads — consider stating the output format.
# NOTE(review): the class list spells "NoRight" while the explanatory sentence
# below it says "No-Right".
prompt = f'''
We are interested in extracting event information from spanish legal text. Given a large set of sentences in spanish from the legal domain, written between triple backticks, your objective is to extract the roles from the text written in Spanish following the next steps:
1. Identify each sentence in the corpus separated for new lines.
2. In each sentence detect a subject entity, an object entity, an event trigger and a complement following the next definitions.
    * subject: Agent of the action, who performs the action.
    * event trigger: Action
    * object: Receiver of the action.
    * complement: Item which is handled in the relation.
3. Classify each subject and object in one of the following labels:
    * LegalAgent: Natural person
    * LegalEntity: Not natural person nor individual. Normally a corporation or an enterprise
    * LegalConcept: Not natural person nor corporation.
4. Classify the relation in one of the following classes:
    * Right
    * Duty
    * NoRight
    * Priviledge
    * Norelation
Right and No-Right are opposites and Duty and Priviledge are opposite as well. Norelation represents a sentence that does not contains a relation.
5. The output of the task should be a list of dictionaries, each dictionary contains the following keys:
    * sentence: the sentence
    * subject: the subject entity
    * object: the object entity
    * event: the event trigger
    * complement: the complement
    * relationSignature: the relation signature in the format subject_classification-object_classification with the classification from step 3
    * relationType: the classification of the relation

The set of sentences in spanish to use is the following:

```
{sentences}
```
'''

In [64]:
print(prompt)


We are interested in extracting event information from spanish legal text. Given a large set of sentences in spanish from the legal domain, written between triple backticks, your objective is to extract the roles from the text written in Spanish following the next steps:
1. Identify each sentence in the corpus separated for new lines.
2. In each sentence detect a subject entity, an object entity, an event trigger and a complement following the next definitions.
    * subject: Agent of the action, who performs the action.
    * event trigger: Action
    * object: Receiver of the action.
    * complement: Item which is handled in the relation.
3. Classify each subject and object in one of the following labels:
    * LegalAgent: Natural person
    * LegalEntity: Not natural person nor individual. Normally a corporation or an enterprise
    * LegalConcept: Not natural person nor corporation.
4. Classify the relation in one of the following classes:
    * Right
    * Duty
    * NoRight
   

In [65]:
from utils.functions import get_completion

response = get_completion(prompt)

In [66]:
print(response)

[
    {
        "sentence": "Los trabajos realizados a título de amistad, benevolencia o buena vecindad.",
        "subject": "trabajos",
        "object": null,
        "event": "realizados",
        "complement": "a título de amistad, benevolencia o buena vecindad",
        "relationSignature": "LegalConcept-Norelation",
        "relationType": "Norelation"
    },
    {
        "sentence": "Las situaciones de incapacidad temporal, nacimiento, adopción, guarda con fines de adopción, acogimiento, riesgo durante el embarazo, riesgo durante la lactancia y violencia de género interrumpirán el cómputo de la duración del contrato.",
        "subject": "situaciones",
        "object": null,
        "event": "interrumpirán",
        "complement": "el cómputo de la duración del contrato",
        "relationSignature": "LegalConcept-Norelation",
        "relationType": "Norelation"
    },
    {
        "sentence": "Si el trabajador tuviera asignadas condiciones o retribuciones especiales en virt

In [67]:
import json

evaluation_predict = json.loads(response)

In [None]:
evaluation_predict

In [94]:
from utils.metrics import match_score

match_score(evaluation_data, evaluation_predict, 'exact')

Unnamed: 0,Subject,Object,Event,Total
COR,5.0,2.0,6.0,13.0
INC,6.0,5.0,7.0,18.0
PAR,0.0,0.0,0.0,0.0
MIS,2.0,4.0,0.0,6.0
SPU,0.0,2.0,0.0,2.0
Precision,0.454545,0.222222,0.461538,0.393939
Recall,0.384615,0.181818,0.461538,0.351351
F1 Score,0.416667,0.2,0.461538,0.371429


In [97]:
match_score(evaluation_data, evaluation_predict, 'partial')

Unnamed: 0,Subject,Object,Event,Total
COR,5.0,2.0,6.0,13.0
INC,6.0,5.0,6.0,17.0
PAR,0.0,0.0,1.0,1.0
MIS,2.0,4.0,0.0,6.0
SPU,0.0,2.0,0.0,2.0
Precision,0.454545,0.222222,0.5,0.409091
Recall,0.384615,0.181818,0.5,0.364865
F1 Score,0.416667,0.2,0.5,0.385714


In [101]:
match_score(evaluation_data, evaluation_predict, 'strict')

Unnamed: 0,Subject,Object,Event,Total
COR,4.0,0.0,5.0,9.0
INC,7.0,7.0,8.0,22.0
PAR,0.0,0.0,0.0,0.0
MIS,2.0,4.0,0.0,6.0
SPU,0.0,2.0,0.0,2.0
Precision,0.363636,0.0,0.384615,0.272727
Recall,0.307692,0.0,0.384615,0.243243
F1 Score,0.333333,,0.384615,0.257143


In [104]:
match_score(evaluation_data, evaluation_predict, 'type')

Unnamed: 0,Subject,Object,Event,Total
COR,4.0,0.0,5.0,9.0
INC,9.0,13.0,8.0,30.0
PAR,0.0,0.0,0.0,0.0
MIS,0.0,0.0,0.0,0.0
SPU,0.0,0.0,0.0,0.0
Precision,0.307692,0.0,0.384615,0.230769
Recall,0.307692,0.0,0.384615,0.230769
F1 Score,0.307692,,0.384615,0.230769


In [None]:
from utils.metrics import compute_average_similarity_score

# One "<Label> - <score>" line per role, scored by
# compute_average_similarity_score on the gold vs. predicted annotations.
role_labels = (
    ('Subject', 'subject'),
    ('Object', 'object'),
    ('Trigger', 'event'),
    ('Complement', 'complement'),
)
score_lines = [
    f'{label} - {compute_average_similarity_score(evaluation_data, evaluation_predict, role)}'
    for label, role in role_labels
]
# Leading and trailing newline reproduce the blank lines around the report.
print('\n' + '\n'.join(score_lines) + '\n')


# Few-shot experiment

In [None]:
# Hold out the first three gold annotations as in-prompt examples and keep the
# remaining ones for evaluation.
n_examples = 3
train_data, few_evaluation_data = evaluation_data[:n_examples], evaluation_data[n_examples:]
# Sentences to annotate, separated by blank lines for embedding in the prompt.
few_sentences = '\n\n'.join(example['text'] for example in few_evaluation_data)

In [None]:
def _examples_for(role):
    """Render one "`sentence` -> value" line per training example for ``role``."""
    return "\n".join(
        f"`{example['text']}` -> {example[role]}" for example in train_data
    )


# Per-role example blocks that are interpolated into the few-shot prompt.
subject_train_data = _examples_for('subject')
object_train_data = _examples_for('object')
event_train_data = _examples_for('event')
complement_train_data = _examples_for('complement')

In [None]:
# Few-shot instruction prompt: each role definition is followed by the example
# block for that same role (the event-trigger definition gets the event
# examples, the object definition gets the object examples), then the task
# steps and the sentences to annotate between triple backticks.
prompt = f'''
We are interested in extracting event information from spanish legal text. For that, we define the following roles for an event. For each definition, 3 examples will be provided with the original sentece between simple backticks, an arrow and the correct role to extract.
- subject: Agent of the action, who performs the action.
{subject_train_data}
- event trigger: Action
{event_train_data}
- object: Receiver of the action.
{object_train_data}
- complement: Item which is handled in the relation.
{complement_train_data}

Given a large set of sentences in spanish from the legal domain, written between triple backticks, your objective is to extract the roles from the text written in Spanish following the next steps:
1. Identify each sentence in the corpus separated for new lines.
2. In each sentence detect a subject entity, an object entity, an event trigger and a complement following the previous definitions and examples.
3. The output of the task should be a list of dictionaries, each dictionary contains the following keys:
    - sentence: the sentence
    - subject: the subject entity
    - object: the object entity
    - event: the event trigger
    - complement: the complement

The set of sentences in spanish to use is the following:

```
{few_sentences}
```
'''

In [None]:
print(prompt)

In [None]:
few_response = get_completion(prompt)

In [None]:
print(few_response)

In [None]:
few_evaluation_predict = json.loads(few_response)

In [None]:
from utils.metrics import compute_average_similarity_score

# One "<Label> - <score>" line per role for the few-shot run, scored by
# compute_average_similarity_score on the held-out gold vs. predicted annotations.
few_role_labels = (
    ('Subject', 'subject'),
    ('Object', 'object'),
    ('Trigger', 'event'),
    ('Complement', 'complement'),
)
few_score_lines = [
    f'{label} - {compute_average_similarity_score(few_evaluation_data, few_evaluation_predict, role)}'
    for label, role in few_role_labels
]
# Leading and trailing newline reproduce the blank lines around the report.
print('\n' + '\n'.join(few_score_lines) + '\n')
