In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import openai

from dotenv import load_dotenv
from utils.constants import STAGE_DIR, ANALYSIS_DIR
from utils.prompts import EXAMPLES_TEMPLATE
from utils.functions import get_examples

load_dotenv()

openai.api_key = os.getenv('OPENAI_KEY')

In [5]:
sentences = list(pd.read_csv(ANALYSIS_DIR / 'statutes-process' / 'statute-sentences.csv')['sentence_text'].values)

In [6]:
len(sentences)

1235

In [7]:
n = 4

splitted_sentences = [
    '\n\n'.join(s for s in sentences[i: i + n])
    for i in range(0, len(sentences), n)
]

prompts = [(
    'The set of sentences in spanish to use in the'
    ' event extraction task is the following:\n'
    f'```\n{s}\n```') for s in splitted_sentences
]

In [8]:
examples = [
    {'text': 'Expirada la duración del contrato para la formación y el aprendizaje, el trabajador no podrá ser contratado bajo esta modalidad por la misma o distinta empresa, salvo que la formación inherente al nuevo contrato tenga por objeto la obtención de distinta cualificación profesional.', 'subject': 'empresa', 'object': ['trabajador'], 'event': 'no podrá ser', 'complement': ['contratado'], 'subjectLabel': 'LegalAgent', 'objectLabel': 'LegalAgent', 'relationType': 'NoRight'},
    {'text': 'Aplicación de los límites de duración del contrato por obra o servicio determinados y al encadenamiento de contratos en las Administraciones Públicas.', 'subject': 'Administraciones Públicas', 'object': [], 'event': None, 'complement': [], 'subjectLabel': 'LegalEntity', 'relationType': 'Norelation'},
    {'text': 'La empresa conservará los registros a que se refiere este precepto durante cuatro años y permanecerán a disposición de las personas trabajadoras, de sus representantes legales y de la Inspección de Trabajo y Seguridad Social.', 'subject': 'empresa', 'object': [], 'event': 'conservará', 'complement': ['registros'], 'subjectLabel': 'LegalAgent', 'relationType': 'Duty'},
    {'text': 'La autoridad laboral dará traslado de la comunicación empresarial a la entidad gestora de las prestaciones por desempleo y recabará informe preceptivo de la Inspección de Trabajo y Seguridad Social sobre los extremos de dicha comunicación y sobre el desarrollo del periodo de consultas.', 'subject': 'autoridad laboral', 'object': ['entidad gestora'], 'event': 'dará traslado', 'complement': ['comunicación empresarial'], 'subjectLabel': 'LegalEntity', 'objectLabel': 'LegalEntity', 'relationType': 'Duty'},
    {'text': ' A los efectos de lo dispuesto en esta ley, se considera trabajo nocturno el realizado entre las diez de la noche y las seis de la mañana.', 'subject': None, 'object': [], 'event': None, 'complement': [], 'relationType': 'Norelation'}
]

In [9]:
from utils.functions import get_completion

responses = [None] * len(prompts)
failures = []
examples_select = get_examples(examples)

system = EXAMPLES_TEMPLATE % {
    'role_example': '\n\n'.join(examples_select['role']),
    'subject_type': '\n\n'.join(examples_select['subject-type']),
    'object_type': '\n\n'.join(examples_select['object-type']),
    'relation_type': '\n\n'.join(examples_select['relation-type']),
}

In [10]:
offset = 0
for i, prompt in enumerate(prompts[offset:]):
    try:
        responses[i + offset] = get_completion(system + '\n' + prompt)
    except:
        failures.append(i + offset)

In [11]:
responses

['[\n    {\n        "text": "Se considerarán relaciones laborales de carácter especial Cualquier otro trabajo que sea expresamente declarado como relación laboral de carácter especial por una ley",\n        "subject": "relaciones laborales de carácter especial",\n        "object": [],\n        "complement": [],\n        "event": "considerarán",\n        "subjectLabel": "LegalConcept",\n        "objectLabel": null,\n        "relationType": "Norelation"\n    },\n    {\n        "text": "Mediante convenio colectivo o, en su defecto, contrato individual, se optará entre abonar las horas extraordinarias en la cuantía que se fije, que en ningún caso podrá ser inferior al valor de la hora ordinaria, o compensarlas por tiempos equivalentes de descanso retribuido",\n        "subject": null,\n        "object": [],\n        "complement": [],\n        "event": null,\n        "subjectLabel": null,\n        "objectLabel": null,\n        "relationType": "Norelation"\n    },\n    {\n        "text": "En

In [12]:
sum(map(lambda x: x is not None, responses))

309

In [13]:
len(prompts)

309

In [None]:
b = splitted_sentences[216].split('\n\n')

In [None]:
a_s = "\n\n".join(b[:2])
a = get_completion(
    system \
    + f'\nThe set of sentences in spanish to use in the event extraction task is the following:\n\n```{a_s}```'
)
c_s = "\n\n".join(b[2:])
c = get_completion(
    system \
    + f'\nThe set of sentences in spanish to use in the event extraction task is the following:\n\n```{c_s}```'
)

In [None]:
d = json.dumps(json.loads(a) + json.loads(c))

In [None]:
responses[216] = d

In [16]:
import json

responses_json = [jr for r in responses for jr in json.loads(r)]

In [23]:
len(responses_json), len(sentences)

(1235, 1235)

In [18]:
from utils.constants import ANALYSIS_DIR

with open(ANALYSIS_DIR / 'statutes-events' / 'status-events.json', 'w') as file:
    json.dump(responses_json, file)

In [24]:
import pandas as pd

summary = pd.Series(
    [0] * 12,
    index=['Subjects', 'Objects', 'Triggers', 'Complements', 'LegalAgent', 'LegalEntity', 'LegalConcept', 'Duty', 'Right', 'NoRight', 'Privilege', 'Norelation'],
    name='Count'
)

for r in responses_json:
    summary['Subjects'] += 1 if r['subject'] else 0
    summary['Objects'] += len(r['object']) if r['object'] else 0
    summary['Triggers'] += 1 if r['event'] else 0
    summary['Complements'] += len(r['complement']) if r['complement'] else 0
    if r['subjectLabel']:
        summary[r['subjectLabel']] += 1
    if r['objectLabel']:
        summary[r['objectLabel']] += len(r['object'])
    if r['relationType'] == 'NoRelation':
        r['relationType'] = 'Norelation'
    summary[r['relationType']] += 1

In [25]:
summary

Subjects        1141
Objects          232
Triggers        1158
Complements      491
LegalAgent       476
LegalEntity      246
LegalConcept     648
Duty             586
Right            203
NoRight          152
Privilege        103
Norelation       191
Name: Count, dtype: int64

In [26]:
def different_entities_count(entities_set, *new_entities):
    previous_count = len(entities_set)
    _new_entities = [entity if not isinstance(entity, list) else entity[0] for entity in new_entities if entity]
    entities_set.update(_new_entities)
    return len(entities_set) - previous_count

In [27]:
import pandas as pd

summary = pd.Series(
    [0] * 8,
    index=['Subjects', 'Objects', 'Triggers', 'Complements', 'Total' ,'LegalAgent', 'LegalEntity', 'LegalConcept'],
    name='Count'
)

entities = {
    'Subjects': set(),
    'Objects': set(),
    'Triggers': set(),
    'Complements': set(),
    'Total': set(),
}

for r in responses_json:
    subject_count = different_entities_count(entities['Subjects'], r['subject'])
    object_count = different_entities_count(entities['Objects'], *r['object'])
    summary['Subjects'] += subject_count
    summary['Objects'] += object_count
    summary['Triggers'] += different_entities_count(entities['Triggers'], r['event'])
    summary['Complements'] += different_entities_count(entities['Complements'], *r['complement'])

    summary['Total'] += different_entities_count(entities['Total'], r['subject'], *r['complement'], *r['object'], r['event'])

    if r['subjectLabel']:
        summary[r['subjectLabel']] += subject_count
    if r['objectLabel']:
        summary[r['objectLabel']] += object_count

In [28]:
summary

Subjects         535
Objects          139
Triggers         713
Complements      432
Total           1714
LegalAgent       140
LegalEntity      107
LegalConcept     426
Name: Count, dtype: int64

In [45]:
event_extraction_dict = {
    'event_mention': [],
    'event_subject': [],
    'event_subject_id': [],
    'event_subject_type': [],
    'event_object': [],
    'event_object_id': [],
    'event_object_type': [],
    'event_complement': [],
    'event_complement_id': [],
    'event_trigger': [],
    'event_trigger_id': [],
    'relation_type': [],
    'relation_id': []
}

counters = {
    'Duty': 1,
    'Right': 1,
    'NoRight': 1,
    'Privilege': 1,
}

entities['noTriggers'] = entities['Subjects'].union(entities['Objects']).union(entities['Complements'])

entities_id = {
    'Triggers': sorted(list(entities['Triggers'])),
    'noTriggers': sorted(list(entities['noTriggers']))
}

In [40]:
def entities_zip(sentence):
    if len(sentence['object']) <= 1 and len(sentence['complement']) <= 1:
        return [(
            sentence['subject'],
            sentence['object'][0] if sentence['object'] else None,
            sentence['complement'][0] if sentence['complement'] else None,
            sentence['event'],
        )]
    zipped_entities = []
    if len(sentence['object']) > 1 and len(sentence['complement']) <= 1:
        for obj in sentence['object']:
            zipped_entities.append((
                sentence['subject'],
                obj,
                sentence['complement'][0] if sentence['complement'] else None,
                sentence['event'],
            ))
    elif len(sentence['object']) > 1 and len(sentence['complement']) > 1:
        for obj in sentence['object']:
            for comp in sentence['complement']:
                zipped_entities.append((
                    sentence['subject'],
                    obj,
                    comp,
                    sentence['event'],
                ))
    else:
        for comp in sentence['complement']:
            zipped_entities.append((
                sentence['subject'],
                sentence['object'][0] if sentence['object'] else None,
                comp,
                sentence['event'],
            ))
    return zipped_entities

In [46]:
for response_sentence in responses_json:
    zipped_entities = entities_zip(response_sentence)
    for subj, obj, comp, trig in zipped_entities:
        event_extraction_dict['event_mention'] += [response_sentence['text']]
        event_extraction_dict['event_subject'] += [subj]
        event_extraction_dict['event_object'] += [obj]
        event_extraction_dict['event_complement'] += [comp]
        event_extraction_dict['event_trigger'] += [trig]

        event_extraction_dict['event_subject_type'] += [response_sentence['subjectLabel'] if response_sentence['subjectLabel'] else None]
        event_extraction_dict['event_object_type'] += [response_sentence['objectLabel'] if response_sentence['objectLabel'] else None]

        event_extraction_dict['event_subject_id'] += [f'argument{str(entities_id["noTriggers"].index(subj)).zfill(4)}' if subj else None]
        event_extraction_dict['event_object_id'] += [f'argument{str(entities_id["noTriggers"].index(obj)).zfill(4)}' if obj else None]
        event_extraction_dict['event_complement_id'] += [f'argument{str(entities_id["noTriggers"].index(comp)).zfill(4)}' if comp else None]
        event_extraction_dict['event_trigger_id'] += [f'trigger{str(entities_id["Triggers"].index(trig)).zfill(4)}' if trig else None]
        
        event_extraction_dict['relation_type'] += [response_sentence['relationType']]

        if response_sentence['relationType'].lower() != 'norelation':
            event_extraction_dict['relation_id'] += \
                [f'{response_sentence["relationType"].lower()}{str(counters[response_sentence["relationType"]]).zfill(4)}']
        else:
            event_extraction_dict['relation_id'] += [None]

In [48]:
event_extraction_df = pd.DataFrame(event_extraction_dict)

In [49]:
event_extraction_df

Unnamed: 0,event_mention,event_subject,event_subject_id,event_subject_type,event_object,event_object_id,event_object_type,event_complement,event_complement_id,event_trigger,event_trigger_id,relation_type,relation_id
0,Se considerarán relaciones laborales de caráct...,relaciones laborales de carácter especial,argument0836,LegalConcept,,,,,,considerarán,trigger0078,Norelation,
1,"Mediante convenio colectivo o, en su defecto, ...",,,,,,,,,,,Norelation,
2,"En otro caso, a una comisión representativa qu...",comisión representativa,argument0228,LegalConcept,,,,,,constituirá,trigger0082,Duty,duty0001
3,"El trabajador, previo aviso y justificación, p...",trabajador,argument0925,LegalAgent,,,,,,podrá ausentarse,trigger0362,Right,right0001
4,"A los efectos de esta ley, serán empresarios t...",personas,argument0755,LegalAgent,empresas usuarias,argument0461,LegalEntity,,,serán,trigger0624,Right,right0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,"El Fondo de Garantía Salarial, organismo autón...",Fondo de Garantía Salarial,argument0041,LegalEntity,trabajadores,argument0933,LegalAgent,,,abonará,trigger0033,Duty,duty0001
1504,Resultará de aplicación a lo establecido en el...,Gobierno,argument0042,LegalEntity,,,,,,podrá establecer,trigger0377,Privilege,privilege0001
1505,Los convenios colectivos sectoriales estatales...,convenios colectivos sectoriales estatales y d...,argument0309,LegalConcept,,,,,,podrán identificar,trigger0434,Privilege,privilege0001
1506,Podrán celebrarse contratos de duración determ...,,,,,,,,,Podrán celebrarse,trigger0008,Norelation,


In [51]:
event_extraction_df.to_csv(ANALYSIS_DIR / 'statutes-events' / 'statute-events-dataset.csv', index=False)