<a href="https://colab.research.google.com/github/franklinwillemen/Sherlock-Hemlock-Graph-Based-NER/blob/main/data_build_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import transformers as t
import datasets
from datasets import load_dataset

import torch
import torch_geometric
from torch_geometric.data import Data

import numpy as np
import pandas as pd

import spacy
from spacy.util import minibatch

import flair
from flair.data import Sentence
from flair.models import SequenceTagger

from sklearn.preprocessing import LabelEncoder

In [2]:
""" articles = load_dataset("maastrichtlawtech/bsard", data_files="articles_fr.csv", split="train")
df_art = pd.DataFrame(articles.to_pandas())
df_art.head() """

In [3]:
# NOTE(review): `articles` is only assigned inside the string-quoted (disabled)
# code of the previous cell, so on a fresh kernel this line has nothing to read.
# The `Index([...])` output below indicates it was a pandas object when this
# cell was last run — confirm where it should come from before re-running.
articles.columns

Index(['id', 'reference', 'article', 'law_type', 'code', 'book', 'part', 'act',
       'chapter', 'section', 'subsection', 'description',
       'preprocessed_article', 'word_count', 'sentence_count'],
      dtype='object')

In [4]:
# spaCy pipeline "fr_dep_news_trf"; presumably the `nlp` argument intended for
# process_with_spacy (no call is visible in this part of the notebook).
snlp = spacy.load("fr_dep_news_trf")
# Flair sequence tagger "flair/ner-french"; its load log (below) lists 19 tags:
# O plus S-/B-/E-/I- variants of LOC, PER, MISC and ORG, and <START>/<STOP>.
fnlp = SequenceTagger.load("flair/ner-french")

2023-04-29 00:08:07,770 SequenceTagger predicts: Dictionary with 19 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-ORG, B-ORG, E-ORG, I-ORG, <START>, <STOP>


In [52]:
def extract_pos_dep_head(docs):
    """Collect per-sentence POS tags, dependency labels and head offsets.

    Args:
        docs: Iterable of parsed documents. Each document must expose
            ``.sents``; each sentence must be iterable over tokens and
            expose ``.start``; each token must expose ``.pos_``, ``.dep_``
            and ``.head.i``.

    Returns:
        A ``(pos, dep, heads)`` tuple. Each element is a list with one entry
        per document; each entry is a list with one inner list per sentence,
        holding one value per token. Head values are ``token.head.i`` minus
        the sentence's ``start``, i.e. the head position counted from the
        beginning of the sentence rather than of the document.
    """
    all_pos, all_dep, all_heads = [], [], []

    for document in docs:
        # Materialise the sentences once so they can be walked per feature.
        sentences = list(document.sents)

        all_pos.append([[tok.pos_ for tok in sentence] for sentence in sentences])
        all_dep.append([[tok.dep_ for tok in sentence] for sentence in sentences])
        all_heads.append(
            [
                # Shift the document-level head index to a sentence-local one.
                [tok.head.i - sentence.start for tok in sentence]
                for sentence in sentences
            ]
        )

    return all_pos, all_dep, all_heads

def process_with_spacy(df, nlp, batch_size=32):
    """Parse the ``article`` column and attach ``pos``/``dep``/``heads`` columns.

    The texts are fed to ``nlp.pipe`` in chunks of ``batch_size`` rows, and the
    per-sentence features returned by ``extract_pos_dep_head`` are collected in
    row order.

    Args:
        df: DataFrame with an ``article`` column of texts. It is modified in
            place: the three result columns are added (or overwritten).
        nlp: Object exposing ``pipe(texts)`` that yields parsed documents
            accepted by ``extract_pos_dep_head``.
        batch_size: Number of rows handed to ``nlp.pipe`` per chunk.

    Returns:
        The same ``df`` object, with ``pos``, ``dep`` and ``heads`` columns
        holding one nested list per row.
    """
    pos, dep, heads = [], [], []
    articles = df['article']

    for start in range(0, len(df), batch_size):
        # .iloc makes the slice explicitly positional, whatever the index labels are.
        batch_text = articles.iloc[start:start + batch_size].tolist()

        # Materialise one chunk of parsed docs at a time.
        docs = list(nlp.pipe(batch_text))

        # Extract POS, DEP, HEADS
        pos_batch, dep_batch, heads_batch = extract_pos_dep_head(docs)
        pos.extend(pos_batch)
        dep.extend(dep_batch)
        heads.extend(heads_batch)

    # Results are in row order, so build each Series on df's own index.
    # A bare pd.Series(values) would carry a fresh 0..n-1 index and be aligned
    # by label on assignment, scattering values or leaving NaN whenever df has
    # been filtered, sampled or otherwise re-indexed.
    for column, values in (('pos', pos), ('dep', dep), ('heads', heads)):
        df[column] = pd.Series(values, index=df.index, dtype=object)

    return df

In [53]:
# Load the article table from a local CSV; the path is relative to the
# notebook's working directory.
# NOTE(review): the frame displayed below has an `Unnamed: 0` column, which
# suggests the file was written together with its index. The `pos`/`dep`/`heads`
# values are shown quoted, so they are presumably stringified lists after the
# CSV round-trip — confirm, and parse them if real lists are needed downstream.
df = pd.read_csv("../local_datasets/bsard_extra/bsard_articles_extra_spacy.csv")

In [54]:
# Show the loaded frame; the output below is pandas' truncated repr
# (first and last rows, with `...` in between).
df

Unnamed: 0,id,reference,article,code,book,act,chapter,section,subsection,word_count,sentence_count,pos,dep,heads
0,1,"Art. 1.1.1, Code Bruxellois de l'Air, du Clima...",Le présent Code règle une matière visée à l'ar...,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,Généralités,,,,11,1,"[['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN',...","[['det', 'amod', 'nsubj', 'ROOT', 'det', 'obj'...","[[2, 2, 3, 3, 5, 3, 5, 9, 9, 6, 9, 13, 13, 9, 3]]"
1,2,"Art. 1.1.2, Code Bruxellois de l'Air, du Clima...",Le présent Code transpose en Région de Bruxell...,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,Généralités,,,,248,4,"[['DET', 'ADJ', 'NOUN', 'VERB', 'ADP', 'NOUN',...","[['det', 'amod', 'nsubj', 'ROOT', 'case', 'obl...","[[2, 2, 3, 3, 5, 3, 7, 5, 5, 5, 11, 3, 11, 3],..."
2,3,"Art. 1.2.1, Code Bruxellois de l'Air, du Clima...",Le présent Code poursuit les objectifs suivant...,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,Objectifs,,,,136,2,"[['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN',...","[['det', 'amod', 'nsubj', 'ROOT', 'det', 'obj'...","[[2, 2, 3, 3, 5, 3, 5, 3], [2, 2, 2, 4, 2, 4, ..."
3,4,"Art. 1.3.1, Code Bruxellois de l'Air, du Clima...","Au sens du présent Code, il faut entendre par ...","Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,Définitions,,,,293,1,"[['ADP', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT'...","[['case', 'obl:mod', 'case', 'amod', 'nmod', '...","[[1, 7, 4, 4, 1, 7, 7, 7, 7, 10, 8, 7, 13, 11,..."
4,5,"Art. 1.4.1, Code Bruxellois de l'Air, du Clima...","Le plan régional Air-Climat-énergie, ci-après ...","Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,Plan régional air-climat-énergie,Contenu et portée du plan,,,77,4,"[['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN', 'PRO...","[['det', 'nsubj', 'amod', 'nmod', 'nmod', 'nmo...","[[1, 18, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 1, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22628,22629,"Art. 194, La Constitution (Titre VII)",La ville de Bruxelles est la capitale de la Be...,La Constitution,,Dispositions generales.,,,,9,1,"[['DET', 'NOUN', 'ADP', 'PROPN', 'AUX', 'DET',...","[['det', 'nsubj', 'case', 'nmod', 'cop', 'det'...","[[1, 6, 3, 1, 6, 6, 6, 9, 9, 6, 12, 12, 6, 14,..."
22629,22630,"Art. 195, La Constitution (Titre VIII)",Le pouvoir législatif fédéral a le droit de dé...,La Constitution,,De la revision de la constitution.,,,,753,8,"[['DET', 'NOUN', 'ADJ', 'ADJ', 'VERB', 'DET', ...","[['det', 'nsubj', 'amod', 'amod', 'ROOT', 'det...","[[1, 4, 1, 1, 4, 6, 4, 8, 6, 12, 12, 12, 8, 12..."
22630,22631,"Art. 196, La Constitution (Titre VIII)",Aucune révision de la Constitution ne peut êtr...,La Constitution,,De la revision de la constitution.,,,,23,1,"[['DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADV', ...","[['det', 'nsubj', 'case', 'det', 'nmod', 'advm...","[[1, 6, 4, 4, 1, 6, 6, 8, 6, 10, 8, 12, 10, 14..."
22631,22632,"Art. 197, La Constitution (Titre VIII)","Pendant une régence, aucun changement ne peut ...",La Constitution,,De la revision de la constitution.,,,,15,1,"[['ADP', 'DET', 'NOUN', 'PUNCT', 'DET', 'NOUN'...","[['case', 'det', 'obl:mod', 'punct', 'det', 'n...","[[2, 2, 7, 7, 5, 7, 7, 7, 9, 7, 12, 12, 9, 18,..."
