<a href="https://colab.research.google.com/github/franklinwillemen/Sherlock-Hemlock-Graph-Based-NER/blob/main/data_build_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import transformers as t
import datasets
from datasets import load_dataset

import torch
import torch_geometric
from torch_geometric.data import Data

import numpy as np
import pandas as pd

import spacy
from spacy.util import minibatch

import flair
from flair.data import Sentence
from flair.models import SequenceTagger

from sklearn.preprocessing import LabelEncoder

In [2]:
""" articles = load_dataset("maastrichtlawtech/bsard", data_files="articles_fr.csv", split="train")
df_art = pd.DataFrame(articles.to_pandas())
df_art.head() """

# Load the article table from a local CSV file. index_col=0 turns the file's first
# column into the DataFrame index, so the index is whatever that column holds and is
# not guaranteed to be a fresh 0..n-1 range.
# The triple-quoted string above is inert: it holds a disabled alternative that
# fetched the articles with datasets.load_dataset instead.
articles = pd.read_csv("bsard_articles_extra.csv",index_col=0)

In [3]:
# Display the column names of the loaded article table.
articles.columns

Index(['id', 'reference', 'article', 'law_type', 'code', 'book', 'part', 'act',
       'chapter', 'section', 'subsection', 'description',
       'preprocessed_article', 'word_count', 'sentence_count'],
      dtype='object')

In [4]:
# Load the spaCy pipeline named "fr_dep_news_trf".
# NOTE(review): presumably this is the `nlp` object later passed to
# process_with_spacy for POS tags and dependency parses - confirm at the call site.
snlp = spacy.load("fr_dep_news_trf")
# Load the flair SequenceTagger checkpoint "flair/ner-french"; the log line it prints
# lists the tag dictionary the tagger predicts.
fnlp = SequenceTagger.load("flair/ner-french")

2023-04-29 00:08:07,770 SequenceTagger predicts: Dictionary with 19 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-ORG, B-ORG, E-ORG, I-ORG, <START>, <STOP>


In [52]:
def extract_pos_dep_head(docs):
    """Collect per-sentence POS tags, dependency labels and head offsets.

    Args:
        docs: Iterable of parsed documents. Each document must expose ``sents``
            (an iterable of sentences); each sentence must be iterable over
            tokens and expose ``start``; each token must expose ``pos_``,
            ``dep_`` and ``head.i``.

    Returns:
        A ``(pos, dep, heads)`` tuple of three parallel lists nested as
        ``[document][sentence][token]``:

        * ``pos``   - the ``pos_`` value of every token,
        * ``dep``   - the ``dep_`` value of every token,
        * ``heads`` - ``token.head.i - sentence.start`` for every token, i.e.
          the head's index re-based to the start of the token's own sentence.
    """
    pos, dep, heads = [], [], []

    for doc in docs:
        # Materialise the sentences once so the three passes below see the
        # same sequence even if ``doc.sents`` is a one-shot generator.
        sentences = list(doc.sents)

        pos.append([[tok.pos_ for tok in span] for span in sentences])
        dep.append([[tok.dep_ for tok in span] for span in sentences])
        heads.append(
            [[tok.head.i - span.start for tok in span] for span in sentences]
        )

    return pos, dep, heads

def process_with_spacy(df, nlp, batch_size=32):
    """Parse ``df['article']`` in batches and attach the results as columns.

    The texts are fed to ``nlp.pipe`` ``batch_size`` rows at a time, the
    resulting documents are passed to ``extract_pos_dep_head``, and the three
    outputs are stored in the columns ``'pos'``, ``'dep'`` and ``'heads'``.

    Args:
        df: DataFrame with an ``'article'`` column of texts. It is modified
            in place (the three columns are added or overwritten).
        nlp: Object exposing ``pipe(texts)`` that yields one parsed document
            per input text, in order.
        batch_size: Number of rows handed to ``nlp.pipe`` per call; must be
            at least 1.

    Returns:
        The same ``df`` object, with the ``'pos'``, ``'dep'`` and ``'heads'``
        columns set. Row ``k`` of each column holds the result for row ``k``
        of ``df['article']``, whatever the index of ``df`` is.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently skip all rows.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = df['article']
    pos, dep, heads = [], [], []

    for start in range(0, len(df), batch_size):
        # .iloc keeps the batching purely positional, independent of the index.
        batch_text = texts.iloc[start:start + batch_size].tolist()

        # Process batch with Spacy
        docs = list(nlp.pipe(batch_text))

        # Extract POS, DEP, HEADS
        pos_batch, dep_batch, heads_batch = extract_pos_dep_head(docs)
        pos.extend(pos_batch)
        dep.extend(dep_batch)
        heads.extend(heads_batch)

    # Column assignment aligns on index labels. A bare pd.Series(values) is
    # labelled 0..n-1, so on a frame with any other index (e.g. one read with
    # index_col=0, or a filtered/sorted frame) the results would land on the
    # wrong rows or become NaN. Building the Series on df.index keeps result k
    # on row k. dtype=object stores each nested list as a single cell.
    for column, values in (('pos', pos), ('dep', dep), ('heads', heads)):
        df[column] = pd.Series(values, index=df.index, dtype=object)

    return df