<a href="https://colab.research.google.com/github/franklinwillemen/Sherlock-Hemlock-Graph-Based-NER/blob/main/data_build_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import transformers as t
import datasets
from datasets import load_dataset

import torch
import torch_geometric
from torch_geometric.data import Data

import numpy as np
import pandas as pd

import spacy
from spacy.util import minibatch

from sklearn.preprocessing import LabelEncoder

from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

In [None]:
#from spacy import displacy

#doc = nlp_large('je suis travailleur salarié (e) puis je refuser de faire des heures supplémentaires ou de travailler de nuit')
#displacy.serve(doc, style='dep')

In [98]:
def extract_features(docs):
    pos, dep, heads, tokens = [], [], [], []
    
    for doc in docs:
        pos_doc, dep_doc, heads_doc, tokens_doc = [], [], [], []
        for sent in doc.sents:
            pos_sent = [token.pos_ for token in sent]
            dep_sent = [token.dep_ for token in sent]
            heads_sent = [token.head.i - sent.start for token in sent]
            tokens_sent = [token.text for token in sent]
            pos_doc.append(pos_sent)
            dep_doc.append(dep_sent)
            heads_doc.append(heads_sent)
            tokens_doc.append(tokens_sent)
        pos.append(pos_doc)
        dep.append(dep_doc)
        heads.append(heads_doc)
        tokens.append(tokens_doc)
    
    return pos, dep, heads, tokens

def process_with_spacy(df, nlp, column, batch_size=32):
    pos, dep, heads, tokens = [], [], [], []
    
    for i in range(0, len(df), batch_size):
        batch_text = df[column][i:i+batch_size].tolist()
        
        # Process batch with Spacy
        docs = list(nlp.pipe(batch_text))
        
        # Extract POS, DEP, HEADS, and tokens
        pos_batch, dep_batch, heads_batch, tokens_batch = extract_features(docs)
        pos.extend(pos_batch)
        dep.extend(dep_batch)
        heads.extend(heads_batch)
        tokens.extend(tokens_batch)
        
    df.loc[:, 'pos'] = pd.Series(pos)
    df.loc[:, 'dep'] = pd.Series(dep)
    df.loc[:, 'heads'] = pd.Series(heads)
    df.loc[:, 'tokens'] = pd.Series(tokens)

    return df

def normalize_text(text, nlp):
    # Parse the sentence using the loaded 'fr' model object `nlp`
    doc = nlp(text)

    # Extract the lemma for each token and join
    normalized_text = " ".join(token.text.lower() for token in doc if not token.is_punct)

    return normalized_text

def custom_tokenizer(nlp):
    infixes = nlp.Defaults.infixes + [r'\(([^)]+)\)']  # Added pattern to split off parenthetical suffixes
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

In [85]:
# Processing using Spacy pipelines
nlp_large = spacy.load("fr_dep_news_trf")
nlp_small = spacy.load("fr_core_news_sm")
nlp_small.tokenizer = custom_tokenizer(nlp_small)

In [50]:
#articles = load_dataset("maastrichtlawtech/bsard", data_files="articles_fr.csv", split="train")
#articles = pd.DataFrame(articles.to_pandas())
#articles.head()

# articles are already processed and downloadable from huggingface
df = pd.read_csv("../../local_datasets/bsard_extra/bsard_articles_preprocessed.csv")

In [52]:
q_train = load_dataset("maastrichtlawtech/bsard", data_files="questions_fr_train.csv", split="train").to_pandas()
q_test = load_dataset("maastrichtlawtech/bsard", data_files="questions_fr_test.csv", split="train").to_pandas()

Found cached dataset csv (C:/Users/Frank/.cache/huggingface/datasets/maastrichtlawtech___csv/maastrichtlawtech--bsard-d4b4993551724e5a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Found cached dataset csv (C:/Users/Frank/.cache/huggingface/datasets/maastrichtlawtech___csv/maastrichtlawtech--bsard-41a15be090989dc9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Found cached dataset csv (C:/Users/Frank/.cache/huggingface/datasets/maastrichtlawtech___csv/maastrichtlawtech--bsard-dc9bbe4c96b2ba1d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [99]:
q_train['normalized_question'] = q_train['question'].apply(lambda x: normalize_text(x, nlp_small))
q_test['normalized_question'] = q_test['question'].apply(lambda x: normalize_text(x, nlp_small))

In [106]:
q_train = process_with_spacy(q_train, nlp_large, "normalized_question")
q_test = process_with_spacy(q_test, nlp_large, "normalized_question")

In [112]:
q_train.to_csv("../../local_datasets/bsard_extra/bsard_expert_questions_train_preprocessed.csv")
q_test.to_csv("../../local_datasets/bsard_extra/bsard_expert_questions_test_preprocessed.csv")

In [113]:
q_synth = load_dataset("maastrichtlawtech/bsard", data_files="questions_fr_synthetic.csv", split="train").to_pandas()

Found cached dataset csv (C:/Users/Frank/.cache/huggingface/datasets/maastrichtlawtech___csv/maastrichtlawtech--bsard-dc9bbe4c96b2ba1d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [119]:
q_synth = process_with_spacy(q_synth, nlp_large, "question")

In [123]:
q_synth.iloc[2]["question"]

'quel droit du gouvernement élargie la matière visée à la constitution.'

In [124]:
q_synth.to_csv("../../local_datasets/bsard_extra/bsard_synthetic_questions_preprocessed.csv")

In [10]:
import gensim

In [9]:

gensim._load_fasttest_format("cc.fr.")

# Get the vector for a word
vector = model.wv['maison']

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\Frank\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\gensim\\test\\test_data\\cc.fr.300.bin'