# Author: ddukic

## spaCy RDRR

In [4]:
import spacy
from spacy.tokenizer import Tokenizer
import re

# Load the "en_core_web_lg" spaCy pipeline used for dependency parsing.
nlp = spacy.load("en_core_web_lg")

# Swap the pipeline's tokenizer for one built from the vocab and a `\S+`
# token_match only (no prefix/suffix/infix rules are passed).
# NOTE(review): presumably this keeps spaCy tokens aligned with
# whitespace-separated dataset tokens -- confirm against the spaCy docs.
non_whitespace_run = re.compile(r"\S+")
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=non_whitespace_run.match)

In [5]:
def count_left_right_deps(
    tags=["B-Person", "I-Person", "O", "O", "O", "B-Location", "O"],
    sent_start="Barack Obama was born in Hawaii .",
):
    """Count dependency links of tagged tokens by direction.

    The sentence is parsed with the module-level ``nlp`` pipeline and the
    resulting tokens are paired positionally with ``tags`` (``zip`` stops at
    the shorter of the two, so surplus tokens or tags are ignored).

    For every token whose tag is not ``"O"``, its head (skipped when the
    token's ``dep_`` is ``"ROOT"``) and each of its children are inspected:
    a neighbour with a larger index ``i`` adds one to the right count, a
    neighbour with a smaller index adds one to the left count.

    Args:
        tags: One tag per token; ``"O"`` marks tokens that are not counted.
        sent_start: The sentence text passed to ``nlp``.

    Returns:
        A ``(left_count, right_count)`` tuple.
    """
    n_left = n_right = 0

    for token, label in zip(nlp(sent_start), tags):
        # Only tagged tokens contribute to either count.
        if label == "O":
            continue

        # Head first (unless the token is the root), then all children.
        neighbours = [] if token.dep_ == "ROOT" else [token.head]
        neighbours.extend(token.children)

        for other in neighbours:
            if other.i > token.i:
                n_right += 1
            elif other.i < token.i:
                n_left += 1
            else:
                # A neighbour sharing the token's own index is unexpected.
                print("Something is wrong!")

    return n_left, n_right

In [6]:
# Run the counter on its default example (the tags and sentence in its signature).
count_left_right_deps()

(2, 2)

In [7]:
import sys

sys.path.append("../baselines/")
from transformers import AutoTokenizer
from dataset import TokenClassificationDataset
from tqdm import tqdm

# Tokenizer handed to TokenClassificationDataset below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)


# For each dataset, report right / (left + right) over all counted
# dependency links of tagged tokens.
for ds_name in ["conll2003", "conll2003chunk", "ace-tc", "absa-restaurants"]:
    # NOTE(review): "train" is presumably the split name -- confirm in dataset.py.
    dataset = TokenClassificationDataset(ds_name, tokenizer, "train")

    left_global, right_global = 0, 0

    for tags, sentence in tqdm(
        zip(dataset.labels, dataset.tokens), total=len(dataset.labels)
    ):
        # The token sequence is joined with single spaces to form the text
        # that count_left_right_deps parses.
        left, right = count_left_right_deps(tags, " ".join(sentence))
        left_global += left
        right_global += right

    total_global = left_global + right_global
    # With no counted links the ratio is undefined; report NaN instead of
    # raising ZeroDivisionError and aborting the remaining datasets.
    right_ratio = right_global / total_global if total_global else float("nan")
    print(ds_name, right_ratio)

100%|██████████| 14041/14041 [00:50<00:00, 278.31it/s]


conll2003 0.593070475198063


100%|██████████| 14041/14041 [00:49<00:00, 284.78it/s]


conll2003chunk 0.5180003314872921


100%|██████████| 14672/14672 [00:55<00:00, 263.75it/s]


ace-tc 0.4109069886947585


100%|██████████| 2737/2737 [00:10<00:00, 272.39it/s]

absa-restaurants 0.3962785556374355



