# Author: ddukic

In [1]:
from datasets import load_dataset

# Load the training split of the `conll2003` dataset through the `datasets`
# library; later cells read its "tokens", "ner_tags" and "chunk_tags" columns.
dataset = load_dataset("conll2003", split="train")

In [79]:
def extract_nonzero_spans(tokens, labels, id2label):
    """Serialize runs of non-zero labels as ``"tok tok:type;tok:type"``.

    Tokens and labels are walked in lockstep. Label ``0`` closes any open
    span. A non-zero label extends the open span when it equals the label id
    that opened the span, or that id plus one; any other non-zero label
    closes the open span and starts a new one. As a consequence, adjacent
    tokens that carry the same opening id are placed in a single span.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of integer label ids, aligned with ``tokens``.
        id2label: Mapping from non-zero label id to a type name. Every
            non-zero label is looked up, so a missing id raises ``KeyError``.

    Returns:
        The spans joined with ``";"``; each span is its tokens joined with a
        single space, then ``":"``, then the type name of the span's first
        token. An empty string when no label is non-zero.
    """
    groups = []  # every span seen so far as [type_name, [token, ...]]
    open_group = None  # the span currently being extended, if any
    anchor = None  # label id that opened ``open_group``

    for token, label in zip(tokens, labels):
        if label == 0:
            # An "outside" label ends whatever span was open.
            open_group = None
            anchor = None
            continue

        continues = open_group is not None and (
            label == anchor or label == anchor + 1
        )
        type_name = id2label[label]

        if continues:
            open_group[1].append(token)
        else:
            # The span keeps the type of the token that opened it.
            open_group = [type_name, [token]]
            groups.append(open_group)
            anchor = label

    return ";".join(" ".join(words) + ":" + type_name for type_name, words in groups)


def extract_bio_spans(tokens, labels):
    """Serialize BIO-tagged tokens as ``"tok tok:type;tok:type"``.

    Tokens and labels are walked in lockstep. ``"O"`` closes any open span.
    An ``"I-<type>"`` label extends the open span when ``<type>`` matches the
    type that opened it; every other non-``"O"`` label (a ``"B-"`` tag, a
    type change, or an ``"I-"`` tag with no open span) closes the open span
    and starts a new one.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of label strings aligned with ``tokens``; each is
            either ``"O"`` or ``"<tag>-<type>"``.

    Returns:
        The spans joined with ``";"``; each span is its tokens joined with a
        single space, then ``":"``, then its type. An empty string when all
        labels are ``"O"``.

    Raises:
        ValueError: If a non-``"O"`` label contains no hyphen.
    """
    spans = []
    current_tokens = None  # tokens of the open span, or None when no span is open
    current_type = None  # type that opened the current span

    for token, label in zip(tokens, labels):
        if label == "O":
            if current_tokens is not None:
                spans.append(" ".join(current_tokens) + ":" + current_type)
                current_tokens = None
                current_type = None
            continue

        # Split on the first hyphen only, so type names that themselves
        # contain a hyphen (e.g. "B-multi-word") do not break the unpacking.
        tag, phrase_type = label.split("-", 1)

        if current_tokens is not None and tag == "I" and phrase_type == current_type:
            current_tokens.append(token)
            continue

        # Any other tag starts a fresh span; flush the open one first.
        if current_tokens is not None:
            spans.append(" ".join(current_tokens) + ":" + current_type)
        current_tokens = [token]
        current_type = phrase_type

    if current_tokens is not None:
        spans.append(" ".join(current_tokens) + ":" + current_type)

    return ";".join(spans)

In [82]:
# Work on the first training sentence: its tokens plus the NER and chunk tag
# ids stored alongside them in the dataset.
tokens = dataset["tokens"][0]
labels_ner = dataset["ner_tags"][0]
labels_chunk = dataset["chunk_tags"][0]

# Readable type names for the non-zero tag ids. Each name covers two
# consecutive ids (2k + 1 and 2k + 2); id 0 is left out on purpose because
# extract_nonzero_spans treats it as "outside any span".
# NOTE(review): the two ids per name are presumably the B-/I- variants of one
# tag — confirm against the dataset's feature names.
id2label_ner = {
    2 * position + offset: name
    for position, name in enumerate(
        (
            "person",
            "organization",
            "location",
            "miscellaneous",
        )
    )
    for offset in (1, 2)
}

id2label_chunk = {
    2 * position + offset: name
    for position, name in enumerate(
        (
            "adjective phrase",
            "adverb phrase",
            "conjunction phrase",
            "interjection",
            "list marker",
            "noun phrase",
            "prepositional phrase",
            "particles",
            "subordinated clause",
            "unlike coordinated phrase",
            "verb phrase",
        )
    )
    for offset in (1, 2)
}

In [74]:
import spacy
from spacy import displacy
from spacy.tokens import Doc

# Load the en_core_web_lg pipeline; viz_text only uses its vocab to build
# Doc objects from pre-tokenized text.
nlp = spacy.load("en_core_web_lg")


def viz_text(tokens, tags):
    """Render pre-tokenized text with displaCy's entity style.

    A ``Doc`` is built from ``tokens`` on the shared ``nlp`` vocab, with
    ``tags`` passed straight through as its ``ents`` argument (the calls in
    this notebook pass ``"B-<type>"`` / ``"I-<type>"`` / ``"O"`` strings).
    The renderer is restricted to the type names found in
    ``id2label_chunk`` and every one of them is drawn in the same colour.

    Args:
        tokens: Sequence of token strings.
        tags: One tag per token; the lengths must match.
    """
    assert len(tokens) == len(tags)

    tagged_doc = Doc(nlp.vocab, words=tokens, ents=tags)

    # De-duplicate the type names once and reuse them for both options.
    highlight_labels = list(set(id2label_chunk.values()))
    render_options = {
        "ents": highlight_labels,
        "colors": {name: "#ff6961" for name in highlight_labels},
    }

    displacy.render(tagged_doc, style="ent", options=render_options)

In [75]:
# Visualize one sentence with an explicit list of 35 BIO chunk tags.
# viz_text asserts one tag per token, so `tokens` must hold a 35-token
# sentence when this cell is run.
viz_text(
    tokens,
    [
        "B-noun phrase",
        "I-noun phrase",
        "I-noun phrase",
        "B-verb phrase",
        "B-noun phrase",
        "I-noun phrase",
        "O",
        "B-noun phrase",
        "I-noun phrase",
        "O",
        "B-prepositional phrase",
        "B-noun phrase",
        "I-noun phrase",
        "B-prepositional phrase",
        "O",
        "B-verb phrase",
        "O",
        "B-noun phrase",
        "B-verb phrase",
        "O",
        "O",
        "B-noun phrase",
        "B-noun phrase",
        "B-verb phrase",
        "B-prepositional phrase",
        "B-noun phrase",
        "I-noun phrase",
        "B-prepositional phrase",
        "B-noun phrase",
        "I-noun phrase",
        "I-noun phrase",
        "B-prepositional phrase",
        "B-noun phrase",
        "I-noun phrase",
        "O",
    ],
)

In [78]:
# Peek at the chunk tag ids of the tokens at positions 21 and 22.
labels_chunk[21:23]

[11, 11]

In [83]:
# Chunk-style check: a "B-" tag always opens a new span, an "I-" tag of the
# same type extends the open one, and the trailing "O" token is dropped.
extract_bio_spans(
    tokens,
    [
        "B-noun phrase",
        "B-verb phrase",
        "B-noun phrase",
        "I-noun phrase",
        "B-verb phrase",
        "I-verb phrase",
        "B-noun phrase",
        "I-noun phrase",
        "O",
    ],
)

'EU:noun phrase;rejects:verb phrase;German call:noun phrase;to boycott:verb phrase;British lamb:noun phrase'

In [84]:
# NER-style check: single-token entities separated by "O" tags.
extract_bio_spans(
    tokens,
    [
        "B-organization",
        "O",
        "B-miscellaneous",
        "O",
        "O",
        "O",
        "B-miscellaneous",
        "O",
        "O",
    ],
)

'EU:organization;German:miscellaneous;British:miscellaneous'

In [77]:
# Serialize the dataset's chunk tag ids for the current sentence.
extract_nonzero_spans(tokens, labels_chunk, id2label_chunk)

'A Florida restaurant:noun phrase;paid:verb phrase;10,925 pounds:noun phrase;$ 16,935:noun phrase;for:prepositional phrase;the draft:noun phrase;of:prepositional phrase;Ai:verb phrase;no:noun phrase;telling:verb phrase;which Hendrix:noun phrase;penned:verb phrase;on:prepositional phrase;a piece:noun phrase;of:prepositional phrase;London hotel stationery:noun phrase;in:prepositional phrase;late 1966:noun phrase'

In [68]:
# Serialize the dataset's NER tag ids for the current sentence.
extract_nonzero_spans(tokens, labels_ner, id2label_ner)

"Florida:location;Ai n't no telling:miscellaneous;Hendrix:person;London:location"

In [10]:
# Split a serialized span string into its individual "text:type" entries.
# NOTE(review): `spans` is not assigned anywhere above this cell in the
# listing — it must come from a cell that was run earlier.
spans.split(";")

['Japan:location', 'Asian Cup:miscellaneous', 'Syria:location']

In [127]:
def spans_to_bio_tags(spans, tokens):
    """Project a serialized span string back onto ``tokens`` as BIO tags.

    ``spans`` has the form ``"tok tok:type;tok:type"``. Entries are processed
    left to right; each one is located as a contiguous run of tokens at or
    after the end of the previously matched entry, so spans never overlap and
    repeated phrases are assigned in order of appearance.

    Entries that cannot be placed are skipped and leave the tags untouched:
    empty entries (e.g. from a trailing ``";"``), entries without a ``":"``
    separator or without any text, and entries whose tokens do not occur in
    the remaining part of the sentence.

    Args:
        spans: Serialized spans; an empty or whitespace-only string means
            "no spans".
        tokens: List of token strings the spans refer to.

    Returns:
        A list with one tag per token: ``"B-<type>"`` on the first token of a
        matched span, ``"I-<type>"`` on the rest of it, ``"O"`` elsewhere.
    """
    bio_tags = ["O"] * len(tokens)

    if spans.strip() == "":
        return bio_tags

    search_from = 0  # first token index not yet claimed by an earlier span

    for entity in spans.split(";"):
        # Only the LAST colon separates text from type, so span text such as
        # "McLaren 1:54.342" keeps its own colons.
        text, separator, entity_type = entity.strip().rpartition(":")
        entity_tokens = text.split()
        if not separator or not entity_tokens:
            continue
        entity_type = entity_type.strip()

        width = len(entity_tokens)
        match_start = None
        for i in range(search_from, len(tokens) - width + 1):
            if tokens[i : i + width] == entity_tokens:
                match_start = i
                break

        if match_start is None:
            # Not found: skip it rather than re-tagging the previous match.
            continue

        bio_tags[match_start] = "B-" + entity_type
        for i in range(match_start + 1, match_start + width):
            bio_tags[i] = "I-" + entity_type

        # Resume after this span so later entries cannot match inside it.
        search_from = match_start + width

    return bio_tags

In [123]:
# rsplit with maxsplit=1 splits at the last colon only, so colons inside the
# span text ("1:54.342") stay with the text and the type comes out cleanly.
"McLaren 1:54.342:noun phrase".rsplit(":", 1)

['McLaren 1:54.342', 'noun phrase']

In [115]:
# Inspect the tokens of the training sentence at index 14.
dataset["tokens"][14]

['Sheep',
 'have',
 'long',
 'been',
 'known',
 'to',
 'contract',
 'scrapie',
 ',',
 'a',
 'brain-wasting',
 'disease',
 'similar',
 'to',
 'BSE',
 'which',
 'is',
 'believed',
 'to',
 'have',
 'been',
 'transferred',
 'to',
 'cattle',
 'through',
 'feed',
 'containing',
 'animal',
 'waste',
 '.']

In [128]:
# Map a serialized chunk-span string back onto the tokens of the training
# sentence at index 5819 and show the resulting BIO tags.
spans = "Magnificent:noun phrase;':noun phrase;said:verb phrase;Fitzpatrick:noun phrase;New Zealand 's:noun phrase;most:adjective phrase;capped player:noun phrase;the world 's:noun phrase;most:adjective phrase;capped:noun phrase;forward:adverb phrase"

spans_to_bio_tags(spans, dataset["tokens"][5819])

['B-noun phrase',
 'O',
 'B-noun phrase',
 'O',
 'B-verb phrase',
 'B-noun phrase',
 'O',
 'B-noun phrase',
 'I-noun phrase',
 'I-noun phrase',
 'B-adjective phrase',
 'B-noun phrase',
 'I-noun phrase',
 'O',
 'B-noun phrase',
 'I-noun phrase',
 'I-noun phrase',
 'B-adjective phrase',
 'B-noun phrase',
 'B-adverb phrase',
 'O']

In [126]:
# Same span string and sentence (index 5819) as the previous cell; this cell
# carries a lower execution count, i.e. it was run earlier in the session.
spans = "Magnificent:noun phrase;':noun phrase;said:verb phrase;Fitzpatrick:noun phrase;New Zealand 's:noun phrase;most:adjective phrase;capped player:noun phrase;the world 's:noun phrase;most:adjective phrase;capped:noun phrase;forward:adverb phrase"

spans_to_bio_tags(spans, dataset["tokens"][5819])

['B-noun phrase',
 'O',
 'B-noun phrase',
 'O',
 'B-verb phrase',
 'B-noun phrase',
 'O',
 'B-noun phrase',
 'I-noun phrase',
 'I-noun phrase',
 'B-adjective phrase',
 'B-noun phrase',
 'I-noun phrase',
 'O',
 'B-noun phrase',
 'I-noun phrase',
 'I-noun phrase',
 'B-adjective phrase',
 'B-noun phrase',
 'B-adverb phrase',
 'O']