In [1]:
import spacy
import en_core_web_trf

In [11]:
# Root URL of the HTTP API that the lemma helper functions (`get_lemma_id`,
# `add_lemma`) send their requests to; 127.0.0.1 is the loopback address, so
# the server is expected to run on this machine on port 8000.
BASE_URL = "http://127.0.0.1:8000"

In [2]:
# Load the `en_core_web_trf` pipeline once; later cells customise its
# tokenizer and run texts through this shared `nlp` object.
nlp = en_core_web_trf.load()

## Useful spacy knowledge

- The token.is\_\* and token.like\_\* properties can be used to write an `is_relevant_token(token: Token) -> bool` function
- `spacy.explain("some label")` has explanation for most labels
- Creating your own Doc manually (could be useful for creating the `context_value` string)

```py
# Import the Doc class
from spacy.tokens import Doc

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
```

If you need to process a lot of texts and create a lot of Doc objects in a row, the nlp.pipe method can speed this up significantly.

It processes the texts as a stream and yields Doc objects.

It is much faster than just calling nlp on each text, because it batches up the texts.

nlp.pipe is a generator that yields Doc objects, so in order to get a list of docs, remember to wrap the call in `list()`.

```py
with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts as a stream and collect the resulting Docs in a list
docs = list(nlp.pipe(TEXTS))
```

A list of [text, context] examples is available as the variable DATA. The texts are quotes from famous books, and the contexts are dictionaries with the keys "author" and "book".

Use the set_extension method to register the custom attributes "author" and "book" on the Doc, which default to None.
Process the [text, context] pairs in DATA using nlp.pipe with as_tuples=True.
Overwrite the doc._.book and doc._.author with the respective info passed in as the context. The snippet below shows the same pattern with the custom attributes "id" and "page_number".

```py
from spacy.tokens import Doc

Doc.set_extension("id", default=None)
Doc.set_extension("page_number", default=None)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]
```

- Use nlp.make_doc, rather than calling nlp, to turn a text into a Doc object when only tokenisation is needed. This only tokenises the text

In [22]:
import requests
from dbtypes import LemmaId


def get_lemma_id(lemma: str, timeout: float = 10.0) -> LemmaId:
    """Look up the id of ``lemma`` through the lemma API.

    Args:
        lemma: Lemma text to look up. It is percent-encoded before being
            placed in the URL path.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The id built from the JSON response body when the server answers with
        HTTP 200, otherwise the sentinel ``LemmaId(-1)``.
    """
    from urllib.parse import quote

    # safe="" also encodes "/", so the lemma cannot introduce extra path
    # segments, a query string ("?") or a fragment ("#") into the URL.
    encoded_lemma = quote(lemma, safe="")
    r = requests.get(
        f"{BASE_URL}/lemma/get_lemma_id/{encoded_lemma}",
        timeout=timeout,
    )
    if r.status_code != 200:
        return LemmaId(-1)
    return LemmaId(r.json())

1


In [23]:
import requests

from dbtypes import Lemma

def add_lemma(lemma: Lemma, timeout: float = 10.0) -> LemmaId:
    """Create ``lemma`` through the lemma API and return the id it was given.

    Args:
        lemma: Lemma record to store; it is sent as the JSON produced by
            ``lemma.to_dict()``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The id built from the JSON response body.

    Raises:
        AssertionError: If the server does not answer with HTTP 200, or if the
            returned id is the ``-1`` sentinel.
    """
    r = requests.post(
        f"{BASE_URL}/lemma/add_lemma",
        json=lemma.to_dict(),
        timeout=timeout,
    )
    # Raised explicitly (instead of using `assert`) so the checks are not
    # stripped when Python runs with -O; the exception type stays the same.
    if r.status_code != 200:
        raise AssertionError(
            f"add_lemma failed with HTTP {r.status_code}: {r.text}"
        )

    # Parse the response body once and reuse the result.
    lemma_id = LemmaId(r.json())
    if lemma_id != -1:
        return lemma_id
    raise AssertionError(f"add_lemma returned the invalid id -1 for {lemma!r}")

4

In [4]:
from collections import defaultdict
from itertools import islice, combinations

from dbtypes import StatusId, UposTag

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
context_size_lines = 5  # number of input lines joined into one batch ("context")
parse_into_base_vocab = False  # True: only collect unseen lemmas for the base vocabulary
# Maximum number of batches to process; None processes the whole input file.
# Counted in batches actually read, so 1 means exactly one batch is handled.
max_batches = 1

base_vocab_path = "assets/reference-vocabulary/vocabulary.base.txt"
irrelevant_vocab_path = "assets/reference-vocabulary/vocabulary.irrelevant.txt"
content_path = "assets/dev-samples/harry-potter.content.txt"

# ---------------------------------------------------------------------------
# Tokenizer customisation: additionally treat runs of hyphens at the start or
# end of a token as a prefix / suffix.
# ---------------------------------------------------------------------------
# list(...) keeps the concatenation valid whether the defaults are a list or a tuple.
prefixes = list(nlp.Defaults.prefixes) + [r"""^-+"""]  # type: ignore
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

suffixes = list(nlp.Defaults.suffixes) + [
    r"""-+$""",
]  # type: ignore
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

# ---------------------------------------------------------------------------
# Bookkeeping for the (currently disabled) pipeline-combination experiment
# ---------------------------------------------------------------------------
pipelines = [
    "transformer",
    "tagger",
    "parser",
    "attribute_ruler",
    "lemmatizer",
    "ner",
]
# Every non-empty subset of `pipelines`, as tuples, ordered by subset size.
pipeline_combinations = [
    combo
    for size in range(1, len(pipelines) + 1)
    for combo in combinations(pipelines, size)
]

# Maps a key to a list of collected values; missing keys start as an empty list.
stats = defaultdict(list)

# ---------------------------------------------------------------------------
# Token filtering
# ---------------------------------------------------------------------------
# Pipeline components that stay enabled while lemmatising a batch.
lemmatizer_pipes = (
    "transformer",
    "tagger",
    "attribute_ruler",
    "lemmatizer",
)

# Only tokens with one of these part-of-speech values are considered.
relevant_pos = [
    UposTag.NOUN.value,
    UposTag.VERB.value,
    UposTag.ADJ.value,
    UposTag.ADV.value,
]

# TODO: obtain the status id from the API instead of hard-coding it.
# [API needs]: POST (STATUS.PENDING) -> status_id
# NOTE(review): 1 is presumably the id of the "pending" status - confirm.
pending_status_id = StatusId(1)


def is_relevant_token(token: "spacy.tokens.Token") -> bool:
    """Return True if ``token`` is an alphabetic, non-stop-word token whose
    part of speech is listed in ``relevant_pos``."""
    return (
        not token.is_stop
        and not token.like_num
        and not token.is_space
        and not token.is_digit
        and token.is_alpha
        and token.pos_ in relevant_pos
    )


# ---------------------------------------------------------------------------
# Reference vocabularies (read once; they do not change while batches run)
# ---------------------------------------------------------------------------
with open(base_vocab_path, "r", encoding="utf-8") as fvb:
    base_vocab_text = fvb.read()
existing_base_vocab = set(base_vocab_text.split())
new_base_vocab = set()

# The irrelevant-word list is only needed when lemmas are sent to the API.
irrelevant_vocab = set()
if not parse_into_base_vocab:
    with open(irrelevant_vocab_path, "r", encoding="utf-8") as fvi:
        irrelevant_vocab = set(fvi.read().split())

# ---------------------------------------------------------------------------
# Process the input file batch by batch
# ---------------------------------------------------------------------------
batch_index = 0
with open(content_path, "r", encoding="utf-8") as f, nlp.select_pipes(
    enable=lemmatizer_pipes
):
    while max_batches is None or batch_index < max_batches:
        batch_lines = list(islice(f, context_size_lines))
        if not batch_lines:
            break  # end of file reached

        print(batch_index)
        batch_index += 1

        # Treat each batch as an own document
        context = " ".join(line.strip() for line in batch_lines)
        if not context.strip():
            continue  # the batch consisted of blank lines only

        # First, tokenise the batch (tokenizer only, no pipeline components)
        tokenized = nlp.make_doc(context)
        context_tokens = [(t.text, t.whitespace_) for t in tokenized]

        # Now, lowercase and remove stopwords
        processable_tokens = [
            t.text.lower() for t in tokenized if not nlp.vocab[t.text].is_stop
        ]

        doc = nlp(context)

        # If base vocab parsing: only collect lemmas that are not known yet
        if parse_into_base_vocab:
            for token in doc:
                lemma = token.lemma_.lower()
                if (
                    lemma
                    and is_relevant_token(token)
                    and lemma not in existing_base_vocab
                ):
                    new_base_vocab.add(lemma)
            continue

        # (token, lemma, tag, pos) for every relevant token whose lemma is
        # neither in the base vocabulary nor in the irrelevant-word list
        db_data = []
        for token in doc:
            if not is_relevant_token(token):
                continue
            lemma = token.lemma_.lower()
            if (
                not lemma
                or lemma in existing_base_vocab
                or lemma in irrelevant_vocab
            ):
                continue
            db_data.append((token.text, lemma, token.tag_, token.pos_))

        # [API needs]: POST (lemma, status) -> lemma_id
        # `Lemma` and `add_lemma` come from earlier cells of this notebook.
        # A token text that occurs several times keeps the id of its last POST.
        token_lemmaId_map = {
            token_text: add_lemma(Lemma(lemma=lemma, status_id=pending_status_id))
            for token_text, lemma, _, _ in db_data
        }

        # Disabled experiment:
        # for each possible combination of pipelines, run the doc through the pipeline
        # and check whether the lemma is the same as the text. Save the results in a
        # dictionary with the pipeline combination as key and the number of unchanged
        # lemmas as value
        # if batch_index < 50:
        #     for c in pipeline_combinations:
        #         if "lemmatizer" not in c:
        #             continue
        #         with nlp.select_pipes(enable=c):
        #             lemmatized_doc = nlp(context)
        #             stats[c].append(
        #                 sum(
        #                     t.lemma_.lower() == t.text.lower()
        #                     for t in lemmatized_doc
        #                 )
        #                 / len(lemmatized_doc)
        #             )

# Append the newly found base vocabulary once, after all batches were processed
if new_base_vocab:
    with open(base_vocab_path, "a", encoding="utf-8") as fvb:
        # Avoid gluing the first new word onto an unterminated last line.
        if base_vocab_text and not base_vocab_text.endswith("\n"):
            fvb.write("\n")
        # Sorted so repeated runs produce the same file contents.
        fvb.writelines(word + "\n" for word in sorted(new_base_vocab))

0


In [6]:
# lowest = 1
# lowest_k = ""
# for k, v in stats.items():
#     if sum(v) / len(v) < lowest:
#         lowest = sum(v) / len(v)
#         lowest_k = k
# print(lowest_k, lowest)