# Nested NER with dictionary and n-grams

## Imports

In [1]:
%pip install nltk
%pip install razdel

In [2]:
import json
import os

from collections import Counter, defaultdict
from typing import Iterator

from razdel import tokenize

## Constants

In [3]:
# Root directory holding the input datasets.
DATA_PATH = "../../data/"
# Directory the predictions file is written to.
OUT_PATH = "../../out/dict/"

# Training set used to build the dictionary, and test set to be labelled.
TRAIN_DATA = os.path.join(DATA_PATH, "jsonl/train.jsonl")
TEST_DATA = os.path.join(DATA_PATH, "jsonl/test.jsonl")

# Output file with the predictions for the test set.
SUBMIT_PATH = os.path.join(OUT_PATH, "test.jsonl")


# Longest n-gram (in tokens) looked up in the dictionary at inference time.
MAX_NGRAM = 5

## Utils

In [4]:
def read_jsonl(file_path: str) -> Iterator[dict]:
    """Lazily read a JSON Lines file, yielding one parsed record per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # json.loads fails on an empty document, so ignore empty lines.
            if not line.strip():
                continue
            yield json.loads(line)


def write_jsonl(file_path: str, data: Iterator[dict]):
    """Serialize every record of ``data`` as one JSON line in ``file_path``.

    The file is created or truncated and written as UTF-8; non-ASCII
    characters are stored as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

## Model training

The dictionary is built from the training data. Every token is stored together with a count of how often each label was assigned to it.

Every token is stemmed. If a token consists of several words, each word is stemmed and the stems are joined with a space.

In [5]:
from nltk.stem.snowball import SnowballStemmer

# Russian Snowball stemmer shared by process_text.
stemmer = SnowballStemmer("russian")

In [6]:
def process_text(text: str) -> str:
    """Tokenize ``text``, stem every token and join the stems with spaces.

    The result is the normalized form used as a dictionary key.

    Args:
        text: Raw text of a single candidate span (one or more words).

    Returns:
        A single string with the stem of each token, separated by one space.
    """
    return " ".join(stemmer.stem(token.text) for token in tokenize(text))

In [7]:
# Maps a normalized (stemmed) span to the number of times each label was seen for it.
model = defaultdict(Counter)

for sample in read_jsonl(TRAIN_DATA):
    sentences = sample["sentences"]

    for start, stop, tag in sample["ners"]:
        # Annotated end offsets are inclusive, hence the + 1 when slicing.
        key = process_text(sentences[start : stop + 1])
        model[key][tag] += 1

## Inference

Candidate tokens are word n-grams of length 1 to `MAX_NGRAM`.

Inference is done by choosing the most common label for each token. If a token is not found in the dictionary, it is skipped.

In [8]:
def get_ngrams(text: str, n: int) -> list:
    """Extract all n-grams of ``n`` consecutive tokens from ``text``.

    Every n-gram is returned as a ``(start, stop, ngram_text)`` tuple, also
    for unigrams, so callers get the same element type for every ``n``.

    Args:
        text: Text to tokenize.
        n: Number of tokens per n-gram. Values below 1 are treated as 1.

    Returns:
        A list of ``(start, stop, ngram_text)`` tuples in text order, where
        ``start`` is the start offset of the first token, ``stop`` is the
        stop offset of the last token and ``ngram_text`` is the token texts
        joined with single spaces. The list is empty when the text has fewer
        than ``n`` tokens.
    """
    tokens = list(tokenize(text))

    # Requests for n <= 1 have always meant "unigrams"; keep that contract.
    size = max(n, 1)

    return [
        (
            tokens[i].start,
            tokens[i + size - 1].stop,
            " ".join(token.text for token in tokens[i : i + size]),
        )
        for i in range(len(tokens) - size + 1)
    ]

In [9]:
predictions = []


for sample in read_jsonl(TEST_DATA):
    sentences = sample["sentences"]
    ners = []

    # Look up every n-gram, from unigrams up to MAX_NGRAM tokens.
    for size in range(1, MAX_NGRAM + 1):
        for beg, end, text in get_ngrams(sentences, size):
            key = process_text(text)

            # Spans never seen in training are skipped.
            if key not in model:
                continue

            # Pick the label most frequently seen for this span.
            best_label = model[key].most_common(1)[0][0]
            # Emit the end offset as inclusive, hence the - 1.
            ners.append([beg, end - 1, best_label])

    predictions.append({"id": sample["id"], "ners": ners})

## Submission

In [10]:
# Create the output directory if it does not exist yet, then dump the predictions.
os.makedirs(OUT_PATH, exist_ok=True)
write_jsonl(SUBMIT_PATH, predictions)

print("Done! Predictions are saved to", SUBMIT_PATH)

Done! Predictions are saved to ../../out/dict/test.jsonl
