# Comparision of 4 different sentence segmentators for Russian language:
* nltk.sent_tokenize without language argument (which uses "english" by default)
* nltk.sent_tokenize with language='russian' (donwload russian.pkl from https://github.com/mhq/train_punkt )
* a sentence segmentator from https://github.com/bureaucratic-labs/models (`pip install b-labs-models`)
* our `ru_sent_tokenize`

Following datasets are used to calculate metrics on:

* OpenCorpora (downloaded http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2 and extract)
* SynTagRus (download *.conllu files from https://github.com/UniversalDependencies/UD_Russian-SynTagRus and put them to a folder)

Modify `OPENCORP_FILE` and `SYNTAGRUS_DIR` constants in the next cell

You will need about 16 GB RAM to run this notebook

In [12]:
from pathlib import Path
import re
from lxml import etree
from itertools import groupby
from operator import itemgetter

import nltk
from b_labs_models import SentenceSegmentator
from ru_sent_tokenize import ru_sent_tokenize

OPENCORPORA = 'opencorpora'
SYNTAGRUS = 'syntagrus'

OPENCORP_FILE = '/data/annot.opcorpora.xml'
SYNTAGRUS_DIR = Path('/data/SynTagRus')
RE_ENDS_WITH_PUNCT = re.compile(r".*\W$")

In [13]:
%%time

_monolit_syntagrus = []
_compound_syntagrus = []

CONLLU_TEXTS_RE = re.compile(r'# sent_id = (.*)_(\d+)\n# text = (.*)\n', re.M)
for fn in SYNTAGRUS_DIR.glob('*.conllu'):
    with fn.open() as f:
        txt = f.read()        

    for g, data_iter in groupby((x.groups() for x in CONLLU_TEXTS_RE.finditer(txt)), key=itemgetter(0)):
        data = sorted(data_iter, key=itemgetter(1))
        for (_, _, s1) in data:
            _monolit_syntagrus.append(s1)

        for (_, _, s1), (_, _, s2) in zip(data[:-1], data[1:]):
            if RE_ENDS_WITH_PUNCT.match(s1):
                _compound_syntagrus.append((s1, s2))
                
print(f'Read {len(_monolit_syntagrus)} sentences from {SYNTAGRUS}')

del txt

Read 61889 sentences from syntagrus
CPU times: user 4.52 s, sys: 20 ms, total: 4.54 s
Wall time: 4.54 s


In [6]:
%%time

sents = list(etree.parse(OPENCORP_FILE).xpath('//source/text()'))

_monolit_oc = []
_compound_oc = []
for s1, s2 in zip(sents[:-1], sents[1:]):
    _monolit_oc.append(s1.strip())
    if RE_ENDS_WITH_PUNCT.match(s1) and not s1.strip().endswith(':') and not s2.strip().startswith('—'):
        _compound_oc.append([s1.strip(), s2.strip()])
        
print(f'Read {len(_monolit_oc)} sentences from {OPENCORPORA}')
        
del sents

Read 108959 sentences from opencorpora
CPU times: user 8.65 s, sys: 1.91 s, total: 10.6 s
Wall time: 10.6 s


In [7]:
_valid_dataset_names = {OPENCORPORA, SYNTAGRUS}
def check_sent_tokenizer(tokenizer, dataset=OPENCORPORA):
    assert dataset in _valid_dataset_names, "dataset can be one of {}".format(_valid_dataset_names)
    
    monoset = _monolit_oc if dataset == OPENCORPORA else _monolit_syntagrus
    biset = _compound_oc if dataset == OPENCORPORA else _compound_syntagrus
    correct_count_mono = 0
    for m in monoset:
        correct_count_mono += len(tokenizer(m)) == 1

    correct_count_comp = 0
    for s1, s2 in biset:
        correct_count_comp += tokenizer(s1 + ' ' + s2) == [s1, s2]

    return correct_count_mono / len(monoset), correct_count_comp / len(biset)

In [8]:
%time m, c = check_sent_tokenizer(nltk.sent_tokenize, OPENCORPORA)
print(f'nltk.sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')
print()
%time m, c = check_sent_tokenizer(nltk.sent_tokenize, SYNTAGRUS)
print(f'nltk.sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')

CPU times: user 8.54 s, sys: 959 µs, total: 8.54 s
Wall time: 8.54 s
nltk.sent_tokenizer scores: 94.30%, 86.06%
CPU times: user 5.09 s, sys: 271 µs, total: 5.09 s
Wall time: 5.09 s
nltk.sent_tokenizer scores: 98.15%, 94.95%


In [9]:
%time m, c = check_sent_tokenizer(lambda s: nltk.sent_tokenize(s, 'russian'), OPENCORPORA)
# donwload from https://github.com/mhq/train_punkt russian.pkl
print(f'nltk.sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')
print()
%time m, c = check_sent_tokenizer(lambda s: nltk.sent_tokenize(s, 'russian'), SYNTAGRUS)
# donwload from https://github.com/mhq/train_punkt russian.pkl
print(f'nltk.sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')

CPU times: user 8.51 s, sys: 16.4 ms, total: 8.53 s
Wall time: 8.53 s
nltk.sent_tokenizer scores: 95.53%, 88.37%
CPU times: user 5.08 s, sys: 31 µs, total: 5.08 s
Wall time: 5.08 s
nltk.sent_tokenizer scores: 98.44%, 95.45%


In [11]:
segmentator = SentenceSegmentator()
%time m, c = check_sent_tokenizer(lambda x: list(segmentator.split(x)), OPENCORPORA)
print(f'loose_sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')
print()
%time m, c = check_sent_tokenizer(lambda x: list(segmentator.split(x)), SYNTAGRUS)
print(f'loose_sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')

CPU times: user 5min 34s, sys: 1.01 s, total: 5min 35s
Wall time: 5min 35s
loose_sent_tokenizer scores: 97.16%, 88.62%

CPU times: user 3min 18s, sys: 124 ms, total: 3min 18s
Wall time: 3min 18s
loose_sent_tokenizer scores: 96.79%, 92.55%


In [14]:
%time m, c = check_sent_tokenizer(ru_sent_tokenize, OPENCORPORA)
print(f'loose_sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')
print()
%time m, c = check_sent_tokenizer(ru_sent_tokenize, SYNTAGRUS)
print(f'loose_sent_tokenizer scores: {m*100:.2f}%, {c*100:.2f}%')

CPU times: user 4.79 s, sys: 0 ns, total: 4.79 s
Wall time: 4.79 s
loose_sent_tokenizer scores: 98.83%, 93.19%

CPU times: user 2.81 s, sys: 0 ns, total: 2.81 s
Wall time: 2.81 s
loose_sent_tokenizer scores: 99.82%, 96.56%
