## RuCor to CoNLL-U

In [1]:
from corpuscula import Conllu
import csv
import difflib
import junky
from mordl import UposTagger, FeatsTagger, LemmaTagger
import os
import pandas as pd
#import random
import re
import textdistance
from toxine import TextPreprocessor
#from uuid import uuid4

#text_dist = textdistance.JaroWinkler().distance

#random.seed(42)  # for uuid

cdict_path = os.path.join('_models/upos-bert_model/cdict.pickle')

dataset_dir = '_dataset'
rucor_dir = os.path.join(dataset_dir, 'rucoref')
rucor_docs_fn = os.path.join(rucor_dir, 'Documents.txt')
rucor_groups_fn = os.path.join(rucor_dir, 'Groups.txt')
rucor_tokens_fn = os.path.join(rucor_dir, 'Tokens.txt')

log_fn = os.path.join(dataset_dir, 'out.log')

I1102 19:16:35.927671 139753066800960 wrapper.py:16] Loading dictionaries from /usr/local/lib/python3.6/dist-packages/pymorphy2_dicts/data
I1102 19:16:35.952976 139753066800960 wrapper.py:20] format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [2]:
tagger_u = UposTagger()
tagger_u.load('_models/upos-bert_model', device='cuda:0', dataset_device='cuda:0')
tagger_f = FeatsTagger()
tagger_f.load('_models/feats-bert_model', device='cuda:0', dataset_device='cuda:0')
#tagger_l = LemmaTagger()
#tagger_l.load('_models/lemma-ft_model', device='cuda:0', dataset_device='cuda:0')
tp = TextPreprocessor(cdict_restore_from=cdict_path)

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


Fit corpus dict... done.


In [3]:
docs = pd.read_csv(rucor_docs_fn, sep='\t', index_col='doc_id', quoting=csv.QUOTE_NONE)
groups = pd.read_csv(rucor_groups_fn, sep='\t', index_col='group_id', quoting=csv.QUOTE_NONE)

In [4]:
DOC_ID = 0
TAG_COREF_HEADS = False
NEED_SHIFT_ADJUST = True

In [5]:
def get_fns(doc_id):
    in_fn = os.path.join(rucor_dir, 'rucoref_texts', docs.loc[doc_id, 'path'])
    out_fn_ = os.path.join(dataset_dir, os.path.splitext(os.path.basename(in_fn))[0])
    out_ext_ = '.conllu'
    out_fn = out_fn_ + out_ext_
    return in_fn, out_fn

if DOC_ID:
    in_fn, out_fn = get_fns(DOC_ID)
    print(in_fn, out_fn, sep=', ')

In [6]:
def norm_punct(punct):
    return punct.replace('—', '-').replace(';', '.').replace('...', '.').replace('…', '.') \
                .replace('«', '"').replace('„', '"') \
                .replace('»', '"').replace('“', '"') \
                .replace('``', '"').replace("''", '"')

In [7]:
def get_raw(in_fn):
    re_html = re.compile('&[a-z]+;')

    with open(in_fn, 'rb') as f:
        raw = f.read().decode('utf-8-sig').lower()

        def process(match):
            text = match.group(0)
            len_text = len(text)
            text = tp._unescape_html(text)
            return ' ' * (len_text - len(text)) + text

        raw = re_html.sub(process, raw)

        raw_forms, raw_punct = [], []
        isalnum = None
        for ch in raw:
            res = ch.isalnum()
            if res:
                if res != isalnum:
                    raw_forms.append(ch)
                else:
                    raw_forms[-1] += ch
            elif not ch.isspace():
                raw_punct.append(ch)
            isalnum = res

        raw_forms_ids = []
        idx = 0
        for token in raw_forms:
            idx_ = raw.index(token, idx)
            raw_forms_ids.append(idx_)
            idx = idx_ + len(token)
        idx = 0
        raw_punct_ids = []
        for i, ch in enumerate(raw_punct):
            idx_ = raw.index(ch, idx)
            raw_punct_ids.append(idx_)
            raw_punct[i] = norm_punct(ch)
            idx = idx_ + 1

    return list(zip(raw_forms, raw_forms_ids)), list(zip(raw_punct, raw_punct_ids))

if DOC_ID:
    raw_corpus, raw_puncts = get_raw(in_fn)

In [8]:
# adjust correct shifts to the wrong ones in Rucor
def adjust_raw_corpus(doc_id, raw_corpus):
    for i, (form, idx) in enumerate(raw_corpus):
        if doc_id == 115:
            if idx >= 1288:
                raw_corpus[i] = form, idx + 2
            elif idx >= 771:
                raw_corpus[i] = form, idx + 1
        elif doc_id == 116:
            if idx >= 1545:
                raw_corpus[i] = form, idx + 7
            elif idx >= 884:
                raw_corpus[i] = form, idx + 6
            elif idx >= 858:
                raw_corpus[i] = form, idx + 5
            elif idx >= 394:
                raw_corpus[i] = form, idx + 4
            elif idx >= 388:
                raw_corpus[i] = form, idx + 3
            elif idx >= 386:
                raw_corpus[i] = form, idx + 2
            elif idx >= 165:
                raw_corpus[i] = form, idx + 1

if DOC_ID and NEED_SHIFT_ADJUST:
    adjust_raw_corpus(DOC_ID, raw_corpus)
    adjust_raw_corpus(DOC_ID, raw_puncts)

In [9]:
if DOC_ID:
    tp.clear_corpus()
    tp.load_pars(in_fn, eop=r'\n')

In [10]:
if DOC_ID:
    tp.do_all(tag_date=False, norm_punct=True)

In [11]:
if DOC_ID:
    _ = tp.save(out_fn + '$')

In [12]:
if DOC_ID:
    _ = tagger_f.predict(tagger_u.predict(out_fn + '$'), save_to=out_fn)

In [13]:
if DOC_ID:
    os.remove(out_fn + '$')

In [14]:
re_nonalnum = re.compile('(?:\W|_)+')
re_alnum = re.compile('(?:[^\W_])+')

if DOC_ID:
    corpus_orig = list(Conllu.load(out_fn))
    corpus = []

In [15]:
def make_corpus():
    tag_shortcut = tp.TAG_SHORTCUT[2:]
    masks = list(x[1:] for x in tp.TAG_MASKS.keys())
    for sent in corpus_orig:
        for tok in sent[0]:
            form, misc = tok['FORM'], tok['MISC']
            for misc_ in misc:
                if misc_ in masks:
                    form = misc[misc_]
                elif misc_ == tag_shortcut:
                    form = misc[misc_]
            corpus.append((re_nonalnum.sub('', form.lower()), misc, tok['UPOS'],
                           re_alnum.sub('', form)))

if DOC_ID:
    make_corpus()

In [16]:
def validate_corpus():
    corpus_, raw_corpus_ = re_nonalnum.sub('', ''.join(x[0] for x in corpus)), \
                           ''.join(x[0] for x in raw_corpus)
    if corpus_ != raw_corpus_:
        print('                CORPUS_:')
        print(corpus_)
        print('                RAW_CORPUS_:')
        print(raw_corpus_)
        raise ValueError('The corpus is not the same as the raw corpus!')

if DOC_ID:
    validate_corpus()

In [17]:
def process_corpus(corpus, raw_corpus):

    len_corpus, len_raw_corpus = len(corpus), len(raw_corpus)

    def find_next(i, j):
        form, misc = corpus[i][:2]
        if not form:
            return i + 1, j

        raw_form, raw_form_idx = raw_corpus[j]
        form_, raw_form_ = form, raw_form
        len_form, len_raw_form = len(form), len(raw_form)

        misc['Shift'] = str(raw_form_idx)

        i_, j_ = i + 1, j + 1
        if len_form < len_raw_form:
            while i_ < len_corpus and len(form_) < len_raw_form:
                form_ += corpus[i_][0]
                i_ += 1
            form = form_
        elif len_form > len_raw_form:
            while j_ < len_raw_corpus and len(raw_form_) < len_form:
                raw_form_ += raw_corpus[j_][0]
                j_ += 1
            raw_form = raw_form_

        if form != raw_form:
            raise ValueError('form [{}] is not equal to raw_form [{}]!'.format(form, raw_form))

        return i_, j_,

    mid_ = {'mid': 0}
    def get_mention_id():
        #mid = uuid.uuid4()
        mid_['mid'] += 1
        mid = mid_['mid']
        return str(mid)

    i = j = 0
    while i < len_corpus:
        i_, j_ = find_next(i, j)
        for ii in range(i, i_):  # TODO
            form, misc, upos = corpus[ii][:3]
            if TAG_COREF_HEADS and upos in ['NOUN', 'PRON', 'PROPN']:
                misc['Coref_' + get_mention_id()] = 'Head'
        i, j = i_, j_

if DOC_ID:
    process_corpus(corpus, raw_corpus)

In [18]:
def process_puncts(corpus, raw_puncts):

    sm = difflib.SequenceMatcher()
    len_corpus, len_raw_puncts = len(corpus), len(raw_puncts)

    def find_next(i, j):
        start = stop = 0
        for i in range(i, len_corpus):
            form, misc = corpus[i][:2]
            if not form:
                break
            shift = misc.get('Shift')
            if shift:
                start = int(shift)
        puncts, miscs = [], []
        for i_ in range(i, len_corpus):
            form, misc, _, punct = corpus[i_]
            if form:
                shift = misc.get('Shift')
                if shift:
                    stop = int(shift)
                    break
            else:
                puncts.append(norm_punct(punct))
                miscs.append(misc)
        if not stop:
            return len_corpus, j

        for j in range(j, len_raw_puncts):
            if raw_puncts[j][1] >= start:
                break
        for j_ in range(j, len_raw_puncts):
            if raw_puncts[j_][1] >= stop:
                break
        if j_ == j:
            i_, j_ = len_corpus, len_raw_puncts
        raws, shifts = zip(*raw_puncts[j:j_])
        for ir in range(len(raws) - 3):
            if raws[ir:ir + 3] == ('.',) * 3:
                raws = raws[:ir + 1] + ('', '') + raws[ir + 3:]

        sm.set_seqs(puncts, raws)
        matches = sm.get_matching_blocks()
        for a, b, size in matches:
            for k in range(size):
                miscs[a + k]['Shift'] = str(shifts[b + k])

        return i_, j_,

    i = j = 0
    while i < len_corpus:
        i, j = find_next(i, j)

if DOC_ID:
    process_puncts(corpus, raw_puncts)

In [19]:
if DOC_ID:
    Conllu.save(corpus_orig, out_fn)

In [20]:
if DOC_ID:
    corpus = list(Conllu.load(out_fn))

In [21]:
if DOC_ID:
    for sent in corpus:
        for tok in sent[0]:
            print('{:20s}{}'.format(tok['FORM'] or '', tok['MISC'].get('Shift', '')))

In [22]:
if DOC_ID:
    shifts = {}
    for sent in corpus:
        for tok in sent[0]:
            form, shift = tok['FORM'] or '', tok['MISC'].get('Shift', '')
            if shift:
                shifts[shift] = norm_punct(form)

In [23]:
if DOC_ID:
    raw_corpus

In [24]:
if DOC_ID:
    raw_puncts

In [25]:
if DOC_ID:
    for tks, tk_shifts in groups[groups['doc_id'] == DOC_ID][['content', 'tk_shifts']].values:
        tks = tks.split()
        tk_shifts = tk_shifts.split(',')
        assert len(tks) == len(tk_shifts), \
            'len({}) != len({})'.format(tks, tk_shifts)
        for tk, tk_shift in zip(tks, tk_shifts):
            tok = shifts.get(tk_shift)
            if tok is None:
                print('token {} ({}) is not found'.format(tk_shift, tk))
            elif norm_punct(tok) != norm_punct(tk):
                print('token {}: {} != {}'.format(tk_shift, tok, tk))

In [27]:
with open(log_fn, 'wt', encoding='utf-8') as f_log:

    def log(text=''):
        print(text)
        print(text, file=f_log)

    out_fn = os.path.join(dataset_dir, 'out.conllu')
    for doc_id in groups['doc_id'].unique():
        in_fn, _ = get_fns(doc_id)
        log('{}: {}'.format(doc_id, in_fn))

        raw_corpus, raw_puncts = get_raw(in_fn)
        if NEED_SHIFT_ADJUST:
            adjust_raw_corpus(doc_id, raw_corpus)
            adjust_raw_corpus(doc_id, raw_puncts)

        tp.clear_corpus()
        tp.load_pars(in_fn, eop=r'\n')
        tp.do_all(tag_date=False, norm_punct=True)
        tp.save(out_fn + '$')

        tagger_f.predict(tagger_u.predict(out_fn + '$'), save_to=out_fn)

        corpus_orig = list(Conllu.load(out_fn))
        corpus = []
        make_corpus()
        validate_corpus()
        process_corpus(corpus, raw_corpus)
        process_puncts(corpus, raw_puncts)
        Conllu.save(corpus_orig, out_fn)
        corpus = list(Conllu.load(out_fn))

        shifts = {}
        for sent in corpus:
            for tok in sent[0]:
                form, shift = tok['FORM'] or '', tok['MISC'].get('Shift', '')
                if shift:
                    shifts[shift] = norm_punct(form)
        for tks, tk_shifts in groups[groups['doc_id'] == doc_id][['content', 'tk_shifts']].values:
            tks = tks.split()
            tk_shifts = tk_shifts.split(',')
            assert len(tks) == len(tk_shifts), \
                'len({}) != len({})'.format(tks, tk_shifts)
            for tk, tk_shift in zip(tks, tk_shifts):
                tok = shifts.get(tk_shift)
                if tok is None:
                    print('token {} ({}) is not found'.format(tk_shift, tk))
                    log('token {} ({}) is not found'.format(tk_shift, tk))
                elif norm_punct(tok) != norm_punct(tk):
                    print('token {}: {} != {}'.format(tk_shift, tok, tk))
                    log('token {}: {} != {}'.format(tk_shift, tok, tk))
        log()

Load corpus... 

1: _dataset/rucoref/rucoref_texts/fiction/102_beliajev_nad_bezdnoj.txt
Load corpus
[> 0                                                             [=] 70                                                           
Corpus has been loaded: 70 sentences, 957 tokens
Processing corpus
  0%|          | 0/70 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 70                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 70 sentences, 957 tokens
Save corpus
[> 0                                                             [=] 70                                                           
Corpus has been saved


100%|██████████| 70/70 [00:00<00:00, 181.87it/s]
Processing corpus
100%|██████████| 70/70 [00:00<00:00, 206.04it/s]

Load corpus
[> 0                                                             [=] 70                                                           
Corpus has been loaded: 70 sentences, 957 tokens
Save corpus
[> 0                                                             [=] 70                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 70                                                           
Corpus has been loaded: 70 sentences, 957 tokens
Load corpus... 


token 4827: вас != вас,
token 4827: вас != вас,
token 4849: он != он,
token 4849: он != он,

2: _dataset/rucoref/rucoref_texts/fiction/107_dragunsky_volshebnaja_sila_iskusstva.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been loaded: 137 sentences, 1120 tokens
Processing corpus
  0%|          | 0/137 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been processed: 1 documents, 41 paragraphs, 137 sentences, 1120 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been saved


100%|██████████| 137/137 [00:00<00:00, 317.77it/s]
Processing corpus
100%|██████████| 137/137 [00:00<00:00, 319.25it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been loaded: 137 sentences, 1120 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 137                                                           
Corpus has been loaded: 137 sentences, 1120 tokens
Load corpus... 


token 3767: Альбертик-то != Альбертик
token 3767: Альбертик-то != Альбертик
token 3776 (-) is not found
token 3776 (-) is not found
token 3777 (то) is not found
token 3777 (то) is not found
token 4096: Альбертиком-то != Альбертиком
token 4096: Альбертиком-то != Альбертиком
token 4107 (-) is not found
token 4107 (-) is not found
token 4108 (то) is not found
token 4108 (то) is not found
token 5081: какой-то != какой
token 5081: какой-то != какой
token 5086 (-) is not found
token 5086 (-) is not found
token 5087 (то) is not found
token 5087 (то) is not found

3: _dataset/rucoref/rucoref_texts/fiction/15_paustovsky_zhilcy_starogo_doma.txt
Load corpus
[=] 54                                                           
Corpus has been loaded: 54 sentences, 862 tokens
Processing corpus
100%|██████████| 54/54 [00:00<00:00, 428962.91it/s]


done.
Preprocess corpus
[> 0                                                             [=] 54                                                           
Corpus has been processed: 1 documents, 21 paragraphs, 54 sentences, 862 tokens
Save corpus
[> 0                                                             [=] 54                                                           
Corpus has been saved


Processing corpus
100%|██████████| 54/54 [00:00<00:00, 364135.72it/s]


Load corpus
[> 0                                                             [=] 54                                                           
Corpus has been loaded: 54 sentences, 862 tokens
Save corpus
[> 0                                                             [=] 54                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 54                                                           
Corpus has been loaded: 54 sentences, 862 tokens
Load corpus... 


5: _dataset/rucoref/rucoref_texts/fiction/2_astafiev_zhizn_prozhit.txt
Load corpus
[=] 44                                                           
Corpus has been loaded: 44 sentences, 1104 tokens
Processing corpus
100%|██████████| 44/44 [00:00<00:00, 351522.62it/s]

done.
Preprocess corpus
[> 0                                                             [=] 44                                                           
Corpus has been processed: 1 documents, 8 paragraphs, 44 sentences, 1104 tokens
Save corpus
[> 0                                                             [=] 44                                                           
Corpus has been saved



Processing corpus
100%|██████████| 44/44 [00:00<00:00, 347550.61it/s]


Load corpus
[> 0                                                             [=] 44                                                           
Corpus has been loaded: 44 sentences, 1104 tokens
Save corpus
[> 0                                                             [=] 44                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 44                                                           
Corpus has been loaded: 44 sentences, 1104 tokens
Load corpus... 


6: _dataset/rucoref/rucoref_texts/fiction/30_dojl_sluchaj.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been loaded: 135 sentences, 2022 tokens
Processing corpus
  0%|          | 0/135 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been processed: 1 documents, 49 paragraphs, 135 sentences, 2022 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been saved


100%|██████████| 135/135 [00:00<00:00, 145.13it/s]
Processing corpus
100%|██████████| 135/135 [00:00<00:00, 143.09it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been loaded: 135 sentences, 2022 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 135                                                           
Corpus has been loaded: 135 sentences, 2022 tokens
Load corpus... 



7: _dataset/rucoref/rucoref_texts/fiction/34_kassil_solnce_svetit.txt
Load corpus
[> 0                                                             [=] 96                                                           
Corpus has been loaded: 96 sentences, 1502 tokens
Processing corpus
  0%|          | 0/96 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 96                                                           
Corpus has been processed: 1 documents, 40 paragraphs, 96 sentences, 1502 tokens
Save corpus
[> 0                                                             [=] 96                                                           
Corpus has been saved


100%|██████████| 96/96 [00:00<00:00, 205.64it/s]
Processing corpus
100%|██████████| 96/96 [00:00<00:00, 204.72it/s]


Load corpus
[> 0                                                             [=] 96                                                           
Corpus has been loaded: 96 sentences, 1502 tokens
Save corpus
[> 0                                                             [=] 96                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 96                                                           
Corpus has been loaded: 96 sentences, 1502 tokens
Load corpus... 

token 631: какой-то != какой
token 631: какой-то != какой
token 636 (-) is not found
token 636 (-) is not found
token 637 (то) is not found
token 637 (то) is not found

8: _dataset/rucoref/rucoref_texts/fiction/43_musatov_stozhary.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been loaded: 111 sentences, 1414 tokens
Processing corpus
  0%|          | 0/111 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been processed: 1 documents, 59 paragraphs, 111 sentences, 1414 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been saved


100%|██████████| 111/111 [00:00<00:00, 318.98it/s]
Processing corpus
100%|██████████| 111/111 [00:00<00:00, 315.97it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been loaded: 111 sentences, 1414 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 111                                                           
Corpus has been loaded: 111 sentences, 1414 tokens
Load corpus... 


9: _dataset/rucoref/rucoref_texts/fiction/44_nagibin_siren.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been loaded: 172 sentences, 3099 tokens
Processing corpus
  0%|          | 0/172 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been processed: 1 documents, 33 paragraphs, 172 sentences, 3099 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been saved


100%|██████████| 172/172 [00:00<00:00, 199.71it/s]
Processing corpus
100%|██████████| 172/172 [00:00<00:00, 198.42it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been loaded: 172 sentences, 3099 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 172                                                           
Corpus has been loaded: 172 sentences, 3099 tokens
Load corpus... 


10: _dataset/rucoref/rucoref_texts/fiction/53_beliajev_dom_s_prividenijami.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been loaded: 130 sentences, 1554 tokens
Processing corpus
  0%|          | 0/130 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been processed: 1 documents, 53 paragraphs, 130 sentences, 1554 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been saved


100%|██████████| 130/130 [00:00<00:00, 215.21it/s]
Processing corpus
100%|██████████| 130/130 [00:00<00:00, 219.12it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been loaded: 130 sentences, 1554 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 130                                                           
Corpus has been loaded: 130 sentences, 1554 tokens
Load corpus... 



11: _dataset/rucoref/rucoref_texts/fiction/5_petrushevskaya_v_detstve.txt
Load corpus
[> 0                                                             [=] 85                                                           
Corpus has been loaded: 85 sentences, 897 tokens
Processing corpus
  0%|          | 0/85 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 85                                                           
Corpus has been processed: 1 documents, 30 paragraphs, 85 sentences, 897 tokens
Save corpus
[> 0                                                             [=] 85                                                           
Corpus has been saved


100%|██████████| 85/85 [00:00<00:00, 223.50it/s]
Processing corpus
100%|██████████| 85/85 [00:00<00:00, 223.50it/s]

Load corpus
[> 0                                                             [=] 85                                                           
Corpus has been loaded: 85 sentences, 897 tokens
Save corpus
[> 0                                                             [=] 85                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 85                                                           
Corpus has been loaded: 85 sentences, 897 tokens
Load corpus... 


token 2798: младше-классника != младше
token 2798: младше-классника != младше
token 2804 (-) is not found
token 2804 (-) is not found
token 2805 (классника) is not found
token 2805 (классника) is not found

12: _dataset/rucoref/rucoref_texts/fiction/67_zamiatin_kolumb.txt
Load corpus
[> 0                                                             [=] 73                                                           
Corpus has been loaded: 73 sentences, 883 tokens
Processing corpus
  0%|          | 0/73 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 73                                                           
Corpus has been processed: 1 documents, 31 paragraphs, 73 sentences, 883 tokens
Save corpus
[> 0                                                             [=] 73                                                           
Corpus has been saved


100%|██████████| 73/73 [00:00<00:00, 233.00it/s]
Processing corpus
100%|██████████| 73/73 [00:00<00:00, 237.97it/s]

Load corpus
[> 0                                                             [=] 73                                                           
Corpus has been loaded: 73 sentences, 883 tokens
Save corpus
[> 0                                                             [=] 73                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 73                                                           
Corpus has been loaded: 73 sentences, 883 tokens
Load corpus... 



13: _dataset/rucoref/rucoref_texts/fiction/73_ilf_schastlivy_otec.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 874 tokens
Processing corpus
  0%|          | 0/103 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been processed: 1 documents, 52 paragraphs, 103 sentences, 874 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved


100%|██████████| 103/103 [00:00<00:00, 435.93it/s]
Processing corpus
100%|██████████| 103/103 [00:00<00:00, 430.77it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 874 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 874 tokens
Load corpus... 


token 3205: т. != т
token 3205: т. != т
token 3206 (.) is not found
token 3206 (.) is not found
token 3205: т. != т
token 3205: т. != т
token 3206 (.) is not found
token 3206 (.) is not found
token 3926: Сундучанский-отец != Сундучанский
token 3926: Сундучанский-отец != Сундучанский
token 3938 (-) is not found
token 3938 (-) is not found
token 3939 (отец) is not found
token 3939 (отец) is not found

15: _dataset/rucoref/rucoref_texts/fiction/andersen_motylek.txt
Load corpus
[=] 75                                                           
Corpus has been loaded: 75 sentences, 751 tokens
Processing corpus
  0%|          | 0/75 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 75                                                           
Corpus has been processed: 1 documents, 28 paragraphs, 75 sentences, 751 tokens
Save corpus
[> 0                                                             [=] 75                                                           
Corpus has been saved


100%|██████████| 75/75 [00:00<00:00, 189.80it/s]
Processing corpus
100%|██████████| 75/75 [00:00<00:00, 191.96it/s]

Load corpus
[> 0                                                             [=] 75                                                           
Corpus has been loaded: 75 sentences, 751 tokens
Save corpus
[> 0                                                             [=] 75                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 75                                                           
Corpus has been loaded: 75 sentences, 751 tokens
Load corpus... 



16: _dataset/rucoref/rucoref_texts/fiction/bazhov_travyanaja_zapadenka.txt
Load corpus
[> 0                                                             [=] 71                                                           
Corpus has been loaded: 71 sentences, 735 tokens
Processing corpus
  0%|          | 0/71 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 71                                                           
Corpus has been processed: 1 documents, 20 paragraphs, 71 sentences, 735 tokens
Save corpus
[> 0                                                             [=] 71                                                           
Corpus has been saved


100%|██████████| 71/71 [00:00<00:00, 326.63it/s]
Processing corpus
100%|██████████| 71/71 [00:00<00:00, 327.16it/s]

Load corpus
[> 0                                                             [=] 71                                                           
Corpus has been loaded: 71 sentences, 735 tokens
Save corpus
[> 0                                                             [=] 71                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 71                                                           
Corpus has been loaded: 71 sentences, 735 tokens
Load corpus... 



17: _dataset/rucoref/rucoref_texts/fiction/bunin_skazka.txt
Load corpus
[> 0                                                             [=] 94                                                           
Corpus has been loaded: 94 sentences, 1510 tokens
Processing corpus
  0%|          | 0/94 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 94                                                           
Corpus has been processed: 1 documents, 20 paragraphs, 94 sentences, 1510 tokens
Save corpus
[> 0                                                             [=] 94                                                           
Corpus has been saved


100%|██████████| 94/94 [00:00<00:00, 199.96it/s]
Processing corpus
100%|██████████| 94/94 [00:00<00:00, 197.39it/s]


Load corpus
[> 0                                                             [=] 94                                                           
Corpus has been loaded: 94 sentences, 1510 tokens
Save corpus
[> 0                                                             [=] 94                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 94                                                           
Corpus has been loaded: 94 sentences, 1510 tokens
Load corpus... 

token 3335: такой-то != такой
token 3335: такой-то != такой
token 3340 (-) is not found
token 3340 (-) is not found
token 3341 (то) is not found
token 3341 (то) is not found

18: _dataset/rucoref/rucoref_texts/fiction/dostojevskij_podrostok.txt
Load corpus
[=] 28                                                           
Corpus has been loaded: 28 sentences, 667 tokens
Processing corpus
100%|██████████| 28/28 [00:00<00:00, 99949.37it/s]

done.
Preprocess corpus
[> 0                                                             [=] 28                                                           
Corpus has been processed: 1 documents, 3 paragraphs, 28 sentences, 667 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved



Processing corpus
100%|██████████| 28/28 [00:00<00:00, 283672.73it/s]


Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 667 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 667 tokens
Load corpus... 


19: _dataset/rucoref/rucoref_texts/fiction/dovlatov_kompromiss_6.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been loaded: 113 sentences, 957 tokens
Processing corpus
  0%|          | 0/113 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been processed: 1 documents, 32 paragraphs, 113 sentences, 957 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been saved


100%|██████████| 113/113 [00:00<00:00, 249.06it/s]
Processing corpus
100%|██████████| 113/113 [00:00<00:00, 244.40it/s]
token 167: Л. != Л
token 167: Л. != Л
token 168 (.) is not found
token 168 (.) is not found

20: _dataset/rucoref/rucoref_texts/fiction/fet_knyaginya.txt


Load corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been loaded: 113 sentences, 957 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 113                                                           
Corpus has been loaded: 113 sentences, 957 tokens
Load corpus... 

Load corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been loaded: 126 sentences, 1850 tokens
Processing corpus
  0%|          | 0/126 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been processed: 1 documents, 36 paragraphs, 126 sentences, 1850 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been saved


100%|██████████| 126/126 [00:00<00:00, 247.45it/s]
Processing corpus
100%|██████████| 126/126 [00:00<00:00, 248.26it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been loaded: 126 sentences, 1850 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 126                                                           
Corpus has been loaded: 126 sentences, 1850 tokens
Load corpus... 

token 9261: каким-то != каким
token 9261: каким-то != каким
token 9266 (-) is not found
token 9266 (-) is not found
token 9267 (то) is not found
token 9267 (то) is not found

21: _dataset/rucoref/rucoref_texts/fiction/gilyarovskij_moi_skitanija.txt
Load corpus
[=] 63                                                           
Corpus has been loaded: 63 sentences, 1333 tokens
Processing corpus
100%|██████████| 63/63 [00:00<00:00, 159662.33it/s]

done.
Preprocess corpus
[> 0                                                             [=] 63                                                           
Corpus has been processed: 1 documents, 19 paragraphs, 63 sentences, 1333 tokens
Save corpus
[> 0                                                             [=] 63                                                           
Corpus has been saved



Processing corpus
100%|██████████| 63/63 [00:00<00:00, 533820.51it/s]


Load corpus
[> 0                                                             [=] 63                                                           
Corpus has been loaded: 63 sentences, 1333 tokens
Save corpus
[> 0                                                             [=] 63                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 63                                                           
Corpus has been loaded: 63 sentences, 1333 tokens
Load corpus... 

token 2774: Н. != Н
token 2774: Н. != Н
token 2775 (.) is not found
token 2775 (.) is not found
token 2777: Д. != Д
token 2777: Д. != Д
token 2778 (.) is not found
token 2778 (.) is not found

22: _dataset/rucoref/rucoref_texts/fiction/gogol_zapiski_3.txt
Load corpus
[> 0                                                             [=] 87                                                           
Corpus has been loaded: 87 sentences, 1201 tokens
Processing corpus
  0%|          | 0/87 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 87                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 87 sentences, 1201 tokens
Save corpus
[> 0                                                             [=] 87                                                           
Corpus has been saved


100%|██████████| 87/87 [00:00<00:00, 288.78it/s]
Processing corpus
100%|██████████| 87/87 [00:00<00:00, 289.22it/s]

Load corpus
[> 0                                                             [=] 87                                                           
Corpus has been loaded: 87 sentences, 1201 tokens
Save corpus
[> 0                                                             [=] 87                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 87                                                           
Corpus has been loaded: 87 sentences, 1201 tokens
Load corpus... 



23: _dataset/rucoref/rucoref_texts/fiction/harms_upadanije.txt
Load corpus
[=] 26                                                           
Corpus has been loaded: 26 sentences, 405 tokens
Processing corpus
100%|██████████| 26/26 [00:00<00:00, 234520.22it/s]
Processing corpus
100%|██████████| 26/26 [00:00<00:00, 269263.96it/s]

done.
Preprocess corpus
[> 0                                                             [=] 26                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 26 sentences, 405 tokens
Save corpus
[> 0                                                             [=] 26                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 26                                                           
Corpus has been loaded: 26 sentences, 405 tokens
Save corpus
[> 0                                                             [=] 26                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 26                                                           
Corpus has been loaded: 26 sentences, 405 tokens
Load corpus... 


24: _dataset/rucoref/rucoref_texts/fiction/korolenko_mgnovenije.txt
Load corpus
[=] 28                                                           
Corpus has been loaded: 28 sentences, 632 tokens
Processing corpus
100%|██████████| 28/28 [00:00<00:00, 234412.20it/s]


done.
Preprocess corpus
[> 0                                                             [=] 28                                                           
Corpus has been processed: 1 documents, 10 paragraphs, 28 sentences, 632 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved


Processing corpus
100%|██████████| 28/28 [00:00<00:00, 251478.61it/s]


Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 632 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 632 tokens
Load corpus... 

token 315: Хозе-Мигуэль != Хозе
token 315: Хозе-Мигуэль != Хозе
token 320 (Мигуэль) is not found
token 320 (Мигуэль) is not found
token 327 (-) is not found
token 327 (-) is not found
token 2006: Мигуэль-Хозе != Мигуэль
token 2006: Мигуэль-Хозе != Мигуэль
token 2014 (Хозе) is not found
token 2014 (Хозе) is not found
token 2018 (-) is not found
token 2018 (-) is not found

28: _dataset/rucoref/rucoref_texts/fiction/turgenev_veshnije_vody.txt
Load corpus
[=] 53                                                           
Corpus has been loaded: 53 sentences, 1199 tokens
Processing corpus
100%|██████████| 53/53 [00:00<00:00, 197422.83it/s]

done.
Preprocess corpus
[> 0                                                             [=] 53                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 53 sentences, 1199 tokens
Save corpus
[> 0                                                             [=] 53                                                           
Corpus has been saved



Processing corpus
100%|██████████| 53/53 [00:00<00:00, 132162.97it/s]


Load corpus
[> 0                                                             [=] 53                                                           
Corpus has been loaded: 53 sentences, 1199 tokens
Save corpus
[> 0                                                             [=] 53                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 53                                                           
Corpus has been loaded: 53 sentences, 1199 tokens
Load corpus... 

token 1693: госпожа != г
token 1693: госпожа != г
token 1694 (-) is not found
token 1694 (-) is not found
token 1695 (жа) is not found
token 1695 (жа) is not found

30: _dataset/rucoref/rucoref_texts/Lenta/2013_04_11_dotless_.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 382 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 269441.37it/s]
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 279620.27it/s]

done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 30 sentences, 382 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 382 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 382 tokens
Load corpus... 

token 1251: cloud.blog != cloud
token 1251: cloud.blog != cloud
token 1256 (,) is not found
token 1256 (,) is not found
token 1258 (.) is not found
token 1258 (.) is not found
token 1259 (blog) is not found
token 1259 (blog) is not found

41: _dataset/rucoref/rucoref_texts/Lenta/2013_07_31_krebs_.txt
Load corpus
[=] 26                                                           
Corpus has been loaded: 26 sentences, 481 tokens
Processing corpus
100%|██████████| 26/26 [00:00<00:00, 56474.32it/s]
Processing corpus
100%|██████████| 26/26 [00:00<00:00, 118663.66it/s]

done.
Preprocess corpus
[> 0                                                             [=] 26                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 26 sentences, 481 tokens
Save corpus
[> 0                                                             [=] 26                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 26                                                           
Corpus has been loaded: 26 sentences, 481 tokens
Save corpus
[> 0                                                             [=] 26                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 26                                                           
Corpus has been loaded: 26 sentences, 481 tokens
Load corpus... 


51: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-01-19-cutshort.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 435 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 42550.91it/s]


done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 14 sentences, 435 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved


Processing corpus
100%|██████████| 14/14 [00:00<00:00, 104857.60it/s]


Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 435 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 435 tokens
Load corpus... 


52: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-01-24-if.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 356 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 57136.03it/s]
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 63946.42it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 8 paragraphs, 22 sentences, 356 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 356 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 356 tokens
Load corpus... 

token 2: Ивано-Франковске != Ивано
token 2: Ивано-Франковске != Ивано
token 7 (-) is not found
token 7 (-) is not found
token 8 (Франковске) is not found
token 8 (Франковске) is not found

53: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-01-30-crimea.txt
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 272 tokens
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 40970.00it/s]
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 92820.01it/s]

done.
Preprocess corpus
[=] 16                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 16 sentences, 272 tokens
Save corpus
[=] 16                                                           
Corpus has been saved
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 272 tokens
Save corpus
[=] 16                                                           
Corpus has been saved
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 272 tokens
Load corpus... 



54: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-03-capitanic.txt


done.
Preprocess corpus
[> 0                                                             

Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 279 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 20061.58it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 136241.89it/s]


[=] 14                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 14 sentences, 279 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 279 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 279 tokens
Load corpus... done.
Preprocess corpus
[> 0                                                             

token 0: Премьер-министр != Премьер
token 0: Премьер-министр != Премьер
token 7 (-) is not found
token 7 (-) is not found
token 8 (министр) is not found
token 8 (министр) is not found
token 477: Премьер-министр != Премьер
token 477: Премьер-министр != Премьер
token 484 (-) is not found
token 484 (-) is not found
token 485 (министр) is not found
token 485 (министр) is not found

55: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-03-london.txt
Load corpus
[=] 28                                                           
Corpus has been loaded: 28 sentences, 489 tokens
Processing corpus
100%|██████████| 28/28 [00:00<00:00, 36035.75it/s]


[=] 28                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 28 sentences, 489 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved


Processing corpus
100%|██████████| 28/28 [00:00<00:00, 96818.23it/s]


Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 489 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 489 tokens
Load corpus... 


56: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-03-name1.txt
Load corpus
[=] 23                                                           
Corpus has been loaded: 23 sentences, 359 tokens
Processing corpus
100%|██████████| 23/23 [00:00<00:00, 102083.59it/s]
Processing corpus
100%|██████████| 23/23 [00:00<00:00, 96856.42it/s]

done.
Preprocess corpus
[> 0                                                             [=] 23                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 23 sentences, 359 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 359 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 359 tokens
Load corpus... 


57: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-03-rucksack.txt
Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 282 tokens
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 55874.39it/s]
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 41610.16it/s]

done.
Preprocess corpus
[> 0                                                             [=] 15                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 15 sentences, 282 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved





Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 282 tokens
Save corpus
[=] 15                                                           
Corpus has been saved
Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 282 tokens
Load corpus... done.
Preprocess corpus
[=] 21                                                           
Corpus has been processed: 1 documents, 8 paragraphs, 21 sentences, 389 tokens
Save corpus
[=] 21                                                           
Corpus has been saved



58: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-04-party.txt
Load corpus
[=] 21                                                           
Corpus has been loaded: 21 sentences, 389 tokens
Processing corpus
100%|██████████| 21/21 [00:00<00:00, 16248.00it/s]
Processing corpus
100%|██████████| 21/21 [00:00<00:00, 25814.88it/s]


Load corpus
[> 0                                                             [=] 21                                                           
Corpus has been loaded: 21 sentences, 389 tokens
Save corpus
[> 0                                                             [=] 21                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 21                                                           
Corpus has been loaded: 21 sentences, 389 tokens
Load corpus... 


59: _dataset/rucoref/rucoref_texts/Lenta/lenta.ru-news-2014-02-04-pyramid.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 404 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 68351.62it/s]
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 194262.50it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 22 sentences, 404 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 404 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 404 tokens
Load corpus... 


60: _dataset/rucoref/rucoref_texts/OpenCorpora/448-done.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 738 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 80401.99it/s]


done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 11 paragraphs, 30 sentences, 738 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved


Processing corpus
100%|██████████| 30/30 [00:00<00:00, 272948.20it/s]


Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 738 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 738 tokens
Load corpus... 

token 149: Нью-Йорка != Нью
token 149: Нью-Йорка != Нью
token 152 (-) is not found
token 152 (-) is not found
token 153 (Йорка) is not found
token 153 (Йорка) is not found
token 616: 16-летняя != 16
token 616: 16-летняя != 16
token 618 (-) is not found
token 618 (-) is not found
token 619 (летняя) is not found
token 619 (летняя) is not found
token 638: 74-летняя != 74
token 638: 74-летняя != 74
token 640 (-) is not found
token 640 (-) is not found
token 641 (летняя) is not found
token 641 (летняя) is not found
token 1720: Бут. != Бут
token 1720: Бут. != Бут
token 2718: Нью-Йорке != Нью
token 2718: Нью-Йорке != Нью
token 2721 (-) is not found
token 2721 (-) is not found
token 2722 (Йорке) is not found
token 2722 (Йорке) is not found
token 3054: Нью-Йорка != Нью
token 3054: Нью-Йорка != Нью
token 3057 (-) is not found
token 3057 (-) is not found
token 3058 (Йорка) is not found
token 3058 (Йорка) is not found
token 3538: 43-летнему != 43
token 3538: 43-летнему != 43
token 3540 (-) is not 

done.
Preprocess corpus
[> 0                                                             [=] 24                                                           
Corpus has been processed: 1 documents, 3 paragraphs, 24 sentences, 403 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 403 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 403 tokens
Load corpus... 


62: _dataset/rucoref/rucoref_texts/OpenCorpora/540-done.txt
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 271 tokens
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 87154.37it/s]
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 89717.73it/s]

done.
Preprocess corpus
[> 0                                                             [=] 16                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 16 sentences, 271 tokens
Save corpus
[> 0                                                             [=] 16                                                           
Corpus has been saved





Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 271 tokens
Save corpus
[=] 16                                                           
Corpus has been saved
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 271 tokens
Load corpus... 


63: _dataset/rucoref/rucoref_texts/OpenCorpora/554-done.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 612 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 28813.63it/s]


done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 17 paragraphs, 30 sentences, 612 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved


Processing corpus
100%|██████████| 30/30 [00:00<00:00, 100824.62it/s]


Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 612 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 612 tokens
Load corpus... 


64: _dataset/rucoref/rucoref_texts/OpenCorpora/559-done.txt
Load corpus
[=] 17                                                           
Corpus has been loaded: 17 sentences, 312 tokens
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 157055.44it/s]
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 168964.85it/s]

done.
Preprocess corpus
[> 0                                                             [=] 17                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 17 sentences, 312 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 312 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 312 tokens
Load corpus... 

token 1547:  != Zülch
token 1547:  != Zülch

65: _dataset/rucoref/rucoref_texts/OpenCorpora/675-done.txt
Load corpus
[=] 33                                                           
Corpus has been loaded: 33 sentences, 669 tokens
Processing corpus
100%|██████████| 33/33 [00:00<00:00, 278495.03it/s]


done.
Preprocess corpus
[> 0                                                             [=] 33                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 33 sentences, 669 tokens
Save corpus
[> 0                                                             [=] 33                                                           
Corpus has been saved


Processing corpus
100%|██████████| 33/33 [00:00<00:00, 111174.32it/s]


Load corpus
[> 0                                                             [=] 33                                                           
Corpus has been loaded: 33 sentences, 669 tokens
Save corpus
[> 0                                                             [=] 33                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 33                                                           
Corpus has been loaded: 33 sentences, 669 tokens
Load corpus... 


66: _dataset/rucoref/rucoref_texts/OpenCorpora/682-done.txt
Load corpus
[=] 20                                                           
Corpus has been loaded: 20 sentences, 393 tokens
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 165130.08it/s]
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 113821.00it/s]

done.
Preprocess corpus
[> 0                                                             [=] 20                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 20 sentences, 393 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 393 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 393 tokens
Load corpus... 

token 96: 65-летия != 65
token 96: 65-летия != 65
token 98 (-) is not found
token 98 (-) is not found
token 99 (летия) is not found
token 99 (летия) is not found
token 1116: премьер-министра != премьер
token 1116: премьер-министра != премьер
token 1123 (-) is not found
token 1123 (-) is not found
token 1124 (министра) is not found
token 1124 (министра) is not found

67: _dataset/rucoref/rucoref_texts/OpenCorpora/689-done.txt
Load corpus
[=] 31                                                           
Corpus has been loaded: 31 sentences, 669 tokens
Processing corpus
100%|██████████| 31/31 [00:00<00:00, 266441.44it/s]


done.
Preprocess corpus
[> 0                                                             [=] 31                                                           
Corpus has been processed: 1 documents, 11 paragraphs, 31 sentences, 669 tokens
Save corpus
[> 0                                                             [=] 31                                                           
Corpus has been saved


Processing corpus
100%|██████████| 31/31 [00:00<00:00, 98130.89it/s]


Load corpus
[> 0                                                             [=] 31                                                           
Corpus has been loaded: 31 sentences, 669 tokens
Save corpus
[> 0                                                             [=] 31                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 31                                                           
Corpus has been loaded: 31 sentences, 669 tokens
Load corpus... 

token 1942: господина != г
token 1942: господина != г
token 1943 (-) is not found
token 1943 (-) is not found
token 1944 (на) is not found
token 1944 (на) is not found
token 1942: господина != г
token 1942: господина != г
token 1943 (-) is not found
token 1943 (-) is not found
token 1944 (на) is not found
token 1944 (на) is not found
token 2819: господине != г
token 2819: господине != г
token 2820 (-) is not found
token 2820 (-) is not found
token 2821 (не) is not found
token 2821 (не) is not found
token 2975: санкт-петербургской != санкт
token 2975: санкт-петербургской != санкт
token 2980 (-) is not found
token 2980 (-) is not found
token 2981 (петербургской) is not found
token 2981 (петербургской) is not found

68: _dataset/rucoref/rucoref_texts/OpenCorpora/789-done.txt
Load corpus
[=] 23                                                           
Corpus has been loaded: 23 sentences, 552 tokens
Processing corpus
100%|██████████| 23/23 [00:00<00:00, 53445.42it/s]

done.
Preprocess corpus
[> 0                                                             [=] 23                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 23 sentences, 552 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved



Processing corpus
100%|██████████| 23/23 [00:00<00:00, 213426.97it/s]


Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 552 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 552 tokens
Load corpus... 

token 236: Нью-Йорке != Нью
token 236: Нью-Йорке != Нью
token 239 (-) is not found
token 239 (-) is not found
token 240 (Йорке) is not found
token 240 (Йорке) is not found
token 3266: Нью-Йорке != Нью
token 3266: Нью-Йорке != Нью
token 3269 (-) is not found
token 3269 (-) is not found
token 3270 (Йорке) is not found
token 3270 (Йорке) is not found

69: _dataset/rucoref/rucoref_texts/OpenCorpora/833-done.txt
Load corpus
[=] 20                                                           
Corpus has been loaded: 20 sentences, 344 tokens
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 64182.16it/s]
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 109655.01it/s]

done.
Preprocess corpus
[> 0                                                             [=] 20                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 20 sentences, 344 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 344 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 344 tokens
Load corpus... 

token 815 (—) is not found
token 815 (—) is not found

70: _dataset/rucoref/rucoref_texts/OpenCorpora/842-done.txt
Load corpus
[=] 10                                                           
Corpus has been loaded: 10 sentences, 296 tokens
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 42842.74it/s]
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 106454.42it/s]

done.
Preprocess corpus
[> 0                                                             [=] 10                                                           
Corpus has been processed: 1 documents, 3 paragraphs, 10 sentences, 296 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 296 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 296 tokens
Load corpus... 

token 494: Уголовно-процессуальному != Уголовно
token 494: Уголовно-процессуальному != Уголовно
token 502 (-) is not found
token 502 (-) is not found
token 503 (процессуальному) is not found
token 503 (процессуальному) is not found
token 494: Уголовно-процессуальному != Уголовно
token 494: Уголовно-процессуальному != Уголовно
token 502 (-) is not found
token 502 (-) is not found
token 503 (процессуальному) is not found
token 503 (процессуальному) is not found
token 776: М. != М
token 776: М. != М
token 777 (.) is not found
token 777 (.) is not found
token 1085: М. != М
token 1085: М. != М
token 1086 (.) is not found
token 1086 (.) is not found

71: _dataset/rucoref/rucoref_texts/OpenCorpora/850-done.txt
Load corpus
[=] 25                                                           
Corpus has been loaded: 25 sentences, 488 tokens
Processing corpus
100%|██████████| 25/25 [00:00<00:00, 53939.09it/s]
Processing corpus
100%|██████████| 25/25 [00:00<00:00, 90238.90it/s]

done.
Preprocess corpus
[> 0                                                             [=] 25                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 25 sentences, 488 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 488 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 488 tokens
Load corpus... 

token 51: 93-летний != 93
token 51: 93-летний != 93
token 53 (-) is not found
token 53 (-) is not found
token 54 (летний) is not found
token 54 (летний) is not found

72: _dataset/rucoref/rucoref_texts/OpenCorpora/870-done.txt


done.
Preprocess corpus
[> 0                                                             [=] 53                                                           
Corpus has been processed: 1 documents, 17 paragraphs, 53 sentences, 720 tokens


Load corpus
[=] 53                                                           
Corpus has been loaded: 53 sentences, 720 tokens
Processing corpus
100%|██████████| 53/53 [00:00<00:00, 223190.88it/s]

Save corpus
[> 0                                                             [=] 53                                                           
Corpus has been saved



Processing corpus
100%|██████████| 53/53 [00:00<00:00, 456464.30it/s]


Load corpus
[> 0                                                             [=] 53                                                           
Corpus has been loaded: 53 sentences, 720 tokens
Save corpus
[> 0                                                             [=] 53                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 53                                                           
Corpus has been loaded: 53 sentences, 720 tokens
Load corpus... 

token 160: муж. != муж
token 160: муж. != муж
token 695: по-видимому != по
token 695: по-видимому != по
token 697 (-) is not found
token 697 (-) is not found
token 698 (видимому) is not found
token 698 (видимому) is not found

73: _dataset/rucoref/rucoref_texts/OpenCorpora/890-done.txt
Load corpus
[=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 142321.69it/s]
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 82910.66it/s]

done.
Preprocess corpus
[> 0                                                             [=] 17                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 17 sentences, 348 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Load corpus... 


74: _dataset/rucoref/rucoref_texts/OpenCorpora/895-done.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 308 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 59724.72it/s]
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 92182.51it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 22 sentences, 308 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved





Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 308 tokens
Save corpus
[=] 22                                                           
Corpus has been saved
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 308 tokens
Load corpus... 


75: _dataset/rucoref/rucoref_texts/OpenCorpora/908-done.txt
Load corpus
[=] 13                                                           
Corpus has been loaded: 13 sentences, 267 tokens
Processing corpus
100%|██████████| 13/13 [00:00<00:00, 60316.32it/s]
Processing corpus
100%|██████████| 13/13 [00:00<00:00, 117260.11it/s]

done.
Preprocess corpus
[> 0                                                             [=] 13                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 13 sentences, 267 tokens
Save corpus
[> 0                                                             [=] 13                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 13                                                           
Corpus has been loaded: 13 sentences, 267 tokens
Save corpus
[> 0                                                             [=] 13                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 13                                                           
Corpus has been loaded: 13 sentences, 267 tokens
Load corpus... 


76: _dataset/rucoref/rucoref_texts/OpenCorpora/921-done.txt
Load corpus
[=] 49                                                           
Corpus has been loaded: 49 sentences, 830 tokens
Processing corpus
100%|██████████| 49/49 [00:00<00:00, 114241.74it/s]


done.
Preprocess corpus
[> 0                                                             [=] 49                                                           
Corpus has been processed: 1 documents, 34 paragraphs, 49 sentences, 830 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved


Processing corpus
100%|██████████| 49/49 [00:00<00:00, 174170.25it/s]


Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 830 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 830 tokens
Load corpus... 


77: _dataset/rucoref/rucoref_texts/OpenCorpora/992-done.txt
Load corpus
[=] 34                                                           
Corpus has been loaded: 34 sentences, 831 tokens
Processing corpus
100%|██████████| 34/34 [00:00<00:00, 269068.56it/s]

done.
Preprocess corpus
[> 0                                                             [=] 34                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 34 sentences, 831 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved



Processing corpus
100%|██████████| 34/34 [00:00<00:00, 96813.53it/s]


Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 831 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 831 tokens
Load corpus... 


78: _dataset/rucoref/rucoref_texts/Science/09Jan2014_bloomberg_sleep.html.txt
Load corpus
[=] 24                                                           
Corpus has been loaded: 24 sentences, 449 tokens
Processing corpus
100%|██████████| 24/24 [00:00<00:00, 56236.48it/s]
Processing corpus
100%|██████████| 24/24 [00:00<00:00, 99273.47it/s]

done.
Preprocess corpus
[> 0                                                             [=] 24                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 24 sentences, 449 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 449 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 449 tokens
Load corpus... 

token 434: Э. != Э.Чейслер
token 434: Э. != Э.Чейслер
token 2533: Э. != Э.Гринфилд
token 2533: Э. != Э.Гринфилд

80: _dataset/rucoref/rucoref_texts/fiction/strugackije_ponedelnik.txt
Load corpus
[=] 54                                                           
Corpus has been loaded: 54 sentences, 757 tokens
Processing corpus
100%|██████████| 54/54 [00:00<00:00, 272554.05it/s]


done.
Preprocess corpus
[> 0                                                             [=] 54                                                           
Corpus has been processed: 1 documents, 3 paragraphs, 54 sentences, 757 tokens
Save corpus
[> 0                                                             [=] 54                                                           
Corpus has been saved


Processing corpus
100%|██████████| 54/54 [00:00<00:00, 152622.92it/s]


Load corpus
[> 0                                                             [=] 54                                                           
Corpus has been loaded: 54 sentences, 757 tokens
Save corpus
[> 0                                                             [=] 54                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 54                                                           
Corpus has been loaded: 54 sentences, 757 tokens
Load corpus... 


81: _dataset/rucoref/rucoref_texts/News/2009-abbas5_ru.txt
Load corpus
[=] 37                                                           
Corpus has been loaded: 37 sentences, 872 tokens
Processing corpus
100%|██████████| 37/37 [00:00<00:00, 323310.93it/s]

done.
Preprocess corpus
[> 0                                                             [=] 37                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 37 sentences, 872 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved



Processing corpus
100%|██████████| 37/37 [00:00<00:00, 254408.60it/s]


Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 872 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 872 tokens
Load corpus... 

token 217: индо-пакистанское != индо
token 217: индо-пакистанское != индо
token 221 (-) is not found
token 221 (-) is not found
token 222 (пакистанское) is not found
token 222 (пакистанское) is not found
token 2237: 1960-х != 1960
token 2237: 1960-х != 1960
token 2241 (-) is not found
token 2241 (-) is not found
token 2242 (х) is not found
token 2242 (х) is not found
token 2246: 1970-х != 1970
token 2246: 1970-х != 1970
token 2250 (-) is not found
token 2250 (-) is not found
token 2251 (х) is not found
token 2251 (х) is not found
token 2679: Лашкар-и != Лашкар
token 2679: Лашкар-и != Лашкар
token 2686 (и) is not found
token 2686 (и) is not found
token 2687 (-) is not found
token 2687 (-) is not found
token 3452: Джамаат-уд != Джамаат
token 3452: Джамаат-уд != Джамаат
token 3460 (уд) is not found
token 3460 (уд) is not found
token 3462 (-) is not found
token 3462 (-) is not found
token 4166: "Мы != Мы
token 4166: "Мы != Мы

82: _dataset/rucoref/rucoref_texts/News/2009-abbas6_ru.txt
Load

done.
Preprocess corpus
[> 0                                                             [=] 41                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 41 sentences, 870 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved



Processing corpus
100%|██████████| 41/41 [00:00<00:00, 125066.52it/s]


Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 870 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 870 tokens
Load corpus... 

token 1130: Северо-западной != Северо
token 1130: Северо-западной != Северо
token 1136 (-) is not found
token 1136 (-) is not found
token 1137 (западной) is not found
token 1137 (западной) is not found

83: _dataset/rucoref/rucoref_texts/News/2009-abbas7_ru.txt
Load corpus
[=] 38                                                           
Corpus has been loaded: 38 sentences, 769 tokens
Processing corpus
100%|██████████| 38/38 [00:00<00:00, 332049.07it/s]

done.
Preprocess corpus
[> 0                                                             [=] 38                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 38 sentences, 769 tokens
Save corpus
[> 0                                                             [=] 38                                                           
Corpus has been saved



Processing corpus
100%|██████████| 38/38 [00:00<00:00, 314987.26it/s]


Load corpus
[> 0                                                             [=] 38                                                           
Corpus has been loaded: 38 sentences, 769 tokens
Save corpus
[> 0                                                             [=] 38                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 38                                                           
Corpus has been loaded: 38 sentences, 769 tokens
Load corpus... 

token 656: Rah-e != Rah
token 656: Rah-e != Rah
token 659 (-) is not found
token 659 (-) is not found
token 660 (e) is not found
token 660 (e) is not found
token 661 (-) is not found
token 661 (-) is not found
token 662: -Nijat != Nijat
token 662: -Nijat != Nijat

84: _dataset/rucoref/rucoref_texts/News/2009-abusada7_ru.txt
Load corpus
[=] 42                                                           
Corpus has been loaded: 42 sentences, 865 tokens
Processing corpus
100%|██████████| 42/42 [00:00<00:00, 211477.51it/s]

done.
Preprocess corpus
[> 0                                                             [=] 42                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 42 sentences, 865 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved



Processing corpus
100%|██████████| 42/42 [00:00<00:00, 108607.13it/s]


Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 865 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 865 tokens
Load corpus... 

token 674: Аль-Каиды != Аль
token 674: Аль-Каиды != Аль
token 677 (-) is not found
token 677 (-) is not found
token 678 (Каиды) is not found
token 678 (Каиды) is not found
token 1652: Аль-Каиды != Аль
token 1652: Аль-Каиды != Аль
token 1655 (-) is not found
token 1655 (-) is not found
token 1656 (Каиды) is not found
token 1656 (Каиды) is not found
token 1964: Аль-Каиды != Аль
token 1964: Аль-Каиды != Аль
token 1967 (-) is not found
token 1967 (-) is not found
token 1968 (Каиды) is not found
token 1968 (Каиды) is not found
token 2090: Аль-Каиды != Аль
token 2090: Аль-Каиды != Аль
token 2093 (-) is not found
token 2093 (-) is not found
token 2094 (Каиды) is not found
token 2094 (Каиды) is not found
token 2861: Аль-Каидой != Аль
token 2861: Аль-Каидой != Аль
token 2864 (-) is not found
token 2864 (-) is not found
token 2865 (Каидой) is not found
token 2865 (Каидой) is not found
token 3272: Аль-Каиды != Аль
token 3272: Аль-Каиды != Аль
token 3275 (-) is not found
token 3275 (-) is not foun

done.
Preprocess corpus
[> 0                                                             [=] 28                                                           
Corpus has been processed: 1 documents, 10 paragraphs, 28 sentences, 694 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved



Processing corpus
100%|██████████| 28/28 [00:00<00:00, 223270.94it/s]


Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 694 tokens
Save corpus
[> 0                                                             [=] 28                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 28                                                           
Corpus has been loaded: 28 sentences, 694 tokens
Load corpus... 


86: _dataset/rucoref/rucoref_texts/News/2009-annan2_ru.txt
Load corpus
[=] 27                                                           
Corpus has been loaded: 27 sentences, 624 tokens
Processing corpus
100%|██████████| 27/27 [00:00<00:00, 121770.12it/s]

done.
Preprocess corpus
[> 0                                                             [=] 27                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 27 sentences, 624 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved



Processing corpus
100%|██████████| 27/27 [00:00<00:00, 130769.29it/s]


Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 624 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 624 tokens
Load corpus... 


87: _dataset/rucoref/rucoref_texts/News/2009-annan3_ru.txt
Load corpus
[=] 36                                                           
Corpus has been loaded: 36 sentences, 973 tokens
Processing corpus
100%|██████████| 36/36 [00:00<00:00, 63230.71it/s]

done.
Preprocess corpus
[> 0                                                             [=] 36                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 36 sentences, 973 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved



Processing corpus
100%|██████████| 36/36 [00:00<00:00, 145047.98it/s]


Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 973 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 973 tokens
Load corpus... 

token 3401: Бреттон-Вудские != Бреттон
token 3401: Бреттон-Вудские != Бреттон
token 3408 (-) is not found
token 3408 (-) is not found
token 3409 (Вудские) is not found
token 3409 (Вудские) is not found

88: _dataset/rucoref/rucoref_texts/News/2009-aslund24_ru.txt
Load corpus
[=] 47                                                           
Corpus has been loaded: 47 sentences, 1030 tokens
Processing corpus
100%|██████████| 47/47 [00:00<00:00, 406458.33it/s]

done.
Preprocess corpus
[> 0                                                             [=] 47                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 47 sentences, 1030 tokens
Save corpus
[> 0                                                             [=] 47                                                           
Corpus has been saved



Processing corpus
100%|██████████| 47/47 [00:00<00:00, 114545.20it/s]


Load corpus
[> 0                                                             [=] 47                                                           
Corpus has been loaded: 47 sentences, 1030 tokens
Save corpus
[> 0                                                             [=] 47                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 47                                                           
Corpus has been loaded: 47 sentences, 1030 tokens
Load corpus... 

token 29: премьер-министр != премьер
token 29: премьер-министр != премьер
token 36 (-) is not found
token 36 (-) is not found
token 37 (министр) is not found
token 37 (министр) is not found
token 917:  != ВТО
token 917:  != ВТО
token 3083:  != 2020
token 3083:  != 2020
token 3087 (”) is not found
token 3087 (”) is not found
token 3501:  != инерции
token 3501:  != инерции
token 3508 (”) is not found
token 3508 (”) is not found

89: _dataset/rucoref/rucoref_texts/News/2009-asteiner3_ru.txt
Load corpus
[=] 27                                                           
Corpus has been loaded: 27 sentences, 805 tokens
Processing corpus
100%|██████████| 27/27 [00:00<00:00, 222051.39it/s]

done.
Preprocess corpus
[> 0                                                             [=] 27                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 27 sentences, 805 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved



Processing corpus
100%|██████████| 27/27 [00:00<00:00, 241463.13it/s]


Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 805 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 805 tokens
Load corpus... 

token 1625: Мьюнг-Бака != Мьюнг
token 1625: Мьюнг-Бака != Мьюнг
token 1630 (-) is not found
token 1630 (-) is not found
token 1631 (Бака) is not found
token 1631 (Бака) is not found

90: _dataset/rucoref/rucoref_texts/News/2009-asteiner5_ru.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 641 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 54120.05it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 22 sentences, 641 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved



Processing corpus
100%|██████████| 22/22 [00:00<00:00, 75203.49it/s]


Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 641 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 641 tokens
Load corpus... 


91: _dataset/rucoref/rucoref_texts/News/2009-avineri35_ru.txt
Load corpus
[=] 35                                                           
Corpus has been loaded: 35 sentences, 797 tokens
Processing corpus
100%|██████████| 35/35 [00:00<00:00, 298375.28it/s]

done.
Preprocess corpus
[> 0                                                             [=] 35                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 35 sentences, 797 tokens
Save corpus
[> 0                                                             [=] 35                                                           
Corpus has been saved



Processing corpus
100%|██████████| 35/35 [00:00<00:00, 128210.17it/s]


Load corpus
[> 0                                                             [=] 35                                                           
Corpus has been loaded: 35 sentences, 797 tokens
Save corpus
[> 0                                                             [=] 35                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 35                                                           
Corpus has been loaded: 35 sentences, 797 tokens
Load corpus... 

token 3308: премьер-министр != премьер
token 3308: премьер-министр != премьер
token 3315 (-) is not found
token 3315 (-) is not found
token 3316 (министр) is not found
token 3316 (министр) is not found

92: _dataset/rucoref/rucoref_texts/News/2009-bakker3_ru.txt
Load corpus
[=] 39                                                           
Corpus has been loaded: 39 sentences, 757 tokens
Processing corpus
100%|██████████| 39/39 [00:00<00:00, 256794.12it/s]

done.
Preprocess corpus
[> 0                                                             [=] 39                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 39 sentences, 757 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved



Processing corpus
100%|██████████| 39/39 [00:00<00:00, 313969.01it/s]


Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 757 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 757 tokens
Load corpus... 


93: _dataset/rucoref/rucoref_texts/News/2009-bakker4_ru.txt
Load corpus
[=] 39                                                           
Corpus has been loaded: 39 sentences, 816 tokens
Processing corpus
100%|██████████| 39/39 [00:00<00:00, 342213.09it/s]

done.
Preprocess corpus
[> 0                                                             [=] 39                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 39 sentences, 816 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved



Processing corpus
100%|██████████| 39/39 [00:00<00:00, 345100.96it/s]


Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 816 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 816 tokens
Load corpus... 

token 86: G-20 != G
token 86: G-20 != G
token 87 (-) is not found
token 87 (-) is not found
token 88 (20) is not found
token 88 (20) is not found
token 1119: G-20 != G
token 1119: G-20 != G
token 1120 (-) is not found
token 1120 (-) is not found
token 1121 (20) is not found
token 1121 (20) is not found
token 1687: G-20 != G
token 1687: G-20 != G
token 1688 (-) is not found
token 1688 (-) is not found
token 1689 (20) is not found
token 1689 (20) is not found
token 1687: G-20 != G
token 1687: G-20 != G
token 1688 (-) is not found
token 1688 (-) is not found
token 1689 (20) is not found
token 1689 (20) is not found
token 2429: G-20 != G
token 2429: G-20 != G
token 2430 (-) is not found
token 2430 (-) is not found
token 2431 (20) is not found
token 2431 (20) is not found
token 2521: G-20 != G
token 2521: G-20 != G
token 2522 (-) is not found
token 2522 (-) is not found
token 2523 (20) is not found
token 2523 (20) is not found
token 3066: G-20 != G
token 3066: G-20 != G
token 3067 (-) is no

done.
Preprocess corpus
[> 0                                                             [=] 39                                                           
Corpus has been processed: 1 documents, 15 paragraphs, 39 sentences, 943 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved



Processing corpus
100%|██████████| 39/39 [00:00<00:00, 150347.29it/s]


Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 943 tokens
Save corpus
[> 0                                                             [=] 39                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 39                                                           
Corpus has been loaded: 39 sentences, 943 tokens
Load corpus... 

token 4008: низкообогащенный != низко
token 4008: низкообогащенный != низко
token 4013 (-) is not found
token 4013 (-) is not found
token 4014 (обогащенный) is not found
token 4014 (обогащенный) is not found

95: _dataset/rucoref/rucoref_texts/News/2009-barbarosie1_ru.txt
Load corpus
[=] 36                                                           
Corpus has been loaded: 36 sentences, 767 tokens
Processing corpus
100%|██████████| 36/36 [00:00<00:00, 179542.15it/s]

done.
Preprocess corpus
[> 0                                                             [=] 36                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 36 sentences, 767 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved



Processing corpus
100%|██████████| 36/36 [00:00<00:00, 318554.73it/s]


Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 767 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 767 tokens
Load corpus... 


96: _dataset/rucoref/rucoref_texts/News/2009-barnett1_ru.txt
Load corpus
[=] 37                                                           
Corpus has been loaded: 37 sentences, 1050 tokens
Processing corpus
100%|██████████| 37/37 [00:00<00:00, 145172.36it/s]

done.
Preprocess corpus
[> 0                                                             [=] 37                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 37 sentences, 1050 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved



Processing corpus
100%|██████████| 37/37 [00:00<00:00, 136370.17it/s]


Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 1050 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 1050 tokens
Load corpus... 


97: _dataset/rucoref/rucoref_texts/News/2009-barroso1_ru.txt
Load corpus
[=] 34                                                           
Corpus has been loaded: 34 sentences, 950 tokens
Processing corpus
100%|██████████| 34/34 [00:00<00:00, 149639.39it/s]

done.
Preprocess corpus
[> 0                                                             [=] 34                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 34 sentences, 950 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved



Processing corpus
100%|██████████| 34/34 [00:00<00:00, 111672.93it/s]


Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 950 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 950 tokens
Load corpus... 


98: _dataset/rucoref/rucoref_texts/News/2009-barroso3_ru.txt
Load corpus
[=] 59                                                           
Corpus has been loaded: 59 sentences, 965 tokens
Processing corpus
100%|██████████| 59/59 [00:00<00:00, 459970.14it/s]

done.
Preprocess corpus
[> 0                                                             [=] 59                                                           
Corpus has been processed: 1 documents, 15 paragraphs, 59 sentences, 965 tokens
Save corpus
[> 0                                                             [=] 59                                                           
Corpus has been saved



Processing corpus
100%|██████████| 59/59 [00:00<00:00, 160586.59it/s]


Load corpus
[> 0                                                             [=] 59                                                           
Corpus has been loaded: 59 sentences, 965 tokens
Save corpus
[> 0                                                             [=] 59                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 59                                                           
Corpus has been loaded: 59 sentences, 965 tokens
Load corpus... 


99: _dataset/rucoref/rucoref_texts/News/2009-beasley1_ru.txt
Load corpus
[=] 34                                                           
Corpus has been loaded: 34 sentences, 845 tokens
Processing corpus
100%|██████████| 34/34 [00:00<00:00, 301493.31it/s]

done.
Preprocess corpus
[> 0                                                             [=] 34                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 34 sentences, 845 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved



Processing corpus
100%|██████████| 34/34 [00:00<00:00, 140915.35it/s]


Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 845 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 845 tokens
Load corpus... 

token 405: 80-х != 80
token 405: 80-х != 80
token 407 (-) is not found
token 407 (-) is not found
token 408 (х) is not found
token 408 (х) is not found
token 410: гг. != гг
token 410: гг. != гг
token 1405:  != 200�000
token 1405:  != 200�000
token 3598: респираторно-синцитиальный != респираторно
token 3598: респираторно-синцитиальный != респираторно
token 3610 (-) is not found
token 3610 (-) is not found
token 3611 (синцитиальный) is not found
token 3611 (синцитиальный) is not found

100: _dataset/rucoref/rucoref_texts/News/2009-bebchuk1_ru.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 732 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 77148.45it/s]

done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 30 sentences, 732 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved



Processing corpus
100%|██████████| 30/30 [00:00<00:00, 195386.83it/s]


Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 732 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 732 tokens
Load corpus... 


111: _dataset/rucoref/rucoref_texts/OpenCorpora/347-done.txt
Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 375 tokens
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 35010.88it/s]
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 149796.57it/s]

done.
Preprocess corpus
[> 0                                                             [=] 15                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 15 sentences, 375 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 375 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 375 tokens
Load corpus... 


112: _dataset/rucoref/rucoref_texts/OpenCorpora/388-done.txt
Load corpus
[=] 20                                                           
Corpus has been loaded: 20 sentences, 526 tokens
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 61141.46it/s]


done.
Preprocess corpus
[> 0                                                             [=] 20                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 20 sentences, 526 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved


Processing corpus
100%|██████████| 20/20 [00:00<00:00, 182758.34it/s]


Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 526 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 526 tokens
Load corpus... 

token 1063: М. != М.П
token 1063: М. != М.П
token 1066 (.) is not found
token 1066 (.) is not found

113: _dataset/rucoref/rucoref_texts/Otzyvy/www.turpravda.ru-gr-halkidiki-Potidea_Palace-h13681-r59401.txt
Load corpus
[=] 40                                                           
Corpus has been loaded: 40 sentences, 528 tokens
Processing corpus
100%|██████████| 40/40 [00:00<00:00, 43262.55it/s]


done.
Preprocess corpus
[> 0                                                             [=] 40                                                           
Corpus has been processed: 1 documents, 1 paragraphs, 40 sentences, 528 tokens
Save corpus
[> 0                                                             [=] 40                                                           
Corpus has been saved


Processing corpus
100%|██████████| 40/40 [00:00<00:00, 109726.72it/s]


Load corpus
[> 0                                                             [=] 40                                                           
Corpus has been loaded: 40 sentences, 528 tokens
Save corpus
[> 0                                                             [=] 40                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 40                                                           
Corpus has been loaded: 40 sentences, 528 tokens
Load corpus... 


114: _dataset/rucoref/rucoref_texts/Otzyvy/www.turpravda.ru-gr-halkidiki-Potidea_Palace-h13681-r67576.txt
Load corpus
[=] 17                                                           
Corpus has been loaded: 17 sentences, 428 tokens
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 81118.51it/s]

done.
Preprocess corpus
[> 0                                                             [=] 17                                                           
Corpus has been processed: 1 documents, 1 paragraphs, 17 sentences, 428 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved



Processing corpus
100%|██████████| 17/17 [00:00<00:00, 148858.39it/s]


Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 428 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 428 tokens
Load corpus... 

token 298 (моему) is not found
token 298 (моему) is not found

115: _dataset/rucoref/rucoref_texts/Otzyvy/www.turpravda.ru-gr-halkidiki-Potidea_Palace-h13681-r68708.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 445 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 52015.04it/s]
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 104147.50it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 22 sentences, 445 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 445 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 445 tokens
Load corpus... 

token 73: нем. != нем
token 73: нем. != нем
token 288: 4+ != 4
token 288: 4+ != 4
token 289 (+) is not found
token 289 (+) is not found
token 1300: Гуру != гуру
token 1300: Гуру != гуру

116: _dataset/rucoref/rucoref_texts/Otzyvy/www.turpravda.ru-gr-halkidiki-Potidea_Palace-h13681-r69802.txt
Load corpus
[=] 27                                                           
Corpus has been loaded: 27 sentences, 330 tokens
Processing corpus
100%|██████████| 27/27 [00:00<00:00, 129869.50it/s]
Processing corpus
100%|██████████| 27/27 [00:00<00:00, 185345.68it/s]

done.
Preprocess corpus
[> 0                                                             [=] 27                                                           
Corpus has been processed: 1 documents, 1 paragraphs, 27 sentences, 330 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 330 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 330 tokens
Load corpus... 


117: _dataset/rucoref/rucoref_texts/Otzyvy/www.turpravda.ru-gr-halkidiki-Potidea_Palace-h13681-r72202.txt
Load corpus
[=] 24                                                           
Corpus has been loaded: 24 sentences, 1016 tokens
Processing corpus
100%|██████████| 24/24 [00:00<00:00, 186759.36it/s]


done.
Preprocess corpus
[> 0                                                             [=] 24                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 24 sentences, 1016 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved


Processing corpus
100%|██████████| 24/24 [00:00<00:00, 86853.58it/s]


Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 1016 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 1016 tokens
Load corpus... 


118: _dataset/rucoref/rucoref_texts/PhotoDescr/PhotoDescr1.txt
Load corpus
[=] 6                                                            
Corpus has been loaded: 6 sentences, 107 tokens
Processing corpus
100%|██████████| 6/6 [00:00<00:00, 57719.78it/s]
Processing corpus
100%|██████████| 6/6 [00:00<00:00, 66225.85it/s]

done.
Preprocess corpus
[> 0                                                             [=] 6                                                           
Corpus has been processed: 1 documents, 1 paragraphs, 6 sentences, 107 tokens
Save corpus
[> 0                                                             [=] 6                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 6                                                           
Corpus has been loaded: 6 sentences, 107 tokens
Save corpus
[> 0                                                             [=] 6                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 6                                                           
Corpus has been loaded: 6 sentences, 107 tokens
Load corpus... 



119: _dataset/rucoref/rucoref_texts/PhotoDescr/PhotoDescr11.txt
Load corpus
[=] 9                                                            
Corpus has been loaded: 9 sentences, 187 tokens
Processing corpus
100%|██████████| 9/9 [00:00<00:00, 28361.18it/s]

done.
Preprocess corpus
[> 0                                                             [=] 9                                                           
Corpus has been processed: 1 documents, 2 paragraphs, 9 sentences, 187 tokens
Save corpus
[> 0                                                             [=] 9                                                           
Corpus has been saved



Processing corpus
100%|██████████| 9/9 [00:00<00:00, 86381.55it/s]


Load corpus
[> 0                                                             [=] 9                                                           
Corpus has been loaded: 9 sentences, 187 tokens
Save corpus
[> 0                                                             [=] 9                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 9                                                           
Corpus has been loaded: 9 sentences, 187 tokens
Load corpus... 

token 28: какого-то != какого
token 28: какого-то != какого
token 34 (-) is not found
token 34 (-) is not found
token 35 (то) is not found
token 35 (то) is not found

120: _dataset/rucoref/rucoref_texts/PhotoDescr/PhotoDescr15.txt
Load corpus
[=] 6                                                            
Corpus has been loaded: 6 sentences, 83 tokens
Processing corpus
100%|██████████| 6/6 [00:00<00:00, 34952.53it/s]

done.
Preprocess corpus
[> 0                                                             [=] 6                                                           
Corpus has been processed: 1 documents, 1 paragraphs, 6 sentences, 83 tokens
Save corpus
[> 0                                                             [=] 6                                                           
Corpus has been saved



Processing corpus
100%|██████████| 6/6 [00:00<00:00, 25140.68it/s]


Load corpus
[> 0                                                             [=] 6                                                           
Corpus has been loaded: 6 sentences, 83 tokens
Save corpus
[> 0                                                             [=] 6                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 6                                                           
Corpus has been loaded: 6 sentences, 83 tokens
Load corpus... 


131: _dataset/rucoref/rucoref_texts/Science/nauka i zhizn_mars.txt


done.
Preprocess corpus
[=] 174                                                           
Corpus has been processed: 1 documents, 55 paragraphs, 174 sentences, 3704 tokens
Save corpus
[=] 174                                                           
Corpus has been saved


Load corpus
[=] 174                                                           
Corpus has been loaded: 174 sentences, 3704 tokens
Processing corpus
100%|██████████| 174/174 [00:01<00:00, 158.88it/s]
Processing corpus
100%|██████████| 174/174 [00:01<00:00, 158.55it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been loaded: 174 sentences, 3704 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been loaded: 174 sentences, 3704 tokens
Load corpus... 


132: _dataset/rucoref/rucoref_texts/Science/nauka i zhizn_pererabotka.txt
Load corpus
[> 0                                                             [=] 95                                                           
Corpus has been loaded: 95 sentences, 1812 tokens
Processing corpus
  0%|          | 0/95 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 95                                                           
Corpus has been processed: 1 documents, 46 paragraphs, 95 sentences, 1812 tokens
Save corpus
[> 0                                                             [=] 95                                                           
Corpus has been saved


100%|██████████| 95/95 [00:00<00:00, 184.50it/s]
Processing corpus
100%|██████████| 95/95 [00:00<00:00, 184.76it/s]


Load corpus
[> 0                                                             [=] 95                                                           
Corpus has been loaded: 95 sentences, 1812 tokens
Save corpus
[> 0                                                             [=] 95                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 95                                                           
Corpus has been loaded: 95 sentences, 1812 tokens
Load corpus... 


133: _dataset/rucoref/rucoref_texts/Science/philology.ru-linguistics1-alpatov-12-out2.txt
Load corpus
[=] 46                                                           
Corpus has been loaded: 46 sentences, 1108 tokens
Processing corpus
100%|██████████| 46/46 [00:00<00:00, 94346.20it/s]

done.
Preprocess corpus
[> 0                                                             [=] 46                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 46 sentences, 1108 tokens
Save corpus
[> 0                                                             [=] 46                                                           
Corpus has been saved



Processing corpus
100%|██████████| 46/46 [00:00<00:00, 152279.39it/s]


Load corpus
[> 0                                                             [=] 46                                                           
Corpus has been loaded: 46 sentences, 1108 tokens
Save corpus
[> 0                                                             [=] 46                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 46                                                           
Corpus has been loaded: 46 sentences, 1108 tokens
Load corpus... 

token 1453: Ф. != Ф.Л
token 1453: Ф. != Ф.Л
token 1456 (.) is not found
token 1456 (.) is not found
token 2734: кто-то != кто
token 2734: кто-то != кто
token 2737 (-) is not found
token 2737 (-) is not found
token 2738 (то) is not found
token 2738 (то) is not found
token 4711: Н. != Н.Я
token 4711: Н. != Н.Я
token 4714 (.) is not found
token 4714 (.) is not found
token 4860: Н. != Н
token 4860: Н. != Н
token 4861 (.) is not found
token 4861 (.) is not found
token 4863: Я. != Я
token 4863: Я. != Я
token 4864 (.) is not found
token 4864 (.) is not found
token 5106: Н. != Н.Я
token 5106: Н. != Н.Я
token 5109 (.) is not found
token 5109 (.) is not found
token 5703: Н. != Н.Я
token 5703: Н. != Н.Я
token 5706 (.) is not found
token 5706 (.) is not found
token 5967: Н. != Н.Я
token 5967: Н. != Н.Я
token 5970 (.) is not found
token 5970 (.) is not found
token 6441: Н. != Н.Я
token 6441: Н. != Н.Я
token 6444 (.) is not found
token 6444 (.) is not found

134: _dataset/rucoref/rucoref_texts/Scien

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been processed: 1 documents, 37 paragraphs, 128 sentences, 2597 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been saved


100%|██████████| 128/128 [00:00<00:00, 137.43it/s]
Processing corpus
100%|██████████| 128/128 [00:00<00:00, 138.48it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been loaded: 128 sentences, 2597 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been loaded: 128 sentences, 2597 tokens
Load corpus... 


token 820: акад. != акад
token 820: акад. != акад
token 824 (.) is not found
token 824 (.) is not found
token 826: П. != П
token 826: П. != П
token 827 (.) is not found
token 827 (.) is not found
token 829: С. != С
token 829: С. != С
token 830 (.) is not found
token 830 (.) is not found
token 3081: Г. != Г.С
token 3081: Г. != Г.С
token 3084 (.) is not found
token 3084 (.) is not found
token 3096: Г. != Г.С
token 3096: Г. != Г.С
token 3099 (.) is not found
token 3099 (.) is not found
token 4013: Г. != Г.С
token 4013: Г. != Г.С
token 4016 (.) is not found
token 4016 (.) is not found
token 4467: Г. != Г.С
token 4467: Г. != Г.С
token 4470 (.) is not found
token 4470 (.) is not found
token 4736: Г. != Г.С
token 4736: Г. != Г.С
token 4739 (.) is not found
token 4739 (.) is not found
token 5587: Г. != Г.С
token 5587: Г. != Г.С
token 5590 (.) is not found
token 5590 (.) is not found
token 5635: в. != в
token 5635: в. != в
token 5636 (.) is not found
token 5636 (.) is not found
token 6543: Н. 

done.
Preprocess corpus
[> 0                                                             [=] 42                                                           
Corpus has been processed: 1 documents, 42 paragraphs, 42 sentences, 1018 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved


Processing corpus
100%|██████████| 42/42 [00:00<00:00, 97758.47it/s]


Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 1018 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 1018 tokens
Load corpus... 

token 233: ток-шоу != ток
token 233: ток-шоу != ток
token 236 (-) is not found
token 236 (-) is not found
token 237 (шоу) is not found
token 237 (шоу) is not found
token 3110: О. != О.С.П.
token 3110: О. != О.С.П.
token 233: ток-шоу != ток
token 233: ток-шоу != ток
token 236 (-) is not found
token 236 (-) is not found
token 237 (шоу) is not found
token 237 (шоу) is not found

203: _dataset/rucoref/rucoref_texts/OFC/3.txt
Load corpus
[=] 12                                                           
Corpus has been loaded: 12 sentences, 318 tokens
Processing corpus
100%|██████████| 12/12 [00:00<00:00, 41187.93it/s]

done.
Preprocess corpus
[> 0                                                             [=] 12                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 12 sentences, 318 tokens
Save corpus
[> 0                                                             [=] 12                                                           
Corpus has been saved



Processing corpus
100%|██████████| 12/12 [00:00<00:00, 68853.14it/s]


Load corpus
[> 0                                                             [=] 12                                                           
Corpus has been loaded: 12 sentences, 318 tokens
Save corpus
[> 0                                                             [=] 12                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 12                                                           
Corpus has been loaded: 12 sentences, 318 tokens
Load corpus... 

token 434: 100-метровый != 100
token 434: 100-метровый != 100
token 437 (-) is not found
token 437 (-) is not found
token 438 (метровый) is not found
token 438 (метровый) is not found
token 434: 100-метровый != 100
token 434: 100-метровый != 100
token 437 (-) is not found
token 437 (-) is not found
token 438 (метровый) is not found
token 438 (метровый) is not found

204: _dataset/rucoref/rucoref_texts/OFC/4.txt
Load corpus
[=] 60                                                           
Corpus has been loaded: 60 sentences, 1304 tokens
Processing corpus
100%|██████████| 60/60 [00:00<00:00, 272062.96it/s]

done.
Preprocess corpus
[> 0                                                             [=] 60                                                           
Corpus has been processed: 1 documents, 55 paragraphs, 60 sentences, 1304 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved



Processing corpus
100%|██████████| 60/60 [00:00<00:00, 230667.50it/s]


Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1304 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1304 tokens
Load corpus... 

token 1501 (–) is not found
token 1501 (–) is not found
token 1576: Уивера-старшего != Уивера
token 1576: Уивера-старшего != Уивера
token 1582 (-) is not found
token 1582 (-) is not found
token 1583 (старшего) is not found
token 1583 (старшего) is not found
token 2458: Уивер-младший != Уивер
token 2458: Уивер-младший != Уивер
token 2463 (-) is not found
token 2463 (-) is not found
token 2464 (младший) is not found
token 2464 (младший) is not found

205: _dataset/rucoref/rucoref_texts/OFC/5.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been loaded: 128 sentences, 1348 tokens
Processing corpus
  0%|          | 0/128 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been processed: 1 documents, 128 paragraphs, 128 sentences, 1348 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been saved


100%|██████████| 128/128 [00:00<00:00, 221.55it/s]
Processing corpus
100%|██████████| 128/128 [00:00<00:00, 219.02it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been loaded: 128 sentences, 1348 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 128                                                           
Corpus has been loaded: 128 sentences, 1348 tokens
Load corpus... 


token 1152: хеппи-энд != хеппи
token 1152: хеппи-энд != хеппи
token 1157 (-) is not found
token 1157 (-) is not found
token 1158 (энд) is not found
token 1158 (энд) is not found

207: _dataset/rucoref/rucoref_texts/OFC/7.txt
Load corpus
[=] 41                                                           
Corpus has been loaded: 41 sentences, 1139 tokens
Processing corpus
100%|██████████| 41/41 [00:00<00:00, 118434.20it/s]


done.
Preprocess corpus
[> 0                                                             [=] 41                                                           
Corpus has been processed: 1 documents, 41 paragraphs, 41 sentences, 1139 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved


Processing corpus
100%|██████████| 41/41 [00:00<00:00, 335872.00it/s]


Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 1139 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 1139 tokens
Load corpus... 


209: _dataset/rucoref/rucoref_texts/OFC/9.txt
Load corpus
[=] 25                                                           
Corpus has been loaded: 25 sentences, 583 tokens
Processing corpus
100%|██████████| 25/25 [00:00<00:00, 97541.95it/s]

done.
Preprocess corpus
[> 0                                                             [=] 25                                                           
Corpus has been processed: 1 documents, 25 paragraphs, 25 sentences, 583 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved



Processing corpus
100%|██████████| 25/25 [00:00<00:00, 62788.98it/s]


Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 583 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 583 tokens
Load corpus... 

token 3640: ARPA-E != ARPA
token 3640: ARPA-E != ARPA
token 3644 (-) is not found
token 3644 (-) is not found
token 3645 (E) is not found
token 3645 (E) is not found

210: _dataset/rucoref/rucoref_texts/OFC/10.txt
Load corpus
[=] 32                                                           
Corpus has been loaded: 32 sentences, 730 tokens
Processing corpus
100%|██████████| 32/32 [00:00<00:00, 94987.78it/s]

done.
Preprocess corpus
[> 0                                                             [=] 32                                                           
Corpus has been processed: 1 documents, 32 paragraphs, 32 sentences, 730 tokens
Save corpus
[> 0                                                             [=] 32                                                           
Corpus has been saved



Processing corpus
100%|██████████| 32/32 [00:00<00:00, 123817.09it/s]


Load corpus
[> 0                                                             [=] 32                                                           
Corpus has been loaded: 32 sentences, 730 tokens
Save corpus
[> 0                                                             [=] 32                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 32                                                           
Corpus has been loaded: 32 sentences, 730 tokens
Load corpus... 


211: _dataset/rucoref/rucoref_texts/OFC/11.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1898 tokens
Processing corpus
  0%|          | 0/103 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been processed: 1 documents, 97 paragraphs, 103 sentences, 1898 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved


100%|██████████| 103/103 [00:00<00:00, 205.55it/s]
Processing corpus
100%|██████████| 103/103 [00:00<00:00, 204.15it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1898 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1898 tokens
Load corpus... 

token 3357: какие-то != какие
token 3357: какие-то != какие
token 3362 (-) is not found
token 3362 (-) is not found
token 3363 (то) is not found
token 3363 (то) is not found
token 4485: ВАЗ-2108 != ВАЗ
token 4485: ВАЗ-2108 != ВАЗ
token 4488 (-) is not found
token 4488 (-) is not found
token 4489 (2108) is not found
token 4489 (2108) is not found

212: _dataset/rucoref/rucoref_texts/OFC/12.txt
Load corpus
[=] 55                                                           
Corpus has been loaded: 55 sentences, 1215 tokens
Processing corpus
100%|██████████| 55/55 [00:00<00:00, 201825.65it/s]

done.
Preprocess corpus
[> 0                                                             [=] 55                                                           
Corpus has been processed: 1 documents, 54 paragraphs, 55 sentences, 1215 tokens
Save corpus
[> 0                                                             [=] 55                                                           
Corpus has been saved



Processing corpus
100%|██████████| 55/55 [00:00<00:00, 429584.21it/s]


Load corpus
[> 0                                                             [=] 55                                                           
Corpus has been loaded: 55 sentences, 1215 tokens
Save corpus
[> 0                                                             [=] 55                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 55                                                           
Corpus has been loaded: 55 sentences, 1215 tokens
Load corpus... 

token 2162 (—) is not found
token 2162 (—) is not found
token 3679: F-117А != F
token 3679: F-117А != F
token 3680 (-) is not found
token 3680 (-) is not found
token 3681 (117А) is not found
token 3681 (117А) is not found
token 3807: F-117 != F
token 3807: F-117 != F
token 3808 (-) is not found
token 3808 (-) is not found
token 3809 (117) is not found
token 3809 (117) is not found
token 4225: F-117 != F
token 4225: F-117 != F
token 4226 (-) is not found
token 4226 (-) is not found
token 4227 (117) is not found
token 4227 (117) is not found

215: _dataset/rucoref/rucoref_texts/OFC/15.txt
Load corpus
[=] 62                                                           
Corpus has been loaded: 62 sentences, 1126 tokens
Processing corpus
100%|██████████| 62/62 [00:00<00:00, 216525.27it/s]

done.
Preprocess corpus
[> 0                                                             [=] 62                                                           
Corpus has been processed: 1 documents, 61 paragraphs, 62 sentences, 1126 tokens
Save corpus
[> 0                                                             [=] 62                                                           
Corpus has been saved



Processing corpus
100%|██████████| 62/62 [00:00<00:00, 162427.76it/s]


Load corpus
[> 0                                                             [=] 62                                                           
Corpus has been loaded: 62 sentences, 1126 tokens
Save corpus
[> 0                                                             [=] 62                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 62                                                           
Corpus has been loaded: 62 sentences, 1126 tokens
Load corpus... 

token 125: 27-летняя != 27
token 125: 27-летняя != 27
token 127 (-) is not found
token 127 (-) is not found
token 128 (летняя) is not found
token 128 (летняя) is not found
token 680: Горно-Алтайска != Горно
token 680: Горно-Алтайска != Горно
token 685 (-) is not found
token 685 (-) is not found
token 686 (Алтайска) is not found
token 686 (Алтайска) is not found
token 680: Горно-Алтайска != Горно
token 680: Горно-Алтайска != Горно
token 685 (-) is not found
token 685 (-) is not found
token 686 (Алтайска) is not found
token 686 (Алтайска) is not found
token 2764: госпожой != г
token 2764: госпожой != г
token 2765 (-) is not found
token 2765 (-) is not found
token 2766 (жой) is not found
token 2766 (жой) is not found
token 3013: госпожи != г
token 3013: госпожи != г
token 3014 (-) is not found
token 3014 (-) is not found
token 3015 (жи) is not found
token 3015 (жи) is not found
token 3841: Горно-Алтайска != Горно
token 3841: Горно-Алтайска != Горно
token 3846 (-) is not found
token 3846 (

done.
Preprocess corpus
[> 0                                                             [=] 59                                                           
Corpus has been processed: 1 documents, 58 paragraphs, 59 sentences, 1241 tokens
Save corpus
[> 0                                                             [=] 59                                                           
Corpus has been saved



Processing corpus
100%|██████████| 59/59 [00:00<00:00, 239790.64it/s]


Load corpus
[> 0                                                             [=] 59                                                           
Corpus has been loaded: 59 sentences, 1241 tokens
Save corpus
[> 0                                                             [=] 59                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 59                                                           
Corpus has been loaded: 59 sentences, 1241 tokens
Load corpus... 


251: _dataset/rucoref/rucoref_texts/OFC/51.txt
Load corpus
[> 0                                                             [=] 100                                                           
Corpus has been loaded: 100 sentences, 1948 tokens
Processing corpus
  0%|          | 0/100 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 100                                                           
Corpus has been processed: 1 documents, 99 paragraphs, 100 sentences, 1948 tokens
Save corpus
[> 0                                                             [=] 100                                                           
Corpus has been saved


100%|██████████| 100/100 [00:00<00:00, 210.20it/s]
Processing corpus
100%|██████████| 100/100 [00:00<00:00, 210.59it/s]


Load corpus
[> 0                                                             [=] 100                                                           
Corpus has been loaded: 100 sentences, 1948 tokens
Save corpus
[> 0                                                             [=] 100                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 100                                                           
Corpus has been loaded: 100 sentences, 1948 tokens
Load corpus... 

token 642: Вояджером-2 != Вояджером
token 642: Вояджером-2 != Вояджером
token 651 (-) is not found
token 651 (-) is not found
token 652 (2) is not found
token 652 (2) is not found
token 863:  != 1986U2R
token 863:  != 1986U2R
token 870 (/) is not found
token 870 (/) is not found
token 871 (ζ) is not found
token 871 (ζ) is not found
token 883:  != α
token 883:  != α
token 886:  != β
token 886:  != β
token 889:  != η
token 889:  != η
token 892:  != γ
token 892:  != γ
token 895:  != δ
token 895:  != δ
token 898:  != λ
token 898:  != λ
token 901:  != ε
token 901:  != ε
token 904:  != ν
token 904:  != ν
token 908:  != μ
token 908:  != μ
token 943:  != 1986U2R
token 943:  != 1986U2R
token 950 (/) is not found
token 950 (/) is not found
token 951 (ζ) is not found
token 951 (ζ) is not found
token 988:  != μ
token 988:  != μ
token 1537:  != 1986U2R
token 1537:  != 1986U2R
token 1544 (/) is not found
token 1544 (/) is not found
token 1545 (ζ) is not found
token 1545 (ζ) is not found
token 1548: 

done.
Preprocess corpus
[> 0                                                             [=] 38                                                           
Corpus has been processed: 1 documents, 36 paragraphs, 38 sentences, 952 tokens
Save corpus
[> 0                                                             [=] 38                                                           
Corpus has been saved



Processing corpus
100%|██████████| 38/38 [00:00<00:00, 233358.06it/s]


Load corpus
[> 0                                                             [=] 38                                                           
Corpus has been loaded: 38 sentences, 952 tokens
Save corpus
[> 0                                                             [=] 38                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 38                                                           
Corpus has been loaded: 38 sentences, 952 tokens
Load corpus... 

token 1675: мега-популярная != мега
token 1675: мега-популярная != мега
token 1679 (-) is not found
token 1679 (-) is not found
token 1680 (популярная) is not found
token 1680 (популярная) is not found

253: _dataset/rucoref/rucoref_texts/OFC/53.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been loaded: 164 sentences, 3176 tokens
Processing corpus
  0%|          | 0/164 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been processed: 1 documents, 161 paragraphs, 164 sentences, 3176 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been saved


100%|██████████| 164/164 [00:00<00:00, 200.08it/s]
Processing corpus
100%|██████████| 164/164 [00:00<00:00, 198.69it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been loaded: 164 sentences, 3176 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 164                                                           
Corpus has been loaded: 164 sentences, 3176 tokens
Load corpus... 

token 3420: агууу-агууу != агууу
token 3420: агууу-агууу != агууу
token 3425 (-) is not found
token 3425 (-) is not found
token 3426 (агууу) is not found
token 3426 (агууу) is not found
token 2349: гаги-гребенушки != гаги
token 2349: гаги-гребенушки != гаги
token 2354 (гребенушки) is not found
token 2354 (гребенушки) is not found
token 18502:  != ἔ
token 18502:  != ἔ
token 18503 (ρ) is not found
token 18503 (ρ) is not found
token 18504 (ι) is not found
token 18504 (ι) is not found
token 18505 (ο) is not found
token 18505 (ο) is not found
token 18506 (ν) is not found
token 18506 (ν) is not found

254: _dataset/rucoref/rucoref_texts/OFC/54.txt
Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1272 tokens
Processing corpus
  0%|          | 0/69 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 69                                                           
Corpus has been processed: 1 documents, 67 paragraphs, 69 sentences, 1272 tokens
Save corpus
[> 0                                                             [=] 69                                                           
Corpus has been saved


100%|██████████| 69/69 [00:00<00:00, 181.46it/s]
Processing corpus
100%|██████████| 69/69 [00:00<00:00, 180.22it/s]

Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1272 tokens
Save corpus
[> 0                                                             [=] 69                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1272 tokens
Load corpus... 


token 897: ток-шоу != ток
token 897: ток-шоу != ток
token 900 (-) is not found
token 900 (-) is not found
token 901 (шоу) is not found
token 901 (шоу) is not found
token 1228: какой-нибудь != какой
token 1228: какой-нибудь != какой
token 1233 (-) is not found
token 1233 (-) is not found
token 1234 (нибудь) is not found
token 1234 (нибудь) is not found

255: _dataset/rucoref/rucoref_texts/OFC/55.txt
Load corpus
[=] 50                                                           
Corpus has been loaded: 50 sentences, 831 tokens
Processing corpus
100%|██████████| 50/50 [00:00<00:00, 185260.78it/s]


done.
Preprocess corpus
[> 0                                                             [=] 50                                                           
Corpus has been processed: 1 documents, 49 paragraphs, 50 sentences, 831 tokens
Save corpus
[> 0                                                             [=] 50                                                           
Corpus has been saved


Processing corpus
100%|██████████| 50/50 [00:00<00:00, 167237.00it/s]


Load corpus
[> 0                                                             [=] 50                                                           
Corpus has been loaded: 50 sentences, 831 tokens
Save corpus
[> 0                                                             [=] 50                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 50                                                           
Corpus has been loaded: 50 sentences, 831 tokens
Load corpus... 


257: _dataset/rucoref/rucoref_texts/OFC/57.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 286 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 67728.09it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 179572.65it/s]

done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 14 paragraphs, 14 sentences, 286 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved





Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 286 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 286 tokens
Load corpus... done.
Preprocess corpus
[=] 34                                                           
Corpus has been processed: 1 documents, 34 paragraphs, 34 sentences, 541 tokens
Save corpus
[=] 34                                                           
Corpus has been saved


token 63: индастриал-метал != индастриал
token 63: индастриал-метал != индастриал
token 73 (-) is not found
token 73 (-) is not found
token 74 (метал) is not found
token 74 (метал) is not found
token 108:  != für
token 108:  != für
token 63: индастриал-метал != индастриал
token 63: индастриал-метал != индастриал
token 73 (-) is not found
token 73 (-) is not found
token 74 (метал) is not found
token 74 (метал) is not found
token 302:  != für
token 302:  != für

264: _dataset/rucoref/rucoref_texts/OFC/64.txt
Load corpus
[=] 34                                                           
Corpus has been loaded: 34 sentences, 541 tokens
Processing corpus
100%|██████████| 34/34 [00:00<00:00, 76587.72it/s]
Processing corpus
100%|██████████| 34/34 [00:00<00:00, 123575.68it/s]


Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 541 tokens
Save corpus
[> 0                                                             [=] 34                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 34                                                           
Corpus has been loaded: 34 sentences, 541 tokens
Load corpus... 


265: _dataset/rucoref/rucoref_texts/OFC/65.txt
Load corpus
[=] 57                                                           
Corpus has been loaded: 57 sentences, 723 tokens
Processing corpus
100%|██████████| 57/57 [00:00<00:00, 519728.97it/s]

done.
Preprocess corpus
[> 0                                                             [=] 57                                                           
Corpus has been processed: 1 documents, 57 paragraphs, 57 sentences, 723 tokens
Save corpus
[> 0                                                             [=] 57                                                           
Corpus has been saved



Processing corpus
100%|██████████| 57/57 [00:00<00:00, 188694.02it/s]


Load corpus
[> 0                                                             [=] 57                                                           
Corpus has been loaded: 57 sentences, 723 tokens
Save corpus
[> 0                                                             [=] 57                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 57                                                           
Corpus has been loaded: 57 sentences, 723 tokens
Load corpus... 

token 2745: какое-то != какое
token 2745: какое-то != какое
token 2750 (-) is not found
token 2750 (-) is not found
token 2751 (то) is not found
token 2751 (то) is not found

266: _dataset/rucoref/rucoref_texts/OFC/66.txt
Load corpus
[=] 56                                                           
Corpus has been loaded: 56 sentences, 881 tokens
Processing corpus
100%|██████████| 56/56 [00:00<00:00, 300743.95it/s]

done.
Preprocess corpus
[> 0                                                             [=] 56                                                           
Corpus has been processed: 1 documents, 53 paragraphs, 56 sentences, 881 tokens
Save corpus
[> 0                                                             [=] 56                                                           
Corpus has been saved



Processing corpus
100%|██████████| 56/56 [00:00<00:00, 209528.12it/s]


Load corpus
[> 0                                                             [=] 56                                                           
Corpus has been loaded: 56 sentences, 881 tokens
Save corpus
[> 0                                                             [=] 56                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 56                                                           
Corpus has been loaded: 56 sentences, 881 tokens
Load corpus... 

token 2: Сан-Франциско != Сан
token 2: Сан-Франциско != Сан
token 5 (-) is not found
token 5 (-) is not found
token 6 (Франциско) is not found
token 6 (Франциско) is not found
token 220: Сан-Франциско != Сан
token 220: Сан-Франциско != Сан
token 223 (-) is not found
token 223 (-) is not found
token 224 (Франциско) is not found
token 224 (Франциско) is not found

267: _dataset/rucoref/rucoref_texts/OFC/67.txt
Load corpus
[> 0                                                             [=] 83                                                           
Corpus has been loaded: 83 sentences, 1401 tokens
Processing corpus
  0%|          | 0/83 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 83                                                           
Corpus has been processed: 1 documents, 83 paragraphs, 83 sentences, 1401 tokens
Save corpus
[> 0                                                             [=] 83                                                           
Corpus has been saved


100%|██████████| 83/83 [00:00<00:00, 223.18it/s]
Processing corpus
100%|██████████| 83/83 [00:00<00:00, 209.02it/s]

Load corpus
[> 0                                                             [=] 83                                                           
Corpus has been loaded: 83 sentences, 1401 tokens
Save corpus
[> 0                                                             [=] 83                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 83                                                           
Corpus has been loaded: 83 sentences, 1401 tokens
Load corpus... 



268: _dataset/rucoref/rucoref_texts/OFC/68.txt
Load corpus
[=] 147                                                           
Corpus has been loaded: 147 sentences, 3204 tokens
Processing corpus
  0%|          | 0/147 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 147                                                           
Corpus has been processed: 1 documents, 144 paragraphs, 147 sentences, 3204 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 147                                                           
Corpus has been saved


100%|██████████| 147/147 [00:00<00:00, 154.28it/s]
Processing corpus
100%|██████████| 147/147 [00:00<00:00, 153.34it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 147                                                           
Corpus has been loaded: 147 sentences, 3204 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 147                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 147                                                           
Corpus has been loaded: 147 sentences, 3204 tokens
Load corpus... 

token 431: Премьер-лиги != Премьер
token 431: Премьер-лиги != Премьер
token 438 (-) is not found
token 438 (-) is not found
token 439 (лиги) is not found
token 439 (лиги) is not found
token 1985: L != L&YR
token 1985: L != L&YR
token 1990: F. != F.C
token 1990: F. != F.C
token 11394: Премьер-лигу != Премьер
token 11394: Премьер-лигу != Премьер
token 11401 (-) is not found
token 11401 (-) is not found
token 11402 (лигу) is not found
token 11402 (лигу) is not found
token 12170: Премьер-лигу != Премьер
token 12170: Премьер-лигу != Премьер
token 12177 (-) is not found
token 12177 (-) is not found
token 12178 (лигу) is not found
token 12178 (лигу) is not found
token 12824: Премьер-лигу != Премьер
token 12824: Премьер-лигу != Премьер
token 12831 (-) is not found
token 12831 (-) is not found
token 12832 (лигу) is not found
token 12832 (лигу) is not found
token 12998: Премьер-лигу != Премьер
token 12998: Премьер-лигу != Премьер
token 13005 (-) is not found
token 13005 (-) is not found
token 13

done.
Preprocess corpus
[> 0                                                             [=] 36                                                           
Corpus has been processed: 1 documents, 36 paragraphs, 36 sentences, 653 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved



Processing corpus
100%|██████████| 36/36 [00:00<00:00, 164661.88it/s]


Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 653 tokens
Save corpus
[> 0                                                             [=] 36                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 36                                                           
Corpus has been loaded: 36 sentences, 653 tokens
Load corpus... 

token 174: либерал-демократы != либерал
token 174: либерал-демократы != либерал
token 181 (-) is not found
token 181 (-) is not found
token 182 (демократы) is not found
token 182 (демократы) is not found
token 1231: Либерал-демократы != Либерал
token 1231: Либерал-демократы != Либерал
token 1238 (-) is not found
token 1238 (-) is not found
token 1239 (демократы) is not found
token 1239 (демократы) is not found
token 1475: Либерал-демократы != Либерал
token 1475: Либерал-демократы != Либерал
token 1482 (-) is not found
token 1482 (-) is not found
token 1483 (демократы) is not found
token 1483 (демократы) is not found
token 2013: Либерал-демократы != Либерал
token 2013: Либерал-демократы != Либерал
token 2020 (-) is not found
token 2020 (-) is not found
token 2021 (демократы) is not found
token 2021 (демократы) is not found
token 3077: либерал-демократами != либерал
token 3077: либерал-демократами != либерал
token 3084 (-) is not found
token 3084 (-) is not found
token 3085 (демократами)

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 139                                                           
Corpus has been processed: 1 documents, 136 paragraphs, 139 sentences, 1749 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 139                                                           
Corpus has been saved


100%|██████████| 139/139 [00:00<00:00, 239.00it/s]
Processing corpus
100%|██████████| 139/139 [00:00<00:00, 228.19it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 139                                                           
Corpus has been loaded: 139 sentences, 1749 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 139                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 139                                                           
Corpus has been loaded: 139 sentences, 1749 tokens
Load corpus... 



271: _dataset/rucoref/rucoref_texts/OFC/71.txt
Load corpus
[=] 25                                                           
Corpus has been loaded: 25 sentences, 613 tokens
Processing corpus
100%|██████████| 25/25 [00:00<00:00, 65782.69it/s]


done.
Preprocess corpus
[> 0                                                             [=] 25                                                           
Corpus has been processed: 1 documents, 24 paragraphs, 25 sentences, 613 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved


Processing corpus
100%|██████████| 25/25 [00:00<00:00, 86019.36it/s]


Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 613 tokens
Save corpus
[> 0                                                             [=] 25                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 25                                                           
Corpus has been loaded: 25 sentences, 613 tokens
Load corpus... 

token 1215: Санкт-Петербурге != Санкт
token 1215: Санкт-Петербурге != Санкт
token 1220 (-) is not found
token 1220 (-) is not found
token 1221 (Петербурге) is not found
token 1221 (Петербурге) is not found
token 1458: Санкт-Петербурга != Санкт
token 1458: Санкт-Петербурга != Санкт
token 1463 (-) is not found
token 1463 (-) is not found
token 1464 (Петербурга) is not found
token 1464 (Петербурга) is not found
token 1593: Санкт-Петербурге != Санкт
token 1593: Санкт-Петербурге != Санкт
token 1599 (Петербурге) is not found
token 1599 (Петербурге) is not found
token 1835: Санкт-Петербурге != Санкт
token 1835: Санкт-Петербурге != Санкт
token 1840 (-) is not found
token 1840 (-) is not found
token 1841 (Петербурге) is not found
token 1841 (Петербурге) is not found
token 2457 (—) is not found
token 2457 (—) is not found
token 2459: 24-летних != 24
token 2459: 24-летних != 24
token 2461 (-) is not found
token 2461 (-) is not found
token 2462 (летних) is not found
token 2462 (летних) is not foun

done.
Preprocess corpus
[> 0                                                             [=] 88                                                           
Corpus has been processed: 1 documents, 86 paragraphs, 88 sentences, 1690 tokens
Save corpus
[> 0                                                             [=] 88                                                           
Corpus has been saved


100%|██████████| 88/88 [00:00<00:00, 164.43it/s]
Processing corpus
100%|██████████| 88/88 [00:00<00:00, 160.71it/s]

Load corpus
[> 0                                                             [=] 88                                                           
Corpus has been loaded: 88 sentences, 1690 tokens
Save corpus
[> 0                                                             [=] 88                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 88                                                           
Corpus has been loaded: 88 sentences, 1690 tokens
Load corpus... 



275: _dataset/rucoref/rucoref_texts/OFC/75.txt
Load corpus
[=] 42                                                           
Corpus has been loaded: 42 sentences, 1160 tokens
Processing corpus
100%|██████████| 42/42 [00:00<00:00, 159277.37it/s]

done.
Preprocess corpus
[> 0                                                             [=] 42                                                           
Corpus has been processed: 1 documents, 39 paragraphs, 42 sentences, 1160 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved



Processing corpus
100%|██████████| 42/42 [00:00<00:00, 367001.60it/s]


Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 1160 tokens
Save corpus
[> 0                                                             [=] 42                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 42                                                           
Corpus has been loaded: 42 sentences, 1160 tokens
Load corpus... 

token 0: Пресс-секретарь != Пресс
token 0: Пресс-секретарь != Пресс
token 5 (-) is not found
token 5 (-) is not found
token 6 (секретарь) is not found
token 6 (секретарь) is not found
token 677: пресс-секретарь != пресс
token 677: пресс-секретарь != пресс
token 682 (-) is not found
token 682 (-) is not found
token 683 (секретарь) is not found
token 683 (секретарь) is not found
token 2184: CC-BY != CC
token 2184: CC-BY != CC
token 2186 (-) is not found
token 2186 (-) is not found
token 2187 (BY) is not found
token 2187 (BY) is not found
token 2189 (-) is not found
token 2189 (-) is not found
token 2190: -SA != SA
token 2190: -SA != SA
token 3214: CC-BY != CC
token 3214: CC-BY != CC
token 3216 (-) is not found
token 3216 (-) is not found
token 3217 (BY) is not found
token 3217 (BY) is not found
token 3219 (-) is not found
token 3219 (-) is not found
token 3220: -SA != SA
token 3220: -SA != SA
token 3279: CC-BY != CC
token 3279: CC-BY != CC
token 3281 (-) is not found
token 3281 (-) is no

done.
Preprocess corpus
[> 0                                                             [=] 24                                                           
Corpus has been processed: 1 documents, 23 paragraphs, 24 sentences, 430 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved



Processing corpus
100%|██████████| 24/24 [00:00<00:00, 202135.13it/s]


Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 430 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 430 tokens
Load corpus... 

token 361: Коста-Рика != Коста
token 361: Коста-Рика != Коста
token 366 (-) is not found
token 366 (-) is not found
token 367 (Рика) is not found
token 367 (Рика) is not found
token 1789: Коста != Коста.Рика
token 1789: Коста != Коста.Рика

279: _dataset/rucoref/rucoref_texts/OFC/79.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been loaded: 107 sentences, 2591 tokens
Processing corpus
  0%|          | 0/107 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been processed: 1 documents, 100 paragraphs, 107 sentences, 2591 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been saved


100%|██████████| 107/107 [00:00<00:00, 173.96it/s]
Processing corpus
100%|██████████| 107/107 [00:00<00:00, 174.25it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been loaded: 107 sentences, 2591 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 107                                                           
Corpus has been loaded: 107 sentences, 2591 tokens
Load corpus... 

token 0: Фри-джаз != Фри
token 0: Фри-джаз != Фри
token 3 (-) is not found
token 3 (-) is not found
token 4 (джаз) is not found
token 4 (джаз) is not found
token 586: фри-джаза != фри
token 586: фри-джаза != фри
token 589 (-) is not found
token 589 (-) is not found
token 590 (джаза) is not found
token 590 (джаза) is not found
token 833: Фри-джаз != Фри
token 833: Фри-джаз != Фри
token 836 (-) is not found
token 836 (-) is not found
token 837 (джаз) is not found
token 837 (джаз) is not found
token 1052: фри-джаза != фри
token 1052: фри-джаза != фри
token 1055 (-) is not found
token 1055 (-) is not found
token 1056 (джаза) is not found
token 1056 (джаза) is not found
token 1293: фри-джаза != фри
token 1293: фри-джаза != фри
token 1296 (-) is not found
token 1296 (-) is not found
token 1297 (джаза) is not found
token 1297 (джаза) is not found
token 1537: фри-джаз != фри
token 1537: фри-джаз != фри
token 1540 (-) is not found
token 1540 (-) is not found
token 1541 (джаз) is not found
token

done.
Preprocess corpus
[> 0                                                             [=] 48                                                           
Corpus has been processed: 1 documents, 48 paragraphs, 48 sentences, 763 tokens
Save corpus
[> 0                                                             [=] 48                                                           
Corpus has been saved



Processing corpus
100%|██████████| 48/48 [00:00<00:00, 420306.04it/s]


Load corpus
[> 0                                                             [=] 48                                                           
Corpus has been loaded: 48 sentences, 763 tokens
Save corpus
[> 0                                                             [=] 48                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 48                                                           
Corpus has been loaded: 48 sentences, 763 tokens
Load corpus... 


282: _dataset/rucoref/rucoref_texts/OFC/82.txt
Load corpus
[=] 20                                                           
Corpus has been loaded: 20 sentences, 385 tokens
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 93103.31it/s]
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 117983.23it/s]

done.
Preprocess corpus
[> 0                                                             [=] 20                                                           
Corpus has been processed: 1 documents, 20 paragraphs, 20 sentences, 385 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 385 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 385 tokens
Load corpus... 

token 41: Giove-A != Giove
token 41: Giove-A != Giove
token 46 (-) is not found
token 46 (-) is not found
token 47 (A) is not found
token 47 (A) is not found
token 154: Giove-A != Giove
token 154: Giove-A != Giove
token 159 (-) is not found
token 159 (-) is not found
token 160 (A) is not found
token 160 (A) is not found
token 174: GSTB-V2 != GSTB
token 174: GSTB-V2 != GSTB
token 178 (-) is not found
token 178 (-) is not found
token 179 (V2/A) is not found
token 179 (V2/A) is not found
token 229: Giove-A != Giove
token 229: Giove-A != Giove
token 234 (-) is not found
token 234 (-) is not found
token 235 (A) is not found
token 235 (A) is not found
token 531: Giove-A != Giove
token 531: Giove-A != Giove
token 536 (-) is not found
token 536 (-) is not found
token 537 (A) is not found
token 537 (A) is not found
token 2409 (») is not found
token 2409 (») is not found

288: _dataset/rucoref/rucoref_texts/OFC/88.txt
Load corpus
[=] 144                                                           

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 144                                                           
Corpus has been processed: 1 documents, 142 paragraphs, 144 sentences, 2179 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 144                                                           
Corpus has been saved


100%|██████████| 144/144 [00:00<00:00, 179.57it/s]
Processing corpus
100%|██████████| 144/144 [00:00<00:00, 178.54it/s]

Load corpus
[> 0                                                             [> 100                                                            [=] 144                                                           
Corpus has been loaded: 144 sentences, 2179 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 144                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 144                                                           
Corpus has been loaded: 144 sentences, 2179 tokens
Load corpus... 


token 44: ток-шоу != ток
token 44: ток-шоу != ток
token 47 (-) is not found
token 47 (-) is not found
token 48 (шоу) is not found
token 48 (шоу) is not found
token 719: ток-шоу != ток
token 719: ток-шоу != ток
token 722 (-) is not found
token 722 (-) is not found
token 723 (шоу) is not found
token 723 (шоу) is not found
token 3866: ток-шоу != ток
token 3866: ток-шоу != ток
token 3869 (-) is not found
token 3869 (-) is not found
token 3870 (шоу) is not found
token 3870 (шоу) is not found
token 4215: ток-шоу != ток
token 4215: ток-шоу != ток
token 4218 (-) is not found
token 4218 (-) is not found
token 4219 (шоу) is not found
token 4219 (шоу) is not found

290: _dataset/rucoref/rucoref_texts/OFC/90.txt
Load corpus
[=] 23                                                           
Corpus has been loaded: 23 sentences, 660 tokens
Processing corpus
100%|██████████| 23/23 [00:00<00:00, 54471.48it/s]


done.
Preprocess corpus
[> 0                                                             [=] 23                                                           
Corpus has been processed: 1 documents, 21 paragraphs, 23 sentences, 660 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved


Processing corpus
100%|██████████| 23/23 [00:00<00:00, 78239.25it/s]


Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 660 tokens
Save corpus
[> 0                                                             [=] 23                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 23                                                           
Corpus has been loaded: 23 sentences, 660 tokens
Load corpus... 

token 378: russian-untouchables != russian
token 378: russian-untouchables != russian
token 385 (-) is not found
token 385 (-) is not found
token 386 (untouchables.com) is not found
token 386 (untouchables.com) is not found
token 2205 (—) is not found
token 2205 (—) is not found

295: _dataset/rucoref/rucoref_texts/OFC/95.txt
Load corpus
[=] 43                                                           
Corpus has been loaded: 43 sentences, 1136 tokens
Processing corpus
100%|██████████| 43/43 [00:00<00:00, 110444.01it/s]

done.
Preprocess corpus
[> 0                                                             [=] 43                                                           
Corpus has been processed: 1 documents, 39 paragraphs, 43 sentences, 1136 tokens
Save corpus
[> 0                                                             [=] 43                                                           
Corpus has been saved



Processing corpus
100%|██████████| 43/43 [00:00<00:00, 362158.78it/s]


Load corpus
[> 0                                                             [=] 43                                                           
Corpus has been loaded: 43 sentences, 1136 tokens
Save corpus
[> 0                                                             [=] 43                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 43                                                           
Corpus has been loaded: 43 sentences, 1136 tokens
Load corpus... 


300: _dataset/rucoref/rucoref_texts/OFC/100.txt
Load corpus
[=] 10                                                           
Corpus has been loaded: 10 sentences, 298 tokens
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 80971.12it/s]
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 38621.58it/s]

done.
Preprocess corpus
[> 0                                                             [=] 10                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 10 sentences, 298 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 298 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 298 tokens
Load corpus... 


301: _dataset/rucoref/rucoref_texts/OFC/101.txt
Load corpus
[=] 60                                                           
Corpus has been loaded: 60 sentences, 1216 tokens
Processing corpus
100%|██████████| 60/60 [00:00<00:00, 486766.42it/s]


done.
Preprocess corpus
[> 0                                                             [=] 60                                                           
Corpus has been processed: 1 documents, 57 paragraphs, 60 sentences, 1216 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved


Processing corpus
100%|██████████| 60/60 [00:00<00:00, 157878.44it/s]


Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1216 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1216 tokens
Load corpus... 

token 1379 (—) is not found
token 1379 (—) is not found
token 3458: пресс-конференции != пресс
token 3458: пресс-конференции != пресс
token 3463 (-) is not found
token 3463 (-) is not found
token 3464 (конференции) is not found
token 3464 (конференции) is not found
token 4484: бизнес-модель != бизнес
token 4484: бизнес-модель != бизнес
token 4490 (-) is not found
token 4490 (-) is not found
token 4491 (модель) is not found
token 4491 (модель) is not found
token 6568: кто-то != кто
token 6568: кто-то != кто
token 6571 (-) is not found
token 6571 (-) is not found
token 6572 (то) is not found
token 6572 (то) is not found

302: _dataset/rucoref/rucoref_texts/OFC/102.txt
Load corpus
[=] 37                                                           
Corpus has been loaded: 37 sentences, 955 tokens
Processing corpus
100%|██████████| 37/37 [00:00<00:00, 145717.60it/s]

done.
Preprocess corpus
[> 0                                                             [=] 37                                                           
Corpus has been processed: 1 documents, 34 paragraphs, 37 sentences, 955 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved



Processing corpus
100%|██████████| 37/37 [00:00<00:00, 166690.92it/s]


Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 955 tokens
Save corpus
[> 0                                                             [=] 37                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 37                                                           
Corpus has been loaded: 37 sentences, 955 tokens
Load corpus... 

token 4290 (—) is not found
token 4290 (—) is not found
token 4292: 45-летних != 45
token 4292: 45-летних != 45
token 4294 (-) is not found
token 4294 (-) is not found
token 4295 (летних) is not found
token 4295 (летних) is not found

303: _dataset/rucoref/rucoref_texts/OFC/103.txt
Load corpus
[=] 50                                                           
Corpus has been loaded: 50 sentences, 1676 tokens
Processing corpus
100%|██████████| 50/50 [00:00<00:00, 423667.07it/s]

done.
Preprocess corpus
[> 0                                                             [=] 50                                                           
Corpus has been processed: 1 documents, 48 paragraphs, 50 sentences, 1676 tokens
Save corpus
[> 0                                                             [=] 50                                                           
Corpus has been saved



Processing corpus
100%|██████████| 50/50 [00:00<00:00, 183477.87it/s]


Load corpus
[> 0                                                             [=] 50                                                           
Corpus has been loaded: 50 sentences, 1676 tokens
Save corpus
[> 0                                                             [=] 50                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 50                                                           
Corpus has been loaded: 50 sentences, 1676 tokens
Load corpus... 


306: _dataset/rucoref/rucoref_texts/OFC/106.txt
Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 329 tokens
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 41418.41it/s]
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 76725.07it/s]

done.
Preprocess corpus
[> 0                                                             [=] 15                                                           
Corpus has been processed: 1 documents, 15 paragraphs, 15 sentences, 329 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 329 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 329 tokens
Load corpus... 


310: _dataset/rucoref/rucoref_texts/OFC/110.txt
Load corpus
[=] 63                                                           
Corpus has been loaded: 63 sentences, 1211 tokens
Processing corpus
100%|██████████| 63/63 [00:00<00:00, 294912.00it/s]


done.
Preprocess corpus
[> 0                                                             [=] 63                                                           
Corpus has been processed: 1 documents, 60 paragraphs, 63 sentences, 1211 tokens
Save corpus
[> 0                                                             [=] 63                                                           
Corpus has been saved


Processing corpus
100%|██████████| 63/63 [00:00<00:00, 221122.30it/s]


Load corpus
[> 0                                                             [=] 63                                                           
Corpus has been loaded: 63 sentences, 1211 tokens
Save corpus
[> 0                                                             [=] 63                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 63                                                           
Corpus has been loaded: 63 sentences, 1211 tokens
Load corpus... 


311: _dataset/rucoref/rucoref_texts/OFC/111.txt
Load corpus
[=] 24                                                           
Corpus has been loaded: 24 sentences, 342 tokens
Processing corpus
100%|██████████| 24/24 [00:00<00:00, 92351.65it/s]
Processing corpus
100%|██████████| 24/24 [00:00<00:00, 136770.78it/s]

done.
Preprocess corpus
[> 0                                                             [=] 24                                                           
Corpus has been processed: 1 documents, 24 paragraphs, 24 sentences, 342 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 342 tokens
Save corpus
[> 0                                                             [=] 24                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 24                                                           
Corpus has been loaded: 24 sentences, 342 tokens
Load corpus... 

token 1626: премьер-министр != премьер
token 1626: премьер-министр != премьер
token 1633 (-) is not found
token 1633 (-) is not found
token 1634 (министр) is not found
token 1634 (министр) is not found
token 1651: аль-Гануши != аль
token 1651: аль-Гануши != аль
token 1654 (-) is not found
token 1654 (-) is not found
token 1655 (Гануши) is not found
token 1655 (Гануши) is not found
token 2109: премьер-министра != премьер
token 2109: премьер-министра != премьер
token 2116 (-) is not found
token 2116 (-) is not found
token 2117 (министра) is not found
token 2117 (министра) is not found

312: _dataset/rucoref/rucoref_texts/OFC/112.txt
Load corpus
[=] 49                                                           
Corpus has been loaded: 49 sentences, 936 tokens
Processing corpus
100%|██████████| 49/49 [00:00<00:00, 253104.55it/s]


done.
Preprocess corpus
[> 0                                                             [=] 49                                                           
Corpus has been processed: 1 documents, 45 paragraphs, 49 sentences, 936 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved


Processing corpus
100%|██████████| 49/49 [00:00<00:00, 136558.73it/s]


Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 936 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 936 tokens
Load corpus... 

token 57: грузино-абхазского != грузино
token 57: грузино-абхазского != грузино
token 64 (-) is not found
token 64 (-) is not found
token 65 (абхазского) is not found
token 65 (абхазского) is not found
token 1018: премьер-министра != премьер
token 1018: премьер-министра != премьер
token 1025 (-) is not found
token 1025 (-) is not found
token 1026 (министра) is not found
token 1026 (министра) is not found
token 1273: грузино-абхазского != грузино
token 1273: грузино-абхазского != грузино
token 1280 (-) is not found
token 1280 (-) is not found
token 1281 (абхазского) is not found
token 1281 (абхазского) is not found

313: _dataset/rucoref/rucoref_texts/OFC/113.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been loaded: 119 sentences, 2059 tokens
Processing corpus
  0%|          | 0/119 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been processed: 1 documents, 119 paragraphs, 119 sentences, 2059 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been saved


100%|██████████| 119/119 [00:00<00:00, 303.94it/s]
Processing corpus
100%|██████████| 119/119 [00:00<00:00, 300.79it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been loaded: 119 sentences, 2059 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 119                                                           
Corpus has been loaded: 119 sentences, 2059 tokens
Load corpus... 


316: _dataset/rucoref/rucoref_texts/OFC/116.txt
Load corpus
[> 0                                                             [=] 81                                                           
Corpus has been loaded: 81 sentences, 1376 tokens
Processing corpus
  0%|          | 0/81 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 81                                                           
Corpus has been processed: 1 documents, 80 paragraphs, 81 sentences, 1376 tokens
Save corpus
[> 0                                                             [=] 81                                                           
Corpus has been saved


100%|██████████| 81/81 [00:00<00:00, 218.82it/s]
Processing corpus
100%|██████████| 81/81 [00:00<00:00, 216.58it/s]
token 1708: кому-то != кому
token 1708: кому-то != кому
token 1712 (-) is not found
token 1712 (-) is not found
token 1713 (то) is not found
token 1713 (то) is not found

318: _dataset/rucoref/rucoref_texts/OFC/118.txt


Load corpus
[> 0                                                             [=] 81                                                           
Corpus has been loaded: 81 sentences, 1376 tokens
Save corpus
[> 0                                                             [=] 81                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 81                                                           
Corpus has been loaded: 81 sentences, 1376 tokens
Load corpus... 

Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 302 tokens
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 122640.47it/s]
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 74191.70it/s]

done.
Preprocess corpus
[> 0                                                             [=] 15                                                           
Corpus has been processed: 1 documents, 15 paragraphs, 15 sentences, 302 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 302 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 302 tokens
Load corpus... 

token 19: протон-антипротонный != протон
token 19: протон-антипротонный != протон
token 25 (-) is not found
token 25 (-) is not found
token 26 (антипротонный) is not found
token 26 (антипротонный) is not found
token 1024: электрон-вольт != электрон
token 1024: электрон-вольт != электрон
token 1032 (-) is not found
token 1032 (-) is not found
token 1033 (вольт) is not found
token 1033 (вольт) is not found

319: _dataset/rucoref/rucoref_texts/OFC/119.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 579 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 79387.46it/s]


done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 28 paragraphs, 30 sentences, 579 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved


Processing corpus
100%|██████████| 30/30 [00:00<00:00, 268865.64it/s]


Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 579 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 579 tokens
Load corpus... 


320: _dataset/rucoref/rucoref_texts/OFC/120.txt
Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1572 tokens
Processing corpus
  0%|          | 0/69 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 69                                                           
Corpus has been processed: 1 documents, 69 paragraphs, 69 sentences, 1572 tokens
Save corpus
[> 0                                                             [=] 69                                                           
Corpus has been saved


100%|██████████| 69/69 [00:00<00:00, 129.72it/s]
Processing corpus
100%|██████████| 69/69 [00:00<00:00, 129.11it/s]

Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1572 tokens
Save corpus
[> 0                                                             [=] 69                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 69                                                           
Corpus has been loaded: 69 sentences, 1572 tokens
Load corpus... 


token 321: Чачба-Шервашидзе != Чачба
token 321: Чачба-Шервашидзе != Чачба
token 326 (-) is not found
token 326 (-) is not found
token 327 (Шервашидзе) is not found
token 327 (Шервашидзе) is not found

321: _dataset/rucoref/rucoref_texts/OFC/121.txt
Load corpus
[> 0                                                             [=] 99                                                           
Corpus has been loaded: 99 sentences, 1771 tokens
Processing corpus
  0%|          | 0/99 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 99                                                           
Corpus has been processed: 1 documents, 97 paragraphs, 99 sentences, 1771 tokens
Save corpus
[> 0                                                             [=] 99                                                           
Corpus has been saved


100%|██████████| 99/99 [00:00<00:00, 250.61it/s]
Processing corpus
100%|██████████| 99/99 [00:00<00:00, 247.16it/s]


Load corpus
[> 0                                                             [=] 99                                                           
Corpus has been loaded: 99 sentences, 1771 tokens
Save corpus
[> 0                                                             [=] 99                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 99                                                           
Corpus has been loaded: 99 sentences, 1771 tokens
Load corpus... 

token 165: генерал-майор != генерал
token 165: генерал-майор != генерал
token 172 (-) is not found
token 172 (-) is not found
token 173 (майор) is not found
token 173 (майор) is not found
token 6971:  != dodger_37
token 6971:  != dodger_37

322: _dataset/rucoref/rucoref_texts/OFC/122.txt
Load corpus
[=] 33                                                           
Corpus has been loaded: 33 sentences, 702 tokens
Processing corpus
100%|██████████| 33/33 [00:00<00:00, 280186.30it/s]

done.
Preprocess corpus
[> 0                                                             [=] 33                                                           
Corpus has been processed: 1 documents, 30 paragraphs, 33 sentences, 702 tokens
Save corpus
[> 0                                                             [=] 33                                                           
Corpus has been saved



Processing corpus
100%|██████████| 33/33 [00:00<00:00, 257750.53it/s]


Load corpus
[> 0                                                             [=] 33                                                           
Corpus has been loaded: 33 sentences, 702 tokens
Save corpus
[> 0                                                             [=] 33                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 33                                                           
Corpus has been loaded: 33 sentences, 702 tokens
Load corpus... 


323: _dataset/rucoref/rucoref_texts/OFC/123.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been loaded: 174 sentences, 4509 tokens
Processing corpus
  0%|          | 0/174 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been processed: 1 documents, 166 paragraphs, 174 sentences, 4509 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been saved


100%|██████████| 174/174 [00:01<00:00, 137.77it/s]
Processing corpus
100%|██████████| 174/174 [00:01<00:00, 136.82it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been loaded: 174 sentences, 4509 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 174                                                           
Corpus has been loaded: 174 sentences, 4509 tokens
Load corpus... 

token 103: рок-группы != рок
token 103: рок-группы != рок
token 106 (-) is not found
token 106 (-) is not found
token 107 (группы) is not found
token 107 (группы) is not found
token 103: рок-группы != рок
token 103: рок-группы != рок
token 106 (-) is not found
token 106 (-) is not found
token 107 (группы) is not found
token 107 (группы) is not found
token 9318: Got != Gotta
token 9318: Got != Gotta
token 12169 (<) is not found
token 12169 (<) is not found
token 12170:  != Песня
token 12170:  != Песня
token 12175 (>) is not found
token 12175 (>) is not found
token 23628: хит-параде != хит
token 23628: хит-параде != хит
token 23631 (-) is not found
token 23631 (-) is not found
token 23632 (параде) is not found
token 23632 (параде) is not found

324: _dataset/rucoref/rucoref_texts/OFC/124.txt
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 306 tokens
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 133683.00it/s]

done.
Preprocess corpus
[> 0                                                             [=] 16                                                           
Corpus has been processed: 1 documents, 16 paragraphs, 16 sentences, 306 tokens
Save corpus
[> 0                                                             [=] 16                                                           
Corpus has been saved



Processing corpus
100%|██████████| 16/16 [00:00<00:00, 157163.62it/s]


Load corpus
[> 0                                                             [=] 16                                                           
Corpus has been loaded: 16 sentences, 306 tokens
Save corpus
[> 0                                                             [=] 16                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 16                                                           
Corpus has been loaded: 16 sentences, 306 tokens
Load corpus... 

token 0: 22-летний != 22
token 0: 22-летний != 22
token 2 (-) is not found
token 2 (-) is not found
token 3 (летний) is not found
token 3 (летний) is not found
token 1225: 40-летнюю != 40
token 1225: 40-летнюю != 40
token 1227 (-) is not found
token 1227 (-) is not found
token 1228 (летнюю) is not found
token 1228 (летнюю) is not found

328: _dataset/rucoref/rucoref_texts/OFC/128.txt
Load corpus
[=] 19                                                           
Corpus has been loaded: 19 sentences, 461 tokens
Processing corpus
100%|██████████| 19/19 [00:00<00:00, 162305.04it/s]
Processing corpus
100%|██████████| 19/19 [00:00<00:00, 52567.13it/s]

done.
Preprocess corpus
[> 0                                                             [=] 19                                                           
Corpus has been processed: 1 documents, 19 paragraphs, 19 sentences, 461 tokens
Save corpus
[> 0                                                             [=] 19                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 19                                                           
Corpus has been loaded: 19 sentences, 461 tokens
Save corpus
[> 0                                                             [=] 19                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 19                                                           
Corpus has been loaded: 19 sentences, 461 tokens
Load corpus... 


330: _dataset/rucoref/rucoref_texts/OFC/130.txt
Load corpus
[=] 35                                                           
Corpus has been loaded: 35 sentences, 743 tokens
Processing corpus
100%|██████████| 35/35 [00:00<00:00, 128434.51it/s]


done.
Preprocess corpus
[> 0                                                             [=] 35                                                           
Corpus has been processed: 1 documents, 35 paragraphs, 35 sentences, 743 tokens
Save corpus
[> 0                                                             [=] 35                                                           
Corpus has been saved


Processing corpus
100%|██████████| 35/35 [00:00<00:00, 93922.35it/s]


Load corpus
[> 0                                                             [=] 35                                                           
Corpus has been loaded: 35 sentences, 743 tokens
Save corpus
[> 0                                                             [=] 35                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 35                                                           
Corpus has been loaded: 35 sentences, 743 tokens
Load corpus... 

token 1098: iPad- != iPad
token 1098: iPad- != iPad
token 1622: iPad- != iPad
token 1622: iPad- != iPad
token 2015: вице-президент != вице
token 2015: вице-президент != вице
token 2019 (-) is not found
token 2019 (-) is not found
token 2020 (президент) is not found
token 2020 (президент) is not found
token 3165: вице-президент != вице
token 3165: вице-президент != вице
token 3169 (-) is not found
token 3169 (-) is not found
token 3170 (президент) is not found
token 3170 (президент) is not found

332: _dataset/rucoref/rucoref_texts/OFC/132.txt
Load corpus
[> 0                                                             [=] 67                                                           
Corpus has been loaded: 67 sentences, 1148 tokens
Processing corpus
  0%|          | 0/67 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 67                                                           
Corpus has been processed: 1 documents, 65 paragraphs, 67 sentences, 1148 tokens
Save corpus
[> 0                                                             [=] 67                                                           
Corpus has been saved


100%|██████████| 67/67 [00:00<00:00, 142.91it/s]
Processing corpus
100%|██████████| 67/67 [00:00<00:00, 141.09it/s]

Load corpus
[> 0                                                             [=] 67                                                           
Corpus has been loaded: 67 sentences, 1148 tokens
Save corpus
[> 0                                                             [=] 67                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 67                                                           
Corpus has been loaded: 67 sentences, 1148 tokens
Load corpus... 



336: _dataset/rucoref/rucoref_texts/OFC/136.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1213 tokens
Processing corpus
  0%|          | 0/103 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been processed: 1 documents, 103 paragraphs, 103 sentences, 1213 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved


100%|██████████| 103/103 [00:00<00:00, 350.26it/s]
Processing corpus
100%|██████████| 103/103 [00:00<00:00, 351.51it/s]
token 3136: каким-то != каким

Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1213 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1213 tokens
Load corpus... 


token 3136: каким-то != каким
token 3141 (-) is not found
token 3141 (-) is not found
token 3142 (то) is not found
token 3142 (то) is not found

338: _dataset/rucoref/rucoref_texts/OFC/138.txt
Load corpus
[=] 71                                                           
Corpus has been loaded: 71 sentences, 1319 tokens
Processing corpus
  0%|          | 0/71 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 71                                                           
Corpus has been processed: 1 documents, 69 paragraphs, 71 sentences, 1319 tokens
Save corpus
[> 0                                                             [=] 71                                                           
Corpus has been saved


100%|██████████| 71/71 [00:00<00:00, 172.62it/s]
Processing corpus
100%|██████████| 71/71 [00:00<00:00, 171.43it/s]

Load corpus
[> 0                                                             [=] 71                                                           
Corpus has been loaded: 71 sentences, 1319 tokens
Save corpus
[> 0                                                             [=] 71                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 71                                                           
Corpus has been loaded: 71 sentences, 1319 tokens
Load corpus... 


token 3330: вице-президенту != вице
token 3330: вице-президенту != вице
token 3334 (-) is not found
token 3334 (-) is not found
token 3335 (президенту) is not found
token 3335 (президенту) is not found
token 4029: Аль-Каиды != Аль
token 4029: Аль-Каиды != Аль
token 4032 (-) is not found
token 4032 (-) is not found
token 4033 (Каиды) is not found
token 4033 (Каиды) is not found
token 5573: Аль-Каиды != Аль
token 5573: Аль-Каиды != Аль
token 5576 (-) is not found
token 5576 (-) is not found
token 5577 (Каиды) is not found
token 5577 (Каиды) is not found
token 5573: Аль-Каиды != Аль
token 5573: Аль-Каиды != Аль
token 5576 (-) is not found
token 5576 (-) is not found
token 5577 (Каиды) is not found
token 5577 (Каиды) is not found
token 7997: Аль-Каидой != Аль
token 7997: Аль-Каидой != Аль
token 8000 (-) is not found
token 8000 (-) is not found
token 8001 (Каидой) is not found
token 8001 (Каидой) is not found

339: _dataset/rucoref/rucoref_texts/OFC/139.txt
Load corpus
[> 0               

done.
Preprocess corpus
[> 0                                                             [=] 67                                                           
Corpus has been processed: 1 documents, 64 paragraphs, 67 sentences, 1003 tokens
Save corpus
[> 0                                                             [=] 67                                                           
Corpus has been saved


100%|██████████| 67/67 [00:00<00:00, 121.11it/s]
Processing corpus
100%|██████████| 67/67 [00:00<00:00, 119.89it/s]

Load corpus
[> 0                                                             [=] 67                                                           
Corpus has been loaded: 67 sentences, 1003 tokens
Save corpus
[> 0                                                             [=] 67                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 67                                                           
Corpus has been loaded: 67 sentences, 1003 tokens
Load corpus... 


token 1498: Яндекс != Яндекс.Кошелек
token 1498: Яндекс != Яндекс.Кошелек

340: _dataset/rucoref/rucoref_texts/OFC/140.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been loaded: 106 sentences, 1480 tokens
Processing corpus
  0%|          | 0/106 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been processed: 1 documents, 101 paragraphs, 106 sentences, 1480 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been saved


100%|██████████| 106/106 [00:00<00:00, 301.99it/s]
Processing corpus
100%|██████████| 106/106 [00:00<00:00, 313.16it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been loaded: 106 sentences, 1480 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 106                                                           
Corpus has been loaded: 106 sentences, 1480 tokens
Load corpus... 

token 878 (–) is not found
token 878 (–) is not found
token 1663 (—) is not found
token 1663 (—) is not found
token 1720 (—) is not found
token 1720 (—) is not found
token 3763 (—) is not found
token 3763 (—) is not found
token 5889: какого-то != какого
token 5889: какого-то != какого
token 5895 (-) is not found
token 5895 (-) is not found
token 5896 (то) is not found
token 5896 (то) is not found
token 6955: Nim != Nim's
token 6955: Nim != Nim's
token 8401 (») is not found
token 8401 (») is not found

342: _dataset/rucoref/rucoref_texts/OFC/142.txt
Load corpus
[=] 60                                                           
Corpus has been loaded: 60 sentences, 1041 tokens
Processing corpus
100%|██████████| 60/60 [00:00<00:00, 251155.93it/s]

done.
Preprocess corpus
[> 0                                                             [=] 60                                                           
Corpus has been processed: 1 documents, 59 paragraphs, 60 sentences, 1041 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved



Processing corpus
100%|██████████| 60/60 [00:00<00:00, 250406.21it/s]


Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1041 tokens
Save corpus
[> 0                                                             [=] 60                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 60                                                           
Corpus has been loaded: 60 sentences, 1041 tokens
Load corpus... 

token 470: премьер-министр != премьер
token 470: премьер-министр != премьер
token 477 (-) is not found
token 477 (-) is not found
token 478 (министр) is not found
token 478 (министр) is not found
token 940: премьер-министром != премьер
token 940: премьер-министром != премьер
token 947 (-) is not found
token 947 (-) is not found
token 948 (министром) is not found
token 948 (министром) is not found
token 1424: премьер-министра != премьер
token 1424: премьер-министра != премьер
token 1431 (-) is not found
token 1431 (-) is not found
token 1432 (министра) is not found
token 1432 (министра) is not found
token 1867: премьер-министром != премьер
token 1867: премьер-министром != премьер
token 1874 (-) is not found
token 1874 (-) is not found
token 1875 (министром) is not found
token 1875 (министром) is not found
token 2120: премьер-министром != премьер
token 2120: премьер-министром != премьер
token 2127 (-) is not found
token 2127 (-) is not found
token 2128 (министром) is not found
token 2128

done.
Preprocess corpus
[> 0                                                             [=] 13                                                           
Corpus has been processed: 1 documents, 13 paragraphs, 13 sentences, 265 tokens
Save corpus
[> 0                                                             [=] 13                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 13                                                           
Corpus has been loaded: 13 sentences, 265 tokens
Save corpus
[> 0                                                             [=] 13                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 13                                                           
Corpus has been loaded: 13 sentences, 265 tokens
Load corpus... 

token 52: Формулы-1 != Формулы
token 52: Формулы-1 != Формулы
token 59 (-) is not found
token 59 (-) is not found
token 60 (1) is not found
token 60 (1) is not found
token 1621: Формулы-1 != Формулы
token 1621: Формулы-1 != Формулы
token 1628 (-) is not found
token 1628 (-) is not found
token 1629 (1) is not found
token 1629 (1) is not found

700: _dataset/rucoref/rucoref_texts/newsru/sunna.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 298 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 44283.75it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 55136.39it/s]

done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 14 sentences, 298 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 298 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 298 tokens
Load corpus... 


701: _dataset/rucoref/rucoref_texts/newsru/duma_time.txt
Load corpus
[=] 19                                                           
Corpus has been loaded: 19 sentences, 374 tokens
Processing corpus
100%|██████████| 19/19 [00:00<00:00, 52359.91it/s]
Processing corpus
100%|██████████| 19/19 [00:00<00:00, 191566.77it/s]

done.
Preprocess corpus
[> 0                                                             [=] 19                                                           
Corpus has been processed: 1 documents, 10 paragraphs, 19 sentences, 374 tokens
Save corpus
[> 0                                                             [=] 19                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 19                                                           
Corpus has been loaded: 19 sentences, 374 tokens
Save corpus
[> 0                                                             [=] 19                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 19                                                           
Corpus has been loaded: 19 sentences, 374 tokens
Load corpus... 


702: _dataset/rucoref/rucoref_texts/newsru/m_l_king.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 272 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 36269.46it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 141154.46it/s]

done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 14 sentences, 272 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved





Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 272 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 272 tokens
Load corpus... 


710: _dataset/rucoref/rucoref_texts/lentaru/lentaru016.txt
Load corpus
[=] 10                                                           
Corpus has been loaded: 10 sentences, 207 tokens
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 7891.45it/s]
Processing corpus
100%|██████████| 10/10 [00:00<00:00, 38130.04it/s]


done.
Preprocess corpus
[> 0                                                             [=] 10                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 10 sentences, 207 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 207 tokens
Save corpus
[> 0                                                             [=] 10                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 10                                                           
Corpus has been loaded: 10 sentences, 207 tokens
Load corpus... 


711: _dataset/rucoref/rucoref_texts/lentaru/lentaru005.txt
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 194 tokens
Processing corpus
100%|██████████| 11/11 [00:00<00:00, 51898.02it/s]

done.
Preprocess corpus
[> 0                                                             [=] 11                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 11 sentences, 194 tokens
Save corpus
[> 0                                                             [=] 11                                                           
Corpus has been saved



Processing corpus
100%|██████████| 11/11 [00:00<00:00, 108814.49it/s]


Load corpus
[> 0                                                             [=] 11                                                           
Corpus has been loaded: 11 sentences, 194 tokens
Save corpus
[> 0                                                             [=] 11                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 11                                                           
Corpus has been loaded: 11 sentences, 194 tokens
Load corpus... 

token 166: 55-летний != 55
token 166: 55-летний != 55
token 168 (-) is not found
token 168 (-) is not found
token 169 (летний) is not found
token 169 (летний) is not found

712: _dataset/rucoref/rucoref_texts/lentaru/lentaru019.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 330 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 154121.41it/s]


done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 14 sentences, 330 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved


Processing corpus
100%|██████████| 14/14 [00:00<00:00, 42955.56it/s]


Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 330 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 330 tokens
Load corpus... 


713: _dataset/rucoref/rucoref_texts/lentaru/lentaru015.txt
Load corpus
[=] 18                                                           
Corpus has been loaded: 18 sentences, 335 tokens
Processing corpus
100%|██████████| 18/18 [00:00<00:00, 35848.75it/s]
Processing corpus
100%|██████████| 18/18 [00:00<00:00, 76569.44it/s]

done.
Preprocess corpus
[> 0                                                             [=] 18                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 18 sentences, 335 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 335 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 335 tokens
Load corpus... 

token 34: Санкт-Петербурга != Санкт
token 34: Санкт-Петербурга != Санкт
token 39 (-) is not found
token 39 (-) is not found
token 40 (Петербурга) is not found
token 40 (Петербурга) is not found
token 34: Санкт-Петербурга != Санкт
token 34: Санкт-Петербурга != Санкт
token 39 (-) is not found
token 39 (-) is not found
token 40 (Петербурга) is not found
token 40 (Петербурга) is not found
token 34: Санкт-Петербурга != Санкт
token 34: Санкт-Петербурга != Санкт
token 39 (-) is not found
token 39 (-) is not found
token 40 (Петербурга) is not found
token 40 (Петербурга) is not found
token 198: Фонтанка != Фонтанка.ру
token 198: Фонтанка != Фонтанка.ру
token 979: Фонтанка != Фонтанка.ру
token 979: Фонтанка != Фонтанка.ру

714: _dataset/rucoref/rucoref_texts/lentaru/lentaru018.txt
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 243 tokens
Processing corpus
100%|██████████| 11/11 [00:00<00:00, 27139.61it/s]
Processing corpus
100%

done.
Preprocess corpus
[=] 11                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 11 sentences, 243 tokens
Save corpus
[=] 11                                                           
Corpus has been saved
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 243 tokens
Save corpus
[=] 11                                                           
Corpus has been saved
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 243 tokens
Load corpus... done.
Preprocess corpus
[=] 20                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 20 sentences, 324 tokens
Save corpus
[=] 20                                                           
Corpus has been saved



token 155: 59-летний != 59
token 155: 59-летний != 59
token 157 (-) is not found
token 157 (-) is not found
token 158 (летний) is not found
token 158 (летний) is not found

715: _dataset/rucoref/rucoref_texts/lentaru/lentaru006.txt
Load corpus
[=] 20                                                           
Corpus has been loaded: 20 sentences, 324 tokens
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 30908.65it/s]
Processing corpus
100%|██████████| 20/20 [00:00<00:00, 66841.50it/s]


Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 324 tokens
Save corpus
[> 0                                                             [=] 20                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 20                                                           
Corpus has been loaded: 20 sentences, 324 tokens
Load corpus... 


716: _dataset/rucoref/rucoref_texts/lentaru/lentaru007.txt
Load corpus
[=] 17                                                           
Corpus has been loaded: 17 sentences, 294 tokens
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 44315.21it/s]
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 148239.43it/s]

done.
Preprocess corpus
[> 0                                                             [=] 17                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 17 sentences, 294 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 294 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 294 tokens
Load corpus... 

token 778: 25-летний != 25
token 778: 25-летний != 25
token 780 (-) is not found
token 780 (-) is not found
token 781 (летний) is not found
token 781 (летний) is not found
token 798: 19-летний != 19
token 798: 19-летний != 19
token 800 (-) is not found
token 800 (-) is not found
token 801 (летний) is not found
token 801 (летний) is not found
token 1292: 109-я != 109
token 1292: 109-я != 109
token 1295 (-) is not found
token 1295 (-) is not found
token 1296 (я) is not found
token 1296 (я) is not found

717: _dataset/rucoref/rucoref_texts/lentaru/lentaru010.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 273 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 41675.13it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 80549.05it/s]

done.
Preprocess corpus
[> 0                                                             [=] 14                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 14 sentences, 273 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 273 tokens
Save corpus
[> 0                                                             [=] 14                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 14                                                           
Corpus has been loaded: 14 sentences, 273 tokens
Load corpus... 

token 282: Чурюмова-Герасименко != Чурюмова
token 282: Чурюмова-Герасименко != Чурюмова
token 290 (-) is not found
token 290 (-) is not found
token 291 (Герасименко) is not found
token 291 (Герасименко) is not found
token 1330: Чурюмова-Герасименко != Чурюмова
token 1330: Чурюмова-Герасименко != Чурюмова
token 1338 (-) is not found
token 1338 (-) is not found
token 1339 (Герасименко) is not found
token 1339 (Герасименко) is not found

718: _dataset/rucoref/rucoref_texts/lentaru/lentaru014.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 360 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 129854.61it/s]
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 305410.49it/s]

done.
Preprocess corpus
[> 0                                                             [=] 30                                                           
Corpus has been processed: 1 documents, 19 paragraphs, 30 sentences, 360 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 360 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 360 tokens
Load corpus... 


719: _dataset/rucoref/rucoref_texts/lentaru/lentaru008.txt
Load corpus
[=] 13                                                           
Corpus has been loaded: 13 sentences, 217 tokens
Processing corpus
100%|██████████| 13/13 [00:00<00:00, 23852.12it/s]
Processing corpus
100%|██████████| 13/13 [00:00<00:00, 40691.01it/s]

done.
Preprocess corpus
[=] 13                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 13 sentences, 217 tokens
Save corpus
[=] 13                                                           
Corpus has been saved
Load corpus
[=] 13                                                           
Corpus has been loaded: 13 sentences, 217 tokens
Save corpus
[=] 13                                                           
Corpus has been saved
Load corpus
[=] 13                                                           
Corpus has been loaded: 13 sentences, 217 tokens
Load corpus... done.
Preprocess corpus
[> 0                                                             



720: _dataset/rucoref/rucoref_texts/lentaru/lentaru011.txt
Load corpus
[=] 30                                                           
Corpus has been loaded: 30 sentences, 588 tokens
Processing corpus
100%|██████████| 30/30 [00:00<00:00, 43812.37it/s]

[=] 30                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 30 sentences, 588 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved



Processing corpus
100%|██████████| 30/30 [00:00<00:00, 120989.54it/s]


Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 588 tokens
Save corpus
[> 0                                                             [=] 30                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 30                                                           
Corpus has been loaded: 30 sentences, 588 tokens
Load corpus... 

token 177: пресс-секретарь != пресс
token 177: пресс-секретарь != пресс
token 182 (-) is not found
token 182 (-) is not found
token 183 (секретарь) is not found
token 183 (секретарь) is not found
token 230 (@) is not found
token 230 (@) is not found
token 231: ссылка != durov
token 231: ссылка != durov
token 450: пресс-секретаря != пресс
token 450: пресс-секретаря != пресс
token 455 (-) is not found
token 455 (-) is not found
token 456 (секретаря) is not found
token 456 (секретаря) is not found
token 1381: пресс-секретаря != пресс
token 1381: пресс-секретаря != пресс
token 1386 (-) is not found
token 1386 (-) is not found
token 1387 (секретаря) is not found
token 1387 (секретаря) is not found

721: _dataset/rucoref/rucoref_texts/lentaru/lentaru012.txt
Load corpus
[=] 18                                                           
Corpus has been loaded: 18 sentences, 328 tokens
Processing corpus
100%|██████████| 18/18 [00:00<00:00, 86778.70it/s]
Processing corpus
100%|██████████| 18/18 [

done.
Preprocess corpus
[> 0                                                             [=] 18                                                           
Corpus has been processed: 1 documents, 5 paragraphs, 18 sentences, 328 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 328 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 328 tokens
Load corpus... 


722: _dataset/rucoref/rucoref_texts/lentaru/lentaru003.txt
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 234 tokens
Processing corpus
100%|██████████| 11/11 [00:00<00:00, 36616.94it/s]
Processing corpus
100%|██████████| 11/11 [00:00<00:00, 60152.99it/s]

done.
Preprocess corpus
[=] 11                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 11 sentences, 234 tokens
Save corpus
[=] 11                                                           
Corpus has been saved
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 234 tokens
Save corpus
[=] 11                                                           
Corpus has been saved
Load corpus
[=] 11                                                           
Corpus has been loaded: 11 sentences, 234 tokens
Load corpus... 



723: _dataset/rucoref/rucoref_texts/lentaru/lentaru013.txt
Load corpus
[=] 8                                                            
Corpus has been loaded: 8 sentences, 166 tokens
Processing corpus
  0%|          | 0/8 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 8                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 8 sentences, 166 tokens
Save corpus
[> 0                                                             [=] 8                                                           
Corpus has been saved


100%|██████████| 8/8 [00:00<00:00, 15880.00it/s]
Processing corpus
100%|██████████| 8/8 [00:00<00:00, 19097.57it/s]


Load corpus
[> 0                                                             [=] 8                                                           
Corpus has been loaded: 8 sentences, 166 tokens
Save corpus
[> 0                                                             [=] 8                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 8                                                           
Corpus has been loaded: 8 sentences, 166 tokens
Load corpus... 

token 886: этно-религиозного != этно
token 886: этно-религиозного != этно
token 890 (-) is not found
token 890 (-) is not found
token 891 (религиозного) is not found
token 891 (религиозного) is not found

724: _dataset/rucoref/rucoref_texts/lentaru/lentaru001.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 485 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 218660.40it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 9 paragraphs, 22 sentences, 485 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved



Processing corpus
100%|██████████| 22/22 [00:00<00:00, 124527.24it/s]


Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 485 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 485 tokens
Load corpus... 


725: _dataset/rucoref/rucoref_texts/lentaru/lentaru002.txt
Load corpus
[=] 15                                                           
Corpus has been loaded: 15 sentences, 346 tokens
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 43299.77it/s]
Processing corpus
100%|██████████| 15/15 [00:00<00:00, 109607.25it/s]

done.
Preprocess corpus
[> 0                                                             [=] 15                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 15 sentences, 346 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 346 tokens
Save corpus
[> 0                                                             [=] 15                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 15                                                           
Corpus has been loaded: 15 sentences, 346 tokens
Load corpus... 


726: _dataset/rucoref/rucoref_texts/lentaru/lentaru020.txt
Load corpus
[=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 172646.90it/s]
Processing corpus
100%|██████████| 17/17 [00:00<00:00, 116318.38it/s]

done.
Preprocess corpus
[> 0                                                             [=] 17                                                           
Corpus has been processed: 1 documents, 8 paragraphs, 17 sentences, 348 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Save corpus
[> 0                                                             [=] 17                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 17                                                           
Corpus has been loaded: 17 sentences, 348 tokens
Load corpus... 


727: _dataset/rucoref/rucoref_texts/lentaru/lentaru004.txt
Load corpus
[=] 22                                                           
Corpus has been loaded: 22 sentences, 429 tokens
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 79891.50it/s]
Processing corpus
100%|██████████| 22/22 [00:00<00:00, 89240.51it/s]

done.
Preprocess corpus
[> 0                                                             [=] 22                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 22 sentences, 429 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 429 tokens
Save corpus
[> 0                                                             [=] 22                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 22                                                           
Corpus has been loaded: 22 sentences, 429 tokens
Load corpus... 

token 200: Kuk-ryol != Kuk
token 200: Kuk-ryol != Kuk
token 203 (-) is not found
token 203 (-) is not found
token 204 (ryol) is not found
token 204 (ryol) is not found

728: _dataset/rucoref/rucoref_texts/lentaru/lentaru017.txt
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 253 tokens
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 68042.01it/s]
Processing corpus
100%|██████████| 14/14 [00:00<00:00, 146800.64it/s]

done.
Preprocess corpus
[=] 14                                                           
Corpus has been processed: 1 documents, 6 paragraphs, 14 sentences, 253 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 253 tokens
Save corpus
[=] 14                                                           
Corpus has been saved
Load corpus
[=] 14                                                           
Corpus has been loaded: 14 sentences, 253 tokens
Load corpus... done.
Preprocess corpus
[=] 5                                                            


token 159: 26-летнее != 26
token 159: 26-летнее != 26
token 161 (-) is not found
token 161 (-) is not found
token 162 (летнее) is not found
token 162 (летнее) is not found
token 306: Лайси-Тауншипе != Лайси
token 306: Лайси-Тауншипе != Лайси
token 311 (-) is not found
token 311 (-) is not found
token 312 (Тауншипе) is not found
token 312 (Тауншипе) is not found

729: _dataset/rucoref/rucoref_texts/lentaru/lentaru009.txt
Load corpus
[=] 5                                                            
Corpus has been loaded: 5 sentences, 121 tokens
Processing corpus
100%|██████████| 5/5 [00:00<00:00, 43781.88it/s]



Corpus has been processed: 1 documents, 3 paragraphs, 5 sentences, 121 tokens
Save corpus
[> 0                                                             [=] 5                                                           
Corpus has been saved


Processing corpus
100%|██████████| 5/5 [00:00<00:00, 21269.29it/s]

Load corpus
[> 0                                                             [=] 5                                                           
Corpus has been loaded: 5 sentences, 121 tokens
Save corpus
[> 0                                                             [=] 5                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 5                                                           
Corpus has been loaded: 5 sentences, 121 tokens
Load corpus... 


token 182: 89-летний != 89
token 182: 89-летний != 89
token 184 (-) is not found
token 184 (-) is not found
token 185 (летний) is not found
token 185 (летний) is not found
token 470: 13-миллионной != 13
token 470: 13-миллионной != 13
token 472 (-) is not found
token 472 (-) is not found
token 473 (миллионной) is not found
token 473 (миллионной) is not found

730: _dataset/rucoref/rucoref_texts/libru/kowalewskij.txt
Load corpus
[=] 27                                                           
Corpus has been loaded: 27 sentences, 407 tokens
Processing corpus
100%|██████████| 27/27 [00:00<00:00, 42208.80it/s]

done.
Preprocess corpus
[> 0                                                             [=] 27                                                           
Corpus has been processed: 1 documents, 11 paragraphs, 27 sentences, 407 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved



Processing corpus
100%|██████████| 27/27 [00:00<00:00, 176395.96it/s]


Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 407 tokens
Save corpus
[> 0                                                             [=] 27                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 27                                                           
Corpus has been loaded: 27 sentences, 407 tokens
Load corpus... 


731: _dataset/rucoref/rucoref_texts/libru/ermolaew2.txt
Load corpus
[=] 18                                                           
Corpus has been loaded: 18 sentences, 411 tokens
Processing corpus
100%|██████████| 18/18 [00:00<00:00, 21651.12it/s]
Processing corpus
100%|██████████| 18/18 [00:00<00:00, 83979.39it/s]

done.
Preprocess corpus
[> 0                                                             [=] 18                                                           
Corpus has been processed: 1 documents, 8 paragraphs, 18 sentences, 411 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 411 tokens
Save corpus
[> 0                                                             [=] 18                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 18                                                           
Corpus has been loaded: 18 sentences, 411 tokens
Load corpus... 


732: _dataset/rucoref/rucoref_texts/libru/ufo.txt
Load corpus
[=] 41                                                           
Corpus has been loaded: 41 sentences, 701 tokens
Processing corpus
100%|██████████| 41/41 [00:00<00:00, 128046.51it/s]

done.
Preprocess corpus
[> 0                                                             [=] 41                                                           
Corpus has been processed: 1 documents, 10 paragraphs, 41 sentences, 701 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved



Processing corpus
100%|██████████| 41/41 [00:00<00:00, 362034.66it/s]


Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 701 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 701 tokens
Load corpus... 


733: _dataset/rucoref/rucoref_texts/libru/filippow.txt
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1005 tokens
Processing corpus
  0%|          | 0/103 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been processed: 1 documents, 26 paragraphs, 103 sentences, 1005 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved


100%|██████████| 103/103 [00:00<00:00, 477.39it/s]
Processing corpus
100%|██████████| 103/103 [00:00<00:00, 475.32it/s]


Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1005 tokens
Save corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been saved
Load corpus
[> 0                                                             [> 100                                                            [=] 103                                                           
Corpus has been loaded: 103 sentences, 1005 tokens
Load corpus... 


734: _dataset/rucoref/rucoref_texts/libru/kolobkowa.txt
Load corpus
[=] 88                                                           
Corpus has been loaded: 88 sentences, 744 tokens
Processing corpus
  0%|          | 0/88 [00:00<?, ?it/s]

done.
Preprocess corpus
[> 0                                                             [=] 88                                                           
Corpus has been processed: 1 documents, 28 paragraphs, 88 sentences, 744 tokens
Save corpus
[> 0                                                             [=] 88                                                           
Corpus has been saved


100%|██████████| 88/88 [00:00<00:00, 418.15it/s]
Processing corpus
100%|██████████| 88/88 [00:00<00:00, 401.17it/s]

Load corpus
[> 0                                                             [=] 88                                                           
Corpus has been loaded: 88 sentences, 744 tokens
Save corpus
[> 0                                                             [=] 88                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 88                                                           
Corpus has been loaded: 88 sentences, 744 tokens
Load corpus... 



735: _dataset/rucoref/rucoref_texts/libru/lyalin2.txt
Load corpus
[=] 44                                                           
Corpus has been loaded: 44 sentences, 517 tokens
Processing corpus
100%|██████████| 44/44 [00:00<00:00, 343028.58it/s]


done.
Preprocess corpus
[> 0                                                             [=] 44                                                           
Corpus has been processed: 1 documents, 16 paragraphs, 44 sentences, 517 tokens
Save corpus
[> 0                                                             [=] 44                                                           
Corpus has been saved


Processing corpus
100%|██████████| 44/44 [00:00<00:00, 158683.90it/s]


Load corpus
[> 0                                                             [=] 44                                                           
Corpus has been loaded: 44 sentences, 517 tokens
Save corpus
[> 0                                                             [=] 44                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 44                                                           
Corpus has been loaded: 44 sentences, 517 tokens
Load corpus... 


736: _dataset/rucoref/rucoref_texts/libru/boldyrewae.txt
Load corpus
[=] 16                                                           
Corpus has been loaded: 16 sentences, 362 tokens
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 147168.56it/s]
Processing corpus
100%|██████████| 16/16 [00:00<00:00, 169895.86it/s]

done.
Preprocess corpus
[> 0                                                             [=] 16                                                           
Corpus has been processed: 1 documents, 7 paragraphs, 16 sentences, 362 tokens
Save corpus
[> 0                                                             [=] 16                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 16                                                           
Corpus has been loaded: 16 sentences, 362 tokens
Save corpus
[> 0                                                             [=] 16                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 16                                                           
Corpus has been loaded: 16 sentences, 362 tokens
Load corpus... 


737: _dataset/rucoref/rucoref_texts/libru/maslov.txt
Load corpus
[=] 29                                                           
Corpus has been loaded: 29 sentences, 360 tokens
Processing corpus
100%|██████████| 29/29 [00:00<00:00, 75083.22it/s]
Processing corpus
100%|██████████| 29/29 [00:00<00:00, 79292.58it/s]

done.
Preprocess corpus
[> 0                                                             [=] 29                                                           
Corpus has been processed: 1 documents, 4 paragraphs, 29 sentences, 360 tokens
Save corpus
[> 0                                                             [=] 29                                                           
Corpus has been saved





Load corpus
[> 0                                                             [=] 29                                                           
Corpus has been loaded: 29 sentences, 360 tokens
Save corpus
[> 0                                                             [=] 29                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 29                                                           
Corpus has been loaded: 29 sentences, 360 tokens
Load corpus... 


738: _dataset/rucoref/rucoref_texts/libru/ugrumowa.txt
Load corpus
[=] 41                                                           
Corpus has been loaded: 41 sentences, 724 tokens
Processing corpus
100%|██████████| 41/41 [00:00<00:00, 105371.61it/s]


done.
Preprocess corpus
[> 0                                                             [=] 41                                                           
Corpus has been processed: 1 documents, 12 paragraphs, 41 sentences, 724 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved


Processing corpus
100%|██████████| 41/41 [00:00<00:00, 360516.70it/s]


Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 724 tokens
Save corpus
[> 0                                                             [=] 41                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 41                                                           
Corpus has been loaded: 41 sentences, 724 tokens
Load corpus... 

token 37: Санкт-Петербург != Санкт
token 37: Санкт-Петербург != Санкт
token 42 (-) is not found
token 42 (-) is not found
token 43 (Петербург) is not found
token 43 (Петербург) is not found
token 1175: Санкт-Петербурге != Санкт
token 1175: Санкт-Петербурге != Санкт
token 1180 (-) is not found
token 1180 (-) is not found
token 1181 (Петербурге) is not found
token 1181 (Петербурге) is not found
token 2241: Санкт-Петербург != Санкт
token 2241: Санкт-Петербург != Санкт
token 2246 (-) is not found
token 2246 (-) is not found
token 2247 (Петербург) is not found
token 2247 (Петербург) is not found

739: _dataset/rucoref/rucoref_texts/libru/yaglov.txt
Load corpus
[=] 49                                                           
Corpus has been loaded: 49 sentences, 519 tokens
Processing corpus
100%|██████████| 49/49 [00:00<00:00, 193887.64it/s]

done.
Preprocess corpus
[> 0                                                             [=] 49                                                           
Corpus has been processed: 1 documents, 26 paragraphs, 49 sentences, 519 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved



Processing corpus
100%|██████████| 49/49 [00:00<00:00, 301350.29it/s]



Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 519 tokens
Save corpus
[> 0                                                             [=] 49                                                           
Corpus has been saved
Load corpus
[> 0                                                             [=] 49                                                           
Corpus has been loaded: 49 sentences, 519 tokens
