In [1]:
#Razdel — сегментация текста на предложения и токены;
#Navec — качественный компактные эмбеддинги;
#Slovnet — современные компактные модели для морфологии, синтаксиса, NER;
#Yargy — правила и словари для извлечения структурированной информации;
#Ipymarkup — визуализация NER и синтаксической разметки;
!pip install natasha
!pip install natsort
!pip install navec

import os
import sys
import pickle
import re
from natsort import natsorted

import ipymarkup
from natasha import (
    Segmenter,
    MorphVocab,   
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger, 
    PER,
    NamesExtractor,
    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)



In [2]:
def get_filepaths(d):
    # List which will store all of the full filepaths.
    locations = []
    # os.walk to get tree info
    for root, directories, files in os.walk(d):
        for filename in files:
            if filename.endswith(".pkl") and filename.startswith('ch'):
            # join the strings to form filepath
                filepath = os.path.join(root, filename)
                locations.append(filepath)
    # uses natsort to return human-intelligible ordering
    return natsorted(locations)

# store function results in a variable.   
sorted_filepaths = get_filepaths('/Users/Peter/Documents/GitHub/DSAM2019/')    

In [3]:
wholebook = []
for f in sorted_filepaths:
    with open(str(f), 'rb') as f:
        data = pickle.load(f)
        ch = ''.join(data)
        wholebook.append(ch)
        f.close()
    
for f in wholebook:
    doc = Doc(f)
    # divides doc into tokens and sents, given start and stop properties
    doc.segment(segmenter)
    # every token is morphologically tagged, given pos and feats properties
    doc.tag_morph(morph_tagger)
    # named entity recognition
    doc.tag_ner(ner_tagger)
    #doc.sents[0].morph.print()
    print(doc.tokens[0])
    #print(doc.sents[0])
    #print("\n")

'''
for span in doc.spans:
    if span.type == PER:
        span.extract_fact(names_extractor)

for span in doc.spans:
    span.normalize(morph_vocab)
    
for span in doc.spans:
    if span.type == PER:
        span.extract_fact(names_extractor)
        
dates_extractor, money_extractor and addr_extractor
'''

DocToken(stop=6, text='Москва', pos='PROPN', feats=<Inan,Nom,Fem,Sing>)
DocToken(stop=1, text='В', pos='ADP')
DocToken(stop=1, text='–', pos='PUNCT')
DocToken(stop=6, text='Утихли', pos='VERB', feats=<Perf,Ind,Plur,Past,Fin,Mid>)
DocToken(stop=9, text='Старинный', pos='ADJ', feats=<Nom,Pos,Masc,Sing>)
DocToken(stop=5, text='Когда', pos='SCONJ')
DocToken(stop=4, text='Если', pos='SCONJ')
DocToken(stop=3, text='Как', pos='SCONJ')
DocToken(stop=7, text='Никанор', pos='PROPN', feats=<Anim,Nom,Masc,Sing>)
DocToken(stop=1, text='В', pos='ADP')
DocToken(stop=3, text='Бор', pos='PROPN', feats=<Inan,Acc,Masc,Sing>)
DocToken(stop=9, text='Маленький', pos='ADJ', feats=<Nom,Pos,Masc,Sing>)
DocToken(stop=4, text='Итак', pos='ADV', feats=<Pos>)
DocToken(stop=2, text='Не', pos='PART', feats=<Neg>)
DocToken(stop=8, text='Нетрудно', pos='PRON', feats=<Gen>)
DocToken(stop=6, text='Солнце', pos='PROPN', feats=<Inan,Nom,Neut,Sing>)
DocToken(stop=5, text='Утром', pos='NOUN', feats=<Inan,Ins,Neut,Sing>)
Doc

'\nfor span in doc.spans:\n    if span.type == PER:\n        span.extract_fact(names_extractor)\n\nfor span in doc.spans:\n    span.normalize(morph_vocab)\n    \nfor span in doc.spans:\n    if span.type == PER:\n        span.extract_fact(names_extractor)\n        \ndates_extractor, money_extractor and addr_extractor\n'

In [4]:
for f in wholebook:
    doc = Doc(f)   
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    # lemmatizer
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    print(doc.tokens[0])
    #print("\n")
    
'''
syntax.print() to visualize syntax markup

doc.parse_syntax(syntax_parser)
'''

DocToken(stop=6, text='Москва', pos='PROPN', feats=<Inan,Nom,Fem,Sing>, lemma='москва')
DocToken(stop=1, text='В', pos='ADP', lemma='в')
DocToken(stop=1, text='–', pos='PUNCT', lemma='–')
DocToken(stop=6, text='Утихли', pos='VERB', feats=<Perf,Ind,Plur,Past,Fin,Mid>, lemma='утихнуть')
DocToken(stop=9, text='Старинный', pos='ADJ', feats=<Nom,Pos,Masc,Sing>, lemma='старинный')
DocToken(stop=5, text='Когда', pos='SCONJ', lemma='когда')
DocToken(stop=4, text='Если', pos='SCONJ', lemma='если')
DocToken(stop=3, text='Как', pos='SCONJ', lemma='как')
DocToken(stop=7, text='Никанор', pos='PROPN', feats=<Anim,Nom,Masc,Sing>, lemma='никанор')
DocToken(stop=1, text='В', pos='ADP', lemma='в')
DocToken(stop=3, text='Бор', pos='PROPN', feats=<Inan,Acc,Masc,Sing>, lemma='бор')
DocToken(stop=9, text='Маленький', pos='ADJ', feats=<Nom,Pos,Masc,Sing>, lemma='маленький')
DocToken(stop=4, text='Итак', pos='ADV', feats=<Pos>, lemma='итак')
DocToken(stop=2, text='Не', pos='PART', feats=<Neg>, lemma='не')
Doc

'\nsyntax.print() to visualize syntax markup\n\ndoc.parse_syntax(syntax_parser)\n'