In [6]:
from IPython.display import display
from natasha.grammars.name import NAME
from yargy.pipelines import morph_pipeline
from yargy.relations import gnc_relation
# pip install ipymarkup
from ipymarkup import show_span_ascii_markup as show_markup

from yargy import (
    Parser,
    or_,
    not_,
    rule
)
from yargy.pipelines import morph_pipeline
from yargy.predicates import (
    eq, in_, dictionary,
    type, gram, is_capitalized, is_upper
)
from yargy.tokenizer import MorphTokenizer
from yargy import interpretation as interp
from yargy.interpretation import fact, attribute

from yargy.tokenizer import EMAIL_RULE, PHONE_RULE

gnc = gnc_relation()

tokenizer = MorphTokenizer().add_rules(EMAIL_RULE, PHONE_RULE)

#Defining basic rules for grammemes and digits
INT = rule(type('INT'))
NOUN = gram('NOUN')
ADJF = gram('ADJF')
PRTF = gram('PRTF')
NUMR = gram('NUMR')
GENT = gram('gent')
DOT = eq('.')
GEOX = rule(gram('Geox'))

In [4]:
from yargy.record import Record
class Synonyms(Record):
    __attributes__ = ['name', 'synonyms']
    
    def __init__(self, name, synonyms=()):
        self.name = name
        self.synonyms = synonyms

Organization = fact('Organization', [attribute('subdivision').repeatable(), 'name'])
ORG_NAMES = [
    Synonyms('Министерство обороны', ['МО', 'минобороны']),
    Synonyms('Министерство внутренних дел', ['МВД', 'полиция']),
    Synonyms('Министерство зравоохранения', ['минздрав']),
    Synonyms('Министерство науки и высшего образования Российской Федерации', ['минобрнауки'])
]

org_names = []
mapping = {}
for record in ORG_NAMES:
    name = record.name
    org_names.append(name)
    mapping[name] = name
    for synonym in record.synonyms:
        org_names.append(synonym)
        mapping[synonym] = name

In [10]:
#function to show matches for rules and facts
def show_matches(rule, *lines):
    parser = Parser(rule, tokenizer)
    for line in lines:
        matches = parser.findall(line)
        matches = sorted(matches, key=lambda _: _.span)
        spans = [_.span for _ in matches]
        show_markup(line, spans)
        if matches:
            facts = [_.fact for _ in matches]
            if len(facts) == 1:
                facts = facts[0]
            display(facts)

In [None]:
Organization = fact(
    'Organization',
    ['unit', 'org_name']
)

ORG_NAMES = [
    Synonyms('Министерство обороны', ['МО', 'минобороны']),
    Synonyms('Министерство внутренних дел', ['МВД', 'полиция']),
    Synonyms('Министерство зравоохранения', ['минздрав']),
    Synonyms('Министерство науки и высшего образования Российской Федерации', ['минобрнауки'])
]

org_names = []
mapping = {}
for record in ORG_NAMES:
    name = record.name
    org_names.append(name)
    mapping[name] = name
    for synonym in record.synonyms:
        org_names.append(synonym)
        mapping[synonym] = name
        
ORG_NAME = morph_pipeline(org_names).interpretation(
    Organization.org_name.normalized()
)


Modifier = fact('Modifier', ['value'])
Subdivision = fact('Subdivision', ['modifier', 'subdiv_type'])

SUBDIVISION_TYPE = morph_pipeline([
    'управление',
    'отдел',
    'отделение',
    'служба',
    'центр',
    'департамент',
    'агентство',
    'сектор',
    'участок',
    'лаборатория'
]).interpretation(Subdivision.subdiv_type.normalized())

Number = fact(
    'Number',
    ['value']
)

NUMBER = rule(
    or_(INT, rule(NUMR)).interpretation(Number.value)
).interpretation(Number)

Adjs = fact(
    'Adjs',
    [attribute('parts').repeatable()]
)

ADJ = or_(
    ADJF,
    PRTF,
).interpretation(
    interp.normalized()
).interpretation(
    Adjs.parts
)

ADJS = ADJ.repeatable(max=3).interpretation(
    Adjs
)


MODIFIER = rule(or_(
    ADJS,
    NUMBER
).interpretation(
    Modifier.value
)).interpretation(Subdivision.modifier)

SUBDIVISION = or_(rule(MODIFIER, SUBDIVISION_TYPE),
                  SUBDIVISION_TYPE
                 ).interpretation(Subdivision)

Unit = fact('Unit', [attribute('parts').repeatable()])

UNIT = SUBDIVISION.interpretation(Unit.parts).repeatable().interpretation(Unit)
UNIT = UNIT.interpretation(Organization.unit)
ORGANIZATION = or_(
    rule(UNIT, ORG_NAME),
    rule(ORG_NAME)
).interpretation(
    Organization
)

In [12]:
#sample text
text = 'сотрудник полиции, 23 отдела первого отделения четвертого управления пятого центра МО Грузии, 1 лаборатории министерства обороны Грузии'

In [13]:
show_matches(ORGANIZATION, "1 отделения второго отдела Министерства обороны, 5 управления полиции")
parser = Parser(ORGANIZATION, tokenizer)
for match in parser.findall(text):
    print('Result:')
    print(match.fact)

1 отделения второго отдела Министерства обороны, 5 управления полиции
───────────────────────────────────────────────  ────────────────────


[Organization(
     unit=Unit(
         parts=[Subdivision(
              modifier=Number(
                  value='1'
              ),
              subdiv_type='отделение'
          ),
          Subdivision(
              modifier=Adjs(
                  parts=['второй']
              ),
              subdiv_type='отдел'
          )]
     ),
     org_name='Министерство обороны'
 ),
 Organization(
     unit=Unit(
         parts=[Subdivision(
              modifier=Number(
                  value='5'
              ),
              subdiv_type='управление'
          )]
     ),
     org_name='полиция'
 )]

Result:
Organization(unit=None, org_name='полиция')
Result:
Organization(unit=Unit(parts=[Subdivision(modifier=Number(value='23'), subdiv_type='отдел'), Subdivision(modifier=Adjs(parts=['первый']), subdiv_type='отделение'), Subdivision(modifier=Adjs(parts=['четвёртый']), subdiv_type='управление'), Subdivision(modifier=Adjs(parts=['пятый']), subdiv_type='центр')]), org_name='МО')
Result:
Organization(unit=Unit(parts=[Subdivision(modifier=Number(value='1'), subdiv_type='лаборатория')]), org_name='Министерство обороны')
