In [2]:
import json

import conllu
import fire
import os
import pandas as pd
from bs4 import BeautifulSoup
from isanlp_srl_framebank.pipeline_default import PipelineDefault
import re
import sys

from isanlp.processor_remote import ProcessorRemote
from isanlp.processor_syntaxnet_remote import ProcessorSyntaxNetRemote
from isanlp import PipelineCommon
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
from isanlp_srl_framebank.processor_srl_framebank import ProcessorSrlFramebank
import json
from pprint import pprint as print_
from collections import OrderedDict
import numpy as np
from numpy.random import RandomState
from tqdm import tqdm_notebook as tqdm

  from cryptography import utils, x509
[nltk_data] Downloading package punkt to /home/lkmfwe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lkmfwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/lkmfwe/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/lkmfwe/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [3]:
# Working directory: passed to ProcessorSrlFramebank below and expected to contain test_data.json.
WORKDIR = "../../../workdir2_nocoref"

In [4]:
# Seed fed to np.random.seed() before corpus examples are sampled in random_texts / random_predictions.
random_seed = 41

# Role labels that compute_metrics scores by default (its `roleset` argument):
# gold arguments whose tag is not in this set are left out of the evaluation.
# NOTE(review): presumably the 44-role FrameBank inventory — confirm against the training setup.
roleset44 = {
 'содержание высказывания',
 'говорящий',
 'субъект социального отношения',
 'субъект психологического состояния',
 'содержание действия',
 'агенс',
 'тема',
 'конечная точка',
 'сфера',
 'контрагент',
 'субъект перемещения',
 'причина',
 'субъект поведения',
 'ситуация в фокусе',
 'исходный посессор',
 'субъект физиологической реакции',
 'адресат',
 'пациенс',
 'срок',
 'источник звука',
 'место',
 'признак',
 'потенциальная угроза',
 'субъект ментального состояния',
 'конечный посессор',
 'результат',
 'стимул',
 'субъект восприятия',
 'эффектор',
 'траектория',
 'содержание мысли',
 'пациенс перемещения',
 'каузатор',
 'предмет высказывания',
 'начальная точка',
 'способ',
 'пациенс социального отношения',
 'статус',
 'предмет мысли',
 'цель',
 'потенциальный пациенс',
 'контрагент социального отношения',
 'эталон',
 'признак действия'}

# Part-of-speech tags a gold argument must carry to be scored by compute_metrics.
# NOTE(review): these look like Universal Dependencies tags as produced by ConverterMystemToUd — confirm.
ARGUMENT_POSTAGS = {
        'NOUN',
        'PRON',
        'ADJ',
        'PROPN'
    }

def get_roles_pred(lemma, role_annot, part_id):
    """Extract the predicted predicates and arguments of one sentence.

    Args:
        lemma: Per-sentence lemma sequences, indexed as
            ``lemma[part_id][token_index]``.
        role_annot: Per-sentence sequences of SRL events. Each event exposes
            ``pred`` (its first element is used as the predicate token index)
            and ``args`` (objects with ``tag`` and ``begin`` attributes).
        part_id: Index of the sentence to read from both inputs.

    Returns:
        A ``(predicates, arguments)`` tuple of dicts keyed by predicate token
        index. ``predicates[idx]`` is ``{'lemma': ...}``; ``arguments[idx]`` is
        a list of ``{'tag', 'lemma', 'idx'}`` dicts, one per event argument.
        If two events share a predicate index, the later event wins.
    """
    predicates = {}
    arguments = {}
    for event in role_annot[part_id]:
        pred_idx = event.pred[0]
        predicates[pred_idx] = {'lemma': lemma[part_id][pred_idx]}
        arguments[pred_idx] = [
            {
                'tag': arg.tag,
                'lemma': lemma[part_id][arg.begin],
                'idx': arg.begin,
            }
            for arg in event.args
        ]

    return predicates, arguments


def get_example(corpus, ex_number, part_id):
    """Rebuild the raw text of one corpus sentence from its token forms.

    Args:
        corpus: Sequence of examples; tokens of a sentence are read from
            ``corpus[ex_number][1][part_id]`` and each token is a mapping
            with a ``'form'`` key.
        ex_number: Index of the example inside ``corpus``.
        part_id: Index of the sentence inside the example.

    Returns:
        The token forms joined with single spaces, or ``'_'`` when the
        sentence has no tokens.
    """
    def strip_space_before_punct(form):
        # Applied to each token form separately (before joining), so it only
        # affects forms that themselves contain " <punct>".
        for mark in ':;,.!?':
            form = form.replace(' ' + mark, mark)
        return form

    forms = [
        strip_space_before_punct(token['form'])
        for token in corpus[ex_number][1][part_id]
    ]
    return ' '.join(forms) if forms else '_'


def get_roles_true(annot, corpus, ex_number, part_id):
    """Extract gold predicates and arguments of one corpus sentence.

    Args:
        annot: Pipeline output for the same text; ``annot['postag']`` is a
            list of per-sentence tag lists, which is flattened and indexed by
            corpus token position.
        corpus: Sequence of examples; tokens are read from
            ``corpus[ex_number][1][part_id]``.
        ex_number: Index of the example inside ``corpus``.
        part_id: Index of the sentence inside the example.

    Returns:
        A ``(predicates, arguments)`` tuple. ``predicates`` maps the token
        position of every token ranked ``'Предикат'`` to ``{'lemma': ...}``.
        ``arguments`` maps each token's ``'fillpred'`` value to a list of
        ``{'lemma', 'tag', 'idx', 'postag'}`` dicts for the other ranked tokens.
    """
    flat_postags = [tag for sentence_tags in annot['postag'] for tag in sentence_tags]
    predicates = {}
    arguments = {}

    for token_idx, token in enumerate(corpus[ex_number][1][part_id]):
        # Tokens without a 'rank' take no part in the role annotation.
        if 'rank' not in token:
            continue

        if token['rank'] == 'Предикат':
            predicates[token_idx] = {'lemma': token['lemma']}
            continue

        argument = {
            # Fall back to the surface form when the corpus has no lemma.
            'lemma': token['lemma'] if 'lemma' in token else token['form'],
            'tag': token['rolepred1'],
            'idx': token_idx,
            'postag': flat_postags[token_idx],
        }
        arguments.setdefault(token['fillpred'], []).append(argument)

    return predicates, arguments

def random_texts(corpus, ppl, n_samples=100):
    """Return the texts of a seeded sample of corpus examples.

    Args:
        corpus: Sequence of corpus examples understood by ``get_example``.
        ppl: Unused; kept so the signature mirrors ``random_predictions``.
        n_samples: Number of examples to draw. When the corpus has at most
            this many examples, all of them are used in corpus order.

    Returns:
        List of texts built from sentence 0 of every selected example.
        Sampling reseeds the global NumPy RNG with ``random_seed`` and uses
        ``np.random.choice`` with its defaults.
    """
    corpus_size = len(corpus)
    if corpus_size > n_samples:
        np.random.seed(random_seed)
        samples_idxs = np.random.choice(corpus_size, size=n_samples)
    else:
        samples_idxs = list(range(corpus_size))

    return [get_example(corpus, ex_num, 0) for ex_num in samples_idxs]

def random_predictions(corpus, ppl, n_samples=100):
    """Run the pipeline on a seeded corpus sample and collect gold/predicted roles.

    Args:
        corpus: Sequence of corpus examples understood by ``get_example`` and
            ``get_roles_true``.
        ppl: Callable that maps a text to an annotation mapping containing at
            least ``'lemma'``, ``'srl'`` and ``'postag'``.
        n_samples: Number of examples to draw. When the corpus has at most
            this many examples, all of them are used in corpus order.

    Returns:
        ``(true_roles, pred_roles, texts)``: per-example gold
        ``(predicates, arguments)`` pairs, per-example predicted pairs, and the
        analysed texts. Only sentence 0 of each example is considered.
    """
    corpus_size = len(corpus)
    if corpus_size > n_samples:
        np.random.seed(random_seed)
        samples_idxs = np.random.choice(corpus_size, size=n_samples)
    else:
        samples_idxs = list(range(corpus_size))

    texts = [get_example(corpus, ex_num, 0) for ex_num in samples_idxs]

    annotations = [ppl(text) for text in tqdm(texts, desc='Analyzing texts')]
    pred_roles = [
        get_roles_pred(annotation['lemma'], annotation['srl'], 0)
        for annotation in annotations
    ]
    true_roles = [
        get_roles_true(annotation, corpus, ex_num, 0)
        for annotation, ex_num in zip(annotations, samples_idxs)
    ]

    # Composite gold labels are collapsed onto a single role name before scoring.
    repl_roles = {
        'агенс - субъект восприятия' : 'субъект восприятия',
        'агенс - субъект ментального состояния' : 'субъект ментального состояния',
        'результат / цель' : 'результат',
        'место - пациенс' : 'место',
        'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
    }

    for composite, single in repl_roles.items():
        for _, gold_arguments in true_roles:
            for gold_args in gold_arguments.values():
                for gold_arg in gold_args:
                    gold_arg['tag'] = gold_arg['tag'].replace(composite, single)

    return true_roles, pred_roles, texts


def compute_metrics(y_pred, y_true, report_to=sys.stdout, roleset=roleset44, idxmatching=False):
    """Score predicted semantic-role arguments against the gold annotation.

    Only predicates present in both the gold and the predicted annotation are
    scored, and only gold arguments whose tag is in ``roleset`` and whose
    part-of-speech tag is in ``ARGUMENT_POSTAGS`` are counted. A predicate
    with no such gold argument contributes nothing, including its predictions.

    Args:
        y_pred: Per-example ``(predicates, arguments)`` pairs as returned by
            ``get_roles_pred``.
        y_true: Per-example ``(predicates, arguments)`` pairs as returned by
            ``get_roles_true``; aligned with ``y_pred`` by position.
        report_to: Writable text stream that receives the detailed trace.
        roleset: Gold role tags that take part in the evaluation.
        idxmatching: If true, a prediction matches a gold argument when the
            token index agrees; otherwise the role tag must agree as well.

    Returns:
        Dict with ``'recall'``, ``'precision'``, ``'f1'`` and ``'errors'``.
        ``'errors'`` lists every scored predicate whose gold arguments were
        not all matched. A metric whose denominator is zero is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    def print_func(message):
        print(message, file=report_to)

    def safe_ratio(numerator, denominator):
        # Empty evaluations (nothing scored / nothing predicted) yield 0.0.
        return numerator / denominator if denominator else 0.0

    true_positive = 0
    condition_positive = 0
    predicted_condition_positive = 0
    error_examples = []

    for i, (true_predicates, true_arguments) in enumerate(y_true):
        print_func(f"Inspecting example {i}")
        print_func(f"Expecting true predicates {true_predicates}")
        print_func(f"Expecting true arguments  {true_arguments}")

        pred_predicates, pred_arguments = y_pred[i]

        print_func(f"Got predicted predicates  {pred_predicates}")
        print_func(f"Got predicted arguments   {pred_arguments}")

        print_func("-"*60)

        for true_pred_idx, true_predicate in true_predicates.items():
            if true_pred_idx not in pred_predicates:
                continue

            print_func(f"Matched predicate {true_pred_idx} = {true_predicate}")

            # A gold predicate may have no annotated arguments at all, in
            # which case it has no entry in the arguments dict.
            gold_candidates = true_arguments.get(true_pred_idx, [])
            pred_arguments_i = pred_arguments.get(true_pred_idx, [])

            true_arguments_i = [
                gold for gold in gold_candidates
                if gold['tag'] in roleset and gold['postag'] in ARGUMENT_POSTAGS
            ]
            if not true_arguments_i:
                continue

            print_func(f"Expecting arguments  {true_arguments_i}")
            print_func(f"Got predicted        {pred_arguments_i}")
            print_func(f"Predicted Condition Positive = {len(pred_arguments_i)}")
            print_func(f"Condition Positive           = {len(true_arguments_i)}")

            condition_positive_i = len(true_arguments_i)
            condition_positive += condition_positive_i
            predicted_condition_positive += len(pred_arguments_i)

            # Each gold argument counts as a true positive at most once, even
            # if several predictions point at the same token.
            true_positive_i = sum(
                1
                for gold in true_arguments_i
                if any(
                    cand['idx'] == gold['idx']
                    and (idxmatching or cand['tag'] == gold['tag'])
                    for cand in pred_arguments_i
                )
            )

            print_func(f"True Positive = {true_positive_i}")
            if true_positive_i != condition_positive_i:
                error_examples.append({
                    'example_idx' : i,
                    'predicate': true_predicate,
                    'true_arguments' : true_arguments_i,
                    'predicted_arguments': pred_arguments_i
                })

            true_positive += true_positive_i

        print_func("="*60)

    recall = safe_ratio(true_positive, condition_positive)
    precision = safe_ratio(true_positive, predicted_condition_positive)
    if precision + recall:
        f1 = 2 * ((precision*recall)/(precision+recall))
    else:
        f1 = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f1,
        'errors': error_examples
    }

In [5]:
# SRL processor built from WORKDIR; used by srl_ppl as the last pipeline step.
# NOTE(review): 'elmo' presumably selects the embedding/model variant — confirm against ProcessorSrlFramebank.
srl_proc = ProcessorSrlFramebank(WORKDIR, 'elmo')

def srl_ppl(tokens, postag, morph, lemma, syntax_dep_tree):
    """Pipeline step: forward the annotations to the module-level ``srl_proc``.

    All five arguments are passed through unchanged and in the same order;
    the value returned by ``srl_proc`` is returned as is.
    """
    return srl_proc(tokens, postag, morph, lemma, syntax_dep_tree)

# Full analysis pipeline, called as ppl(text). Steps, in order:
#   1. remote processor on localhost:3333 -> tokens, sentences, mystem_postag, lemma
#   2. remote SyntaxNet on localhost:3334 -> syntax_dep_tree
#   3. ConverterMystemToUd -> morph, postag
#   4. srl_ppl (local SRL model) -> srl
# Each tuple is (processor, input annotation names, {processor output name: name in the result}).
# NOTE(review): both remote services must be running before this pipeline is called — confirm ports.
ppl = PipelineCommon([(ProcessorRemote('localhost', 3333, 'default'),
                           ['text'],
                           {'tokens': 'tokens',
                            'sentences': 'sentences',
                            'postag': 'mystem_postag',
                            'lemma': 'lemma'}),
                          (ProcessorSyntaxNetRemote('localhost', 3334),
                           ['tokens', 'sentences'],
                           {'syntax_dep_tree': 'syntax_dep_tree'}),
                          (ConverterMystemToUd(),
                           ['mystem_postag'],
                           {'morph': 'morph',
                            'postag': 'postag'}),
                          (srl_ppl,
                           ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
                           {'srl': 'srl'})])
def sprint_roles(lemma, role_annot):
    """Render SRL events as a human-readable multi-line string.

    Args:
        lemma: Per-sentence lemma sequences, indexed as
            ``lemma[sentence_index][token_index]``.
        role_annot: Per-sentence sequences of events exposing ``pred`` (first
            element is the predicate token index) and ``args`` (objects with
            ``tag`` and ``begin``).

    Returns:
        One ``=====Pred: <lemma>`` line per event followed by one
        ``Arg(<tag>): <lemma>`` line per argument; every line ends with a
        newline. Returns an empty string when there are no events.
    """
    chunks = []
    for sent_num, ann_sent in enumerate(role_annot):
        for event in ann_sent:
            chunks.append('=====Pred: {}\n'.format(lemma[sent_num][event.pred[0]]))
            chunks.extend(
                'Arg({}): {}\n'.format(arg.tag, lemma[sent_num][arg.begin])
                for arg in event.args
            )
    return ''.join(chunks)

def srl(text):
    """Analyse ``text`` with the module-level pipeline ``ppl``.

    Returns:
        ``(annots, report)``: the pipeline output and the string produced by
        ``sprint_roles`` from its ``'lemma'`` and ``'srl'`` annotations.
    """
    annots = ppl(text)
    report = sprint_roles(annots['lemma'], annots['srl'])
    return annots, report




INFO:tensorflow:Saver not created because there are no variables in the graph to restore



In [6]:
from pprint import pprint

In [1]:
# Smoke test of the pipeline on a single sentence.
# Needs the cell that defines `srl` to have been executed first.
annots, prettyprint = srl("Кошка грелась на солнце")

print(prettyprint)
print(list(annots['srl']))

NameError: name 'srl' is not defined

In [7]:
# Load the evaluation corpus from WORKDIR and show how many examples it holds.
corpus_path = f'{WORKDIR}/test_data.json'
with open(corpus_path, encoding='utf-8') as f:
    corpus = json.loads(f.read())
len(corpus)

1000

In [7]:
# Run the pipeline on one sentence and list every predicate of its first
# sentence together with the token index, lemma and role of each argument.
res = ppl('- И за это тебя посадят, -- как бы сообразив, прервала его тётя Катя.')

for event in res['srl'][0]:
    print("Предикат:", res['lemma'][0][event.pred[0]])
    for arg in event.args:
        print(arg.begin, f"({res['lemma'][0][arg.begin]})", arg.tag)

Предикат: посадить
3 (это) эталон
4 (ты) агенс
Предикат: прерывать
13 (он) пациенс
15 (катя) агенс


In [9]:
# n_samples equals the corpus size, so random_predictions takes its non-sampling branch
# and analyses every example in corpus order.
true_roles, pred_roles, tmp_texts = random_predictions(corpus, ppl, n_samples=len(corpus))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


Analyzing texts:   0%|          | 0/1000 [00:00<?, ?it/s]

In [10]:
# Evaluate with index-only matching; the detailed per-example trace goes to log_path.
log_path = 'log_idx.txt'
# The context manager closes the log, so the trace is flushed to disk even if
# compute_metrics raises (the handle used to be opened inline and never closed).
with open(log_path, 'w', encoding='utf-8') as log_file:
    results = compute_metrics(y_pred=pred_roles, y_true=true_roles,
                              report_to=log_file,
                              idxmatching=True)

# Show only the aggregate scores; the per-example error list is too long to print.
copyres = {key: value for key, value in results.items() if key != 'errors'}
print_(copyres)

{'f1': 0.7409470752089136,
 'precision': 0.751412429378531,
 'recall': 0.7307692307692307}
