### 3. Check Entities by dependency trees

In [2]:
import pandas as pd
import stanza
import re
from spacy_conll import init_parser
from tokenization import *
from keywords import keywords_dict

#### Reading files

In [4]:
# Load the entity table; the first CSV column is used as the index.
# (Disabled alternative: index by 'link' via entities.set_index('link', drop=False).)
entities = pd.read_csv(
    '../data/entities_may.csv',
    index_col=[0],
)

# Show the available columns as the cell output.
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str'],
      dtype='object')

In [5]:
def names_and_kw_str_to_list(names_and_kw_str):
    """Parse a serialized keyword/entity string into a list of tuples.

    The input consists of entries separated by '<+>'. Each entry has fields
    separated by '<#>': field 0 is an integer, field 1 is a text fragment and
    field 2 holds zero or more person records of the form
    '(a, b)<§§>PERS<§§>name<§§>score'.

    Parameters
    ----------
    names_and_kw_str : str or NaN
        Serialized value; a missing value (NaN/None) is accepted.

    Returns
    -------
    list of tuple or None
        One ``(int(field 0), field 1, [a, b], name, float(score))`` tuple per
        person record, in order of appearance; ``None`` for a missing input.

    Raises
    ------
    IndexError
        If an entry has fewer than three '<#>'-separated fields.
    ValueError
        If field 0 is not an integer or a score is not a valid float.
    """
    if not pd.notna(names_and_kw_str):
        return None

    # Raw string: the pattern is full of regex escapes (\(, \d, \., ...) that are
    # invalid escape sequences in an ordinary string literal.
    person_pattern = r'(\(\d+\, \d+\))<§§>PERS<§§>(.*?)<§§>([\d\.]+)'

    res = []
    for ent in names_and_kw_str.split('<+>'):
        parts = ent.split('<#>')
        for span, name, score in re.findall(person_pattern, parts[2]):
            # '(a, b)' -> [a, b]
            span_numbers = [int(i) for i in span.strip('()').split(', ')]
            res.append((int(parts[0]), parts[1], span_numbers, name, float(score)))
    return res

In [6]:
# Load the news articles; the first CSV column is used as the index.
# (Disabled alternative: index by 'link' via news.set_index('link', drop=False).)
news = pd.read_csv('../data/may.csv', index_col=[0])

# Copy over the two entity columns used below; pandas aligns them on the index.
for column in ('filt_kw_names_str', 'kw_and_ent'):
    news[column] = entities[column]

news.columns

Index(['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'domain_alias', 'mycategory',
       'filt_kw_names_str', 'kw_and_ent'],
      dtype='object')

In [7]:
# The two columns needed from `entities` were already copied into `news`,
# so drop the reference to the entities frame.
del entities

In [8]:
# Filtered names: '§'-separated string -> list of names.
news['filt_kw_names'] = news['filt_kw_names_str'].str.split('§')

# Keyword/entity string -> list of parsed tuples (None for missing values).
news['entities'] = news['kw_and_ent'].apply(names_and_kw_str_to_list)
# (Disabled alternative: news['sentenized'] = news.sentences_joined.str.split('<§>'))

# Placeholder column that the checking loop fills in later.
news['checked_with_conllu'] = None

# Title and body joined by a newline (missing parts become ''), then stripped.
news['all_text'] = (
    news['title']
    .str.cat(news['text'], sep='\n', na_rep='')
    .str.strip()
)

In [9]:
%%time
news['sentenized'] = news.apply(lambda row: tokenize_to_sent_str(row.all_text, row.language), axis=1)

CPU times: user 4min 8s, sys: 5.03 s, total: 4min 13s
Wall time: 4min 35s


#### Loading stanza models

In [10]:
# Processors required to produce a dependency parse.
depparse_processors = 'tokenize,pos,lemma,depparse'

# Download the Ukrainian default package and the Russian GSD package.
stanza.download('uk')
stanza.download('ru', package='gsd', processors=depparse_processors)

# Plain stanza pipeline for Ukrainian tokenization and lemmatization.
nlp = stanza.Pipeline('uk', processors='tokenize,lemma')

# Options shared by both CoNLL-producing parsers.
conll_parser_kwargs = dict(is_tokenized=True, include_headers=False)

# Ukrainian dependency parser.
nlp_uk = init_parser(
    "stanza",
    "uk",
    parser_opts={'processors': depparse_processors},
    **conll_parser_kwargs,
)

# Russian dependency parser (GSD package).
nlp_ru = init_parser(
    "stanza",
    "ru",
    parser_opts={'package': 'gsd', 'processors': depparse_processors},
    **conll_parser_kwargs,
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 15.1MB/s]                    
2021-07-10 18:04:00 INFO: Downloading default packages for language: uk (Ukrainian)...
2021-07-10 18:04:02 INFO: File exists: /Users/oksana/stanza_resources/uk/default.zip.
2021-07-10 18:04:06 INFO: Finished downloading models and saved to /Users/oksana/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 11.6MB/s]                    
2021-07-10 18:04:06 INFO: Downloading these customized packages for language: ru (Russian)...
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |
| pretrain  | gsd     |

2021-07-10 18:04:06 INFO: File exists: /Users/oksana/stanza_resources/ru/tokenize/gsd.pt.
2021-07-10 18:04:06 INFO: File exists: /Users/oksana/stanza_resources/ru/pos/gs

#### Checking sentences with keywords with dependency trees

In [12]:
# One alternation over every entry of keywords_dict, each anchored at a word start.
all_keywords = '|'.join(r'\b' + kw for kw in keywords_dict)


# All stems are raw strings: they are regex fragments, and sequences such as
# '\s' are invalid escapes in ordinary string literals.

# Keywords accepted without a dependency-tree check (sentence_check appends True).
_without_check_stems = [
    r'(е|э)ксперт',
    r'анал(і|и)тик',
    r'(про)?цит(ує|ат|ував|увала?)',
]
# "According to ..." constructions (sentence_check uses check_nmod_relation for them).
_according_to_stems = [
    r'за\s(словами|оцінк)',
    r'на\sдумку',
    r'по\sсловам',
    r'по\s(оценк|мнен)',
    r'згідно',
    r'согласно',
    r'посилання',
]
# Keywords that are always rejected (sentence_check appends False).
_wrong_stems = [
    r'посила(вся|лася)',
]

# Each group becomes one alternation with every stem anchored at a word start.
without_check = '|'.join(r'\b' + kw for kw in _without_check_stems)
according_to_keywords = '|'.join(r'\b' + kw for kw in _according_to_stems)
wrong_keywords = '|'.join(r'\b' + kw for kw in _wrong_stems)

In [13]:
def check_nsubj_relation(conllu_df, ent_range, all_keywords):
    """Tell whether an entity token is the subject of a keyword token.

    For every 0-based position in ``ent_range`` the corresponding row
    (position + 1) of ``conllu_df`` is looked up. Only NOUN/PROPN rows are
    considered. From such a row the head links are followed upwards until the
    root; at the first node whose relation starts with 'nsubj' the function
    returns whether the head's form matches ``all_keywords`` at its start
    (case-insensitive). If the root is reached without an 'nsubj' relation,
    the next position is tried.

    Parameters
    ----------
    conllu_df : pandas.DataFrame
        Parsed sentence indexed by token id, with the columns 'upostag',
        'deprel', 'head' and 'form'.
    ent_range : iterable of int
        0-based token positions of the entity.
    all_keywords : str
        Regular expression applied with ``re.match``.

    Returns
    -------
    bool
    """
    nominal_tags = ('NOUN', 'PROPN')
    for position in ent_range:
        node = conllu_df.loc[position + 1]
        if node['upostag'] not in nominal_tags:
            continue
        # Climb towards the root, stopping at the first subject relation.
        while node['deprel'] != 'root':
            parent = conllu_df.loc[node['head']]
            if node['deprel'].startswith('nsubj'):
                return bool(re.match(all_keywords, parent['form'], flags=re.I))
            node = parent
    return False


def check_nmod_relation(conllu_df, ent_range, keyword):
    """Tell whether a keyword lies on the path from an entity token to the root.

    For every 0-based position in ``ent_range`` the corresponding row
    (position + 1) of ``conllu_df`` is looked up. Only NOUN/PROPN rows are
    considered. From such a row the head links are followed upwards until the
    root; the function returns True as soon as the form of the current node or
    of its head matches ``keyword`` at its start (case-insensitive).

    Parameters
    ----------
    conllu_df : pandas.DataFrame
        Parsed sentence indexed by token id, with the columns 'upostag',
        'deprel', 'head' and 'form'.
    ent_range : iterable of int
        0-based token positions of the entity.
    keyword : str
        Regular expression applied with ``re.match``.

    Returns
    -------
    bool
    """
    for position in ent_range:
        node = conllu_df.loc[position + 1]
        if node['upostag'] not in ('NOUN', 'PROPN'):
            continue
        while node['deprel'] != 'root':
            parent = conllu_df.loc[node['head']]
            candidate_forms = (node['form'], parent['form'])
            if any(re.match(keyword, form, flags=re.I) for form in candidate_forms):
                return True
            node = parent
    return False



In [14]:
def sentence_check(entitites_list, sentences, filt_names_list, language):
    """Check every entity of one article against keyword and dependency rules.

    Parameters
    ----------
    entitites_list : list of tuple or other
        Entities as ``(sentence number, keyword text, span, name, score)``.
    sentences : sequence of str
        Sentences of the article, indexed by the entity's sentence number.
    filt_names_list : list or other
        Names that should be checked.
    language : str
        'uk' selects ``nlp_uk``; any other value selects ``nlp_ru``.

    Returns
    -------
    list or None
        ``None`` unless both ``entitites_list`` and ``filt_names_list`` are
        lists. Otherwise one item per entity, in order:

        * ``'NINL'`` if the entity name is not in ``filt_names_list``;
        * ``False`` if the keyword text matches ``wrong_keywords``;
        * ``True`` if the keyword text matches ``without_check``;
        * otherwise the result of ``check_nmod_relation`` (keyword text matches
          ``according_to_keywords``) or ``check_nsubj_relation``.
    """
    if not (isinstance(entitites_list, list) and isinstance(filt_names_list, list)):
        return None

    res = []
    sent_num = None
    conllu_df = None  # parse of sentence `sent_num`, reused for consecutive entities
    for ent in entitites_list:
        if ent[3] not in filt_names_list:
            res.append('NINL')
            continue
        if re.search(wrong_keywords, ent[1], flags=re.I):
            res.append(False)
            continue
        if re.search(without_check, ent[1], flags=re.I):
            res.append(True)
            continue

        # Parse only when the entity belongs to a different sentence than the
        # previously parsed one (or nothing has been parsed yet).
        if conllu_df is None or sent_num != ent[0]:
            sent_num = ent[0]
            parser = nlp_uk if language == 'uk' else nlp_ru
            doc = parser(sentences[sent_num])
            conllu_df = (doc._.conll_pd).set_index('id')

        # NOTE(review): for entities built by names_and_kw_str_to_list, ent[2] is
        # a two-element list, so ent[2][:-1] passes only its first number to the
        # checks below. Confirm whether the whole span was meant to be checked.
        if re.search(according_to_keywords, ent[1], flags=re.I):
            # The last word is literal text taken from the article; escape it so
            # that punctuation is not interpreted as regex syntax by re.match.
            keyword = re.escape(ent[1].split()[-1])
            res.append(check_nmod_relation(conllu_df, ent[2][:-1], keyword))
        else:
            res.append(check_nsubj_relation(conllu_df, ent[2][:-1], all_keywords))

    return res

In [None]:
# CSV file that the batch-checking loop rewrites after every processed batch.
out_file = 'checked_with_conllu_may.csv'

In [22]:
# news = news[['entities', 'sentenized', 'filt_kw_names', 'language', 'link', 'checked_with_conllu']]

In [26]:
%%time
start = 0
end = len(news)
step = 200

for k in range(start, end, step):
    part = news.iloc[k:k+step].apply(lambda row: sentence_check(row.entities, row.sentenized, row.filt_kw_names, row.language), axis=1)

    news['checked_with_conllu'].update(part)
    news['checked_with_conllu'].to_csv(out_file)
    print(k, news['checked_with_conllu'].notna().sum())
    
    del part

100000 62290
100200 62408
100400 62515
100600 62629
100800 62744
101000 62878
101200 63014
101400 63137
101600 63259
101800 63373
102000 63503
102200 63636
102400 63761
102600 63900
102800 64034
103000 64175
103200 64290
103400 64404
103600 64547
103800 64690
104000 64814
104200 64945
104400 65088
104600 65176
104800 65282
105000 65392
105200 65486
105400 65596
105600 65695
105800 65811
106000 65922
106200 66028
106400 66133
106600 66255
106800 66371
107000 66477
107200 66567
107400 66675
107600 66778
107800 66863
108000 66950
108200 67047
108400 67136
108600 67235
108800 67322
109000 67416
109200 67514
109400 67605
109600 67711
109800 67812
110000 67904
110200 67991
110400 68073
110600 68170
110800 68257
111000 68364
111200 68447
111400 68519
111600 68598
111800 68689
112000 68783
112200 68871
112400 68949
112600 69035
112800 69118
113000 69246
113200 69384
113400 69510
113600 69652
113800 69771
114000 69896
114200 70031
114400 70180
114600 70304
114800 70453
115000 70606
115200 70724

In [132]:
# Save each article link with its check results; the index is not written.
export_columns = ['link', 'checked_with_conllu']
news[export_columns].to_csv('conllu_checked_may.csv', index=False)

In [4]:
# s = "Головна економістка інвестиційної компанії Dragon CapitaІ Олена Білан сказала виданню, що у грудні уряду необхідно буде залучити приблизно $3 млрд, щоб профінансувати дефіцит бюджету, не урізавши критичних витрат."
# doc = nlp_uk(s)
# conll = doc._.conll_pd
# print(conll)