In [1]:
import pandas as pd
import re
import pymorphy2
import razdel
# import tokenization
from stop_words import get_stop_words
import json

#### Tokenization

In [2]:
import re
import six
import razdel


# Character with code point 769 (U+0301, combining acute accent). It is spliced
# into the character classes below so that it is accepted alongside `\w`.
ACCENT = six.unichr(769)
# Word-level tokenization pattern used by sent_to_words() for lang == 'uk'.
# Compiled in verbose mode (re.X), so the line breaks between alternatives are
# ignored. Alternatives, in the order they are tried:
#   1. `scheme://...` URL-like runs
#   2. e-mail-like `user@host...` runs
#   3. digits, a hyphen, then Cyrillic letters / apostrophes
#   4. optionally signed numbers with inner digits, commas, dots or hyphens
#   5. words, optionally joined by single connector characters
#   6. NOTE(review): looks intended for dotted abbreviations, but as written it
#      requires a literal `[` - confirm the intent
#   7. a single punctuation / symbol character
#   8. runs of `.`, `!`, `?`
#   9. runs of hyphens
# NOTE(review): the `.` in alternatives 2 and 6 is unescaped, so it matches any
# character rather than a literal dot.
# NOTE(review): in alternative 5 the "`-" directly before ACCENT sits between
# two characters inside the class, i.e. it forms a character range rather than
# a literal hyphen - confirm this is intended.
# NOTE(review): only the first string fragment is a raw string; the later
# fragments contain `\w` / `\[` in non-raw literals.
WORD_TOKENIZATION_RULES = re.compile(r"""
[\w""" + ACCENT + """]+://(?:[a-zA-Z]|[0-9]|[$-_@.&+])+
|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+
|[0-9]+-[а-яА-ЯіїІЇєЄґҐ'’`""" + ACCENT + """]+
|[+-]?[0-9](?:[0-9,.-]*[0-9])?
|[\w""" + ACCENT + """](?:[\w'’`-""" + ACCENT + """]?[\w""" + ACCENT + """]+)*
|[\w""" + ACCENT + """].(?:\[\w""" + ACCENT + """].)+[\w""" + ACCENT + """]?
|["#$%&*+,/:;<=>@^`~…\\(\\)⟨⟩{}\[\|\]‒–—―«»“”‘’'№]
|[.!?]+
|-+
""", re.X | re.U)


# Abbreviations (each written with its trailing dot) after which
# tokenize_sents() must NOT start a new sentence. The triple-quoted text is
# stripped and split on whitespace, so ABBRS is a plain list of strings.
# NOTE(review): the list is not de-duplicated and `tok in ABBRS` scans it
# linearly for every candidate sentence boundary.
ABBRS = """
ім.
в.
о.
т.
п.
д.
под.
ін.
вул.
просп.
бул.
пров.
пл.
г.
р.
див.
п.
с.
м.
н.
е.
адмін.
к.
геогр.
обл.
смт.
авт.
адм.
акад.
англ.
арк.
арт.
археол.
арх.
архіт.
асист.
асоц.
б.
буд.
бух.
бюдж.
вет.
вид.
викл.
відкр.
дип.
діагр.
екол.
екон.
євр.
журн.
зобр.
іл.
інв.
інд.
інж.
іст.
каф.
кл.
коеф.
лаб.
лінгв.
літ.
мат.
мед.
мех.
міс.
муз.
нар.
нац.
орг.
офіц.
пед.
пр.
проф.
публ.
рис.
мал.
pp.
рос.
св.
сл.
ст.
студ.
табл.
тис.
укр.
упр.
фіз.
фін.
ц.
""".strip().split()


def tokenize_sents(string):
    """Split text into sentences using a whitespace-token heuristic.

    The text is cut into whitespace-delimited tokens. A sentence ends after a
    token whose last character is one of ``. ! ? … » ' "`` when the next token
    starts with an uppercase letter or an opening quote, unless the current
    token is a two-character token starting with an uppercase letter (an
    initial such as ``А.``), starts with ``(``, or is listed in ``ABBRS``.

    Args:
        string: Text to split; it is converted with ``str()`` first.

    Returns:
        list[str]: Sentences as slices of the input, so inner whitespace is
        preserved. Empty or whitespace-only input yields ``[]``.
    """
    string = str(string)
    # Raw-string pattern: the previous non-raw '[^\s]+' relied on an invalid
    # escape sequence in a normal string literal.
    spans = list(re.finditer(r'\S+', string))

    sentence_enders = ('.', '!', '?', '…', '»', "'", '"')
    sentence_openers = ("'", '"', '«')

    sentences = []
    sentence_start = 0
    last_index = len(spans) - 1

    for i, span in enumerate(spans):
        if i == last_index:
            # Whatever is left up to the final token is the last sentence.
            sentences.append(string[sentence_start:span.end()])
            break

        tok = span.group()
        if tok[-1] not in sentence_enders:
            continue

        next_tok = spans[i + 1].group()
        next_starts_sentence = next_tok[0].isupper() or next_tok[0] in sentence_openers
        # Initials, parenthesised fragments and known abbreviations end with a
        # dot without ending the sentence.
        is_non_terminal = ((len(tok) == 2 and tok[0].isupper())
                           or tok[0] == '('
                           or tok in ABBRS)
        if next_starts_sentence and not is_non_terminal:
            sentences.append(string[sentence_start:span.end()])
            sentence_start = spans[i + 1].start()

    return sentences


def text_to_sent(text, lang):
    """Split `text` into sentences, line by line, for the given language.

    Ukrainian ('uk') lines go through tokenize_sents(); Russian ('ru') lines go
    through razdel.sentenize(). Any other `lang` yields an empty list and
    `text` is not touched.
    """
    if lang == 'uk':
        split_line = tokenize_sents
    elif lang == 'ru':
        def split_line(line):
            return [sentence.text for sentence in razdel.sentenize(line)]
    else:
        return []

    sentences = []
    for line in text.split('\n'):
        sentences.extend(split_line(line))
    return sentences


def sent_to_words(text, lang):
    """Split one sentence into word tokens.

    Uses WORD_TOKENIZATION_RULES for 'uk' and razdel.tokenize() for 'ru'.
    Returns None for any other language.
    """
    if lang == 'ru':
        return [token.text for token in razdel.tokenize(text)]
    if lang == 'uk':
        return WORD_TOKENIZATION_RULES.findall(text)
    return None


def tokenize(text, lang):
    """Return `text` as a list of sentences, each a list of word tokens.

    Sentence splitting is delegated to text_to_sent() and word splitting to
    sent_to_words(), both for the same `lang`.
    """
    return [list(sent_to_words(sentence, lang))
            for sentence in text_to_sent(text, lang)]

#### Names search

In [3]:
# News articles CSV; loaded into `news` with pd.read_csv.
news_filepath = '../data/april.csv'
# Tab-separated word/tone lookup tables, one per language.
ukr_tone_filepath = 'dicts/EmotionLookupTable_ukr.txt'
ru_tone_filepath = 'dicts/EmotionLookupTable_ru.txt'

# ';'-separated politician tables, one per language.
ukr_politician_dict_path = 'dicts/politicians_ukr.csv'
ru_politician_dict_path = 'dicts/politicians_ru.csv'

# Plain-text lists of first names, one name per line.
ukr_names_filepath = 'dicts/names_ukr.txt'
ru_names_filepath = 'dicts/names_ru.txt'

# JSON with 'ukr' / 'ru' sections, each holding 'negation' and 'intensifier'.
tone_changers_filepath = 'dicts/tone_changers.json'

In [4]:
# Stop-word lists per language; used as the default `ukr_stop` / `ru_stop`
# arguments of tone_by_text() and skipped by count_tone().
stop_words_ukr = get_stop_words('uk')
stop_words_ru = get_stop_words('ru')

In [2]:
# Load the articles and build `all_text`: title and body joined by a newline
# (missing parts become empty strings), with surrounding whitespace removed.
news = pd.read_csv(news_filepath)
news['all_text'] = news['title'].str.cat(news['text'], sep='\n', na_rep='').str.strip()
news.columns

In [6]:
# Politician tables (';'-separated), one per language.
politicians_dict_ukr = pd.read_csv(ukr_politician_dict_path, sep=';')
politicians_dict_ru = pd.read_csv(ru_politician_dict_path, sep=';')
# Word/tone lookup tables: tab-separated, no header row; columns are named
# `word` and `tone` here.
tone_dict_ukr = pd.read_csv(ukr_tone_filepath, sep='\t', header=None, names=['word', 'tone'])
tone_dict_ru = pd.read_csv(ru_tone_filepath, sep='\t', header=None, names=['word', 'tone'])
# Morphological analyzers; the tone code reads `.parse(token)[0].normal_form`.
morph_ukr = pymorphy2.MorphAnalyzer(lang='uk')
morph_ru = pymorphy2.MorphAnalyzer(lang='ru')


In [7]:
# Load negation / intensifier word lists for both languages.
# The file contains Cyrillic text, so the encoding is given explicitly instead
# of relying on the platform default, and the JSON is parsed straight from the
# file handle so `tone_changers` is never temporarily a raw string.
with open(tone_changers_filepath, encoding='utf-8') as f:
    tone_changers = json.load(f)

negations_ukr = tone_changers['ukr']['negation']
intensifiers_ukr = tone_changers['ukr']['intensifier']
negations_ru = tone_changers['ru']['negation']
intensifiers_ru = tone_changers['ru']['intensifier']

In [11]:
# Tokenize article text into sentences.
def get_text_in_sents(news):
    """Return the 'uk'/'ru' articles with `text` replaced by a sentence list.

    Rows with missing `text` or with a `language` other than 'uk' / 'ru' are
    dropped. The input frame is left untouched: the filtered rows are copied
    explicitly before the `text` column is overwritten, so the assignment does
    not write into a slice of the caller's frame.

    Args:
        news: DataFrame with at least `text` and `language` columns.

    Returns:
        A new DataFrame whose `text` column holds the output of text_to_sent().
    """
    supported = news.text.notna() & news.language.isin(['uk', 'ru'])
    news = news[supported].copy()
    news['text'] = news.apply(lambda row: text_to_sent(row.text, row.language), axis=1)
    return news

In [12]:
# Tokenize each article into sentences of word tokens.
def get_tokenized_sentences(news):
    """Add a `tokenized` column (output of tokenize()) to `news` and return it.

    The column is written into the frame that was passed in.
    """
    def _tokenize_row(row):
        return tokenize(row.all_text, row.language)

    news['tokenized'] = news.apply(_tokenize_row, axis=1)
    return news

In [13]:
%%time
# Split every article's `all_text` into a list of sentences. text_to_sent()
# returns an empty list for languages other than 'uk' and 'ru'.
news['sentenized'] = news.apply(lambda row: text_to_sent(row.all_text, row.language), axis=1)
# Word-level tokenization is done in a later cell; this call is kept disabled.
# news = get_tokenized_sentences(news)

CPU times: user 1min 16s, sys: 2.71 s, total: 1min 18s
Wall time: 1min 22s


In [14]:
def form_name_pattern(politician, ending, all_names):
    """Build the regex strings used to find one politician in text.

    The name pattern is an alternation of these variants:
      * name + surname
      * surname + name (+ optional patronymic)
      * (optional surname +) name + patronymic (+ optional surname)
      * surname + name (only when there is no patronymic)
      * surname alone

    The "wrong names" pattern matches the surname preceded by any first name
    from `all_names` other than the politician's own, i.e. a namesake.

    Args:
        politician: Row with `name`, `surname`, `patronim` (already carrying
            the ending pattern) and `shortname` (the bare first name).
        ending: Regex fragment appended to each wrong first name.
        all_names: Iterable of known first names.

    Returns:
        tuple[str, str]: ``(name_forms, wrong_names_forms)`` regex strings.
    """
    name = politician['name']
    surname = politician['surname']
    patronim = politician['patronim']

    names_forms = [rf"({name} {surname})"]
    if pd.notnull(patronim):
        names_forms.append(rf"({surname} {name}(?: {patronim})?)")
        names_forms.append(rf"((?:{surname} )?{name} {patronim}(?: {surname})?)")
    else:
        names_forms.append(rf"({surname} {name})")
    names_forms.append(rf"({surname})")

    # Skip empty entries (e.g. blank lines in the names file): an empty
    # alternative would make the first name optional, so any short word in
    # front of the surname would be counted as a namesake.
    wrong_names = '|'.join(n for n in all_names if n and n != politician['shortname'])
    if wrong_names:
        wrong_names_forms = rf"((?:{wrong_names}){ending} {surname})"
    else:
        # No other first names known: use a pattern that can never match.
        wrong_names_forms = r"(?!)"

    return '|'.join(names_forms), wrong_names_forms

In [15]:
# Regex suffixes appended to stemmed names so that inflected forms match:
# up to three letters from the language's set, followed by a word boundary.
ending_ukr = r"[аийуяіеоюєїьмфцвскпнртгшлдщзбжх]{,3}\b"
ending_ru = r"[аийуыяеоюёьэмфцвскпнртгшлдщзбжх]{,3}\b"


def add_endings(name_dict, ending, all_names):
    """Build the per-politician search patterns from a table of name stems.

    `ending` is appended to every non-null surname / name / patronim, then
    form_name_pattern() is applied to each row. The result keeps `fullname`,
    stores the bare first name as `shortname`, and adds the `name_forms` and
    `wrong_names` regex columns.
    """
    def _with_ending(stem):
        return stem + ending if pd.notnull(stem) else None

    forms = name_dict[['surname', 'name', 'patronim']].applymap(_with_ending)
    forms['fullname'] = name_dict.fullname
    forms['shortname'] = name_dict['name']

    pattern_pairs = forms.apply(form_name_pattern, ending=ending, all_names=all_names, axis=1)
    forms['name_forms'], forms['wrong_names'] = zip(*pattern_pairs)
    return forms

In [16]:
# First-name lists, one name per line. The files contain Cyrillic text, so the
# encoding is given explicitly instead of relying on the platform default.
# Blank lines are skipped: an empty name would become an empty alternative in
# the "wrong names" regex built by form_name_pattern().
with open(ukr_names_filepath, encoding='utf-8') as f:
    all_names_ukr = [line.strip() for line in f if line.strip()]

with open(ru_names_filepath, encoding='utf-8') as f:
    all_names_ru = [line.strip() for line in f if line.strip()]


name_forms_ukr = add_endings(politicians_dict_ukr, ending_ukr, all_names_ukr)
name_forms_ru = add_endings(politicians_dict_ru, ending_ru, all_names_ru)

In [17]:
# search for names in sentence and return result as a string
def name_search_in_sentence(sentence, name_forms_dict):
    """Count politician mentions in a single sentence.

    `name_forms_dict` must provide the `name_forms` and `wrong_names` regex
    columns and a `fullname` column.

    Returns 'fullname+count' entries joined by ';', or an empty string when no
    name pattern matches. The count is the number of `name_forms` matches
    minus the number of `wrong_names` matches.
    """
    # Number of matches of each politician's name pattern in this sentence.
    ments = name_forms_dict.name_forms.apply(lambda x: len(re.findall(x, sentence)))
    part = name_forms_dict[ments > 0]
    # Matches where the surname follows somebody else's first name.
    wrongs = part.wrong_names.apply(lambda x: len(re.findall(x, sentence)))
    # NOTE(review): `ments` covers every row while `wrongs` covers only the
    # matched rows, so the subtraction aligns on index; presumably this turns
    # the counts into floats (e.g. '1.0') whenever some row did not match -
    # confirm that downstream parsing expects this.
    return ';'.join(list(part.fullname.str.cat((ments - wrongs).astype(str), '+')))


# Join the per-sentence search results into one string for the whole article.
def name_search_in_text(row, name_forms_ukr = name_forms_ukr, name_froms_ru = name_forms_ru):
    """Return the article's mentions, one '§'-separated entry per sentence.

    Ukrainian rows are searched with `name_forms_ukr`; every other row uses
    `name_froms_ru`. Returns None when no sentence has any mention.
    """
    name_forms = name_forms_ukr if row.language == 'uk' else name_froms_ru
    per_sentence = [name_search_in_sentence(sentence, name_forms)
                    for sentence in row.sentenized]
    return '§'.join(per_sentence) if any(per_sentence) else None
            

In [18]:
# Per-article mention string (None when nobody is mentioned); see
# name_search_in_text() for the format.
news['mentions'] = news.apply(name_search_in_text, axis=1)

In [4]:
# Save the mentions separately (keyed by link and id) as a backup before the
# main CSV is rewritten.
news[['link', 'id','mentions']].to_csv('april_mentions.csv', index=False)

In [None]:
# Write the original columns plus `mentions` back to disk; helper columns
# such as `all_text` and `sentenized` are not saved.
# NOTE(review): the target is the same path as `news_filepath`, so this
# overwrites the input file the notebook was loaded from.
news[['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'id', 'domain_alias',
       'mycategory', 'mentions']].to_csv('../data/april.csv', index=False)

### Tonal words search

##### Ця частина вже зайва, бо емоційне забарвлення визначаємо окремою моделлю

In [31]:
# Adjust a token's tone when the preceding word negates or intensifies it.
def change_tone(tokens, ind, tone, morph, negations, intensifiers):
    """Return `tone` for tokens[ind], adjusted by the word(s) before it.

    * The sign is flipped when the previous token is in `negations`, or when
      the token two positions back is 'не' and the previous token's normal
      form is contained in the last element of `negations`.
    * Otherwise, if the previous token's normal form is in `intensifiers`,
      the tone moves one step away from zero (+1 if positive, else -1).
    * The first token of a sentence is returned unchanged.
    """
    if ind <= 0:
        return tone

    previous = tokens[ind - 1]

    negated = previous in negations
    if not negated and ind > 1 and tokens[ind - 2] == 'не':
        negated = morph.parse(previous)[0].normal_form in negations[-1]
    if negated:
        return tone * (-1)

    if morph.parse(previous)[0].normal_form in intensifiers:
        return tone + 1 if tone > 0 else tone - 1

    return tone


# Search for non-neutral words in a sentence given as a list of word tokens.
def count_tone(tokens, morph, tone_dict, stop_words, negations, intensifiers):
    """Collect the tonal words of one tokenized sentence.

    Tokens are lower-cased; tokens of one or two characters and stop words are
    skipped (except the word 'яд'). Each remaining token is reduced to its
    normal form and looked up in `tone_dict`; the tone found is then adjusted
    by change_tone() using the original token list.

    Args:
        tokens: Word tokens of one sentence.
        morph: Analyzer exposing ``parse(word)[0].normal_form``.
        tone_dict: DataFrame with `word` and `tone` columns.
        stop_words: Collection of words to ignore.
        negations, intensifiers: Passed through to change_tone().

    Returns:
        list[str]: ``'normal_form:tone'`` entries in sentence order.
    """
    tone_list = []
    for i, raw_tok in enumerate(tokens):
        tok = raw_tok.lower()
        if not ((len(tok) > 2 and tok not in stop_words) or tok == 'яд'):
            continue

        tok = morph.parse(tok)[0].normal_form
        matches = tone_dict.loc[tone_dict['word'] == tok, 'tone']
        if matches.empty:
            continue
        if len(matches) > 1:
            # Ambiguous dictionary entry: report it and skip the word. This
            # used to surface as a TypeError from int() on the whole Series.
            print(matches)
            continue

        try:
            # Take the single value explicitly; calling int() on a Series is
            # deprecated in recent pandas.
            tone = int(matches.iloc[0])
        except TypeError:
            print(matches)
            continue

        tone = change_tone(tokens, i, tone, morph, negations, intensifiers)
        tone_list.append(tok + ':' + str(tone))
    return tone_list

In [33]:
# Word-tokenize every sentence: `tokenized` is a list (sentences) of lists
# (word tokens) per article.
news['tokenized'] = news.apply(lambda row: [sent_to_words(sent, row.language) for sent in row.sentenized], 
                               axis=1)

In [34]:
# Save all non-neutral words of an article (with their tonality) as a string:
# tonal words within a sentence are joined by ';', sentences by '#'.
def tone_by_text(row, ukr_tone=tone_dict_ukr, ru_tone=tone_dict_ru, ukr_stop=stop_words_ukr, ru_stop=stop_words_ru):
    """Return the tonal words of `row.tokenized` as one encoded string.

    Ukrainian rows use the Ukrainian analyzer, dictionary, stop words and tone
    changers; every other row uses the Russian ones.
    """
    if row.language == 'uk':
        resources = (morph_ukr, ukr_tone, ukr_stop, negations_ukr, intensifiers_ukr)
    else:
        resources = (morph_ru, ru_tone, ru_stop, negations_ru, intensifiers_ru)

    per_sentence = [';'.join(count_tone(sent, *resources)) for sent in row.tokenized]
    return '#'.join(per_sentence)

In [35]:
# Apply a row function to a DataFrame in chunks and append results to a file.
def process_parts(news_df, func, out_file, start, stop, step):
    """Apply `func` row-wise to rows ``start`` .. ``stop - 1`` in chunks.

    Each chunk's result is appended to `out_file` as CSV (no header) as soon
    as it is computed, so partial progress survives an interruption. Because
    the file is opened in append mode, re-running adds the rows again.

    Args:
        news_df: DataFrame to process (rows are addressed by position).
        func: Callable applied to each row (``axis=1``).
        out_file: Path or buffer that receives the appended CSV rows.
        start: Position of the first row to process.
        stop: Position one past the last row to process.
        step: Chunk size (positive).

    Returns:
        The concatenated per-chunk results; an empty Series when the range
        contains no chunks.
    """
    res_list = []
    for chunk_start in range(start, stop, step):
        # Clamp the last chunk so that rows at or beyond `stop` are not processed.
        chunk_stop = min(chunk_start + step, stop)
        res_part = news_df.iloc[chunk_start:chunk_stop].apply(func, axis=1)
        res_part.to_csv(out_file, mode="a", header=False)
        res_list.append(res_part)

    if not res_list:
        # pd.concat() raises on an empty list; an empty range has no results.
        return pd.Series(dtype=object)
    return pd.concat(res_list)

In [36]:
%%time
# Encoded tonal words per article; see tone_by_text() for the format.
news['tone_by_sents'] = news.apply(tone_by_text, axis=1)

CPU times: user 1min 37s, sys: 1.14 s, total: 1min 38s
Wall time: 1min 43s


In [22]:
# Check which columns are present before choosing what to write back to disk.
news.columns

Index(['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'id', 'domain_alias',
       'mycategory', 'all_text', 'sentenized', 'mentions'],
      dtype='object')

In [23]:
# Write the original columns plus `mentions` back to disk; helper columns
# such as `all_text` and `sentenized` are not saved.
# NOTE(review): the target is the same path as `news_filepath`, so this
# overwrites the input file the notebook was loaded from.
news[['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'id', 'domain_alias',
       'mycategory', 'mentions']].to_csv('../data/april.csv', index=False)