In [1]:
import pandas as pd
import re
import dateparser
import datetime
import numpy as np
from collections import Counter
from translitua import translit

In [2]:
# Load the news dataset; the first CSV column is used as the DataFrame index.
news_filepath = '../data/may.csv'
news = pd.read_csv(news_filepath, index_col=[0])
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151053 entries, 2075213 to 2072557
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   title               151052 non-null  object 
 1   text                151037 non-null  object 
 2   subtitle            80390 non-null   object 
 3   link                151053 non-null  object 
 4   domain              151053 non-null  object 
 5   datetime            151053 non-null  object 
 6   views               61225 non-null   float64
 7   created_at          151053 non-null  object 
 8   category            114894 non-null  object 
 9   language            151053 non-null  object 
 10  domain_alias        151053 non-null  object 
 11  mycategory          151053 non-null  object 
 12  found_names_str     71364 non-null   object 
 13  politician_surname  151053 non-null  bool   
 14  mentions            33679 non-null   object 
 15  hash                151053 

In [11]:
# Number of articles per category, most frequent first.
news['mycategory'].value_counts()

Політика               19763
Інциденти              18957
Коронавірус            15138
Закордонна політика    14403
Економіка              14081
Шоу-бізнес             13213
Світ                   13173
Суспільство            10189
Спорт                  10159
Війна/Донбас            8988
Життя/lifestyle         8803
Технології              5968
Карантин                4894
Погода                  2410
Інше                     500
Name: mycategory, dtype: int64

In [117]:
# Per-politician tone string for each article, built row by row from the
# article's mentions and its per-sentence tone list.
news['ment_tone'] = news.apply(
    lambda row: get_sent_tone(row['mentions'], row['sent_list']),
    axis='columns',
)

In [18]:
# Per-politician summed word tone for each article, built row by row from the
# article's mentions and its serialized tone words.
news['tone_by_words'] = news.apply(
    lambda row: get_tone_sum(row['mentions'], row['tone_words']),
    axis='columns',
)

In [8]:
def get_text_tone(tone):
    """Return the summed tone score of one serialized tone-word string.

    Only the integer that directly follows a ``:`` is counted, i.e. the score
    part of a ``word:score`` entry (this is the layout the other tone helpers
    in this notebook parse). Digits that are part of the word itself, such as
    the ``19`` in ``covid-19:-2``, are therefore not added to the total.

    Parameters
    ----------
    tone : str or missing value
        Serialized tone words. Anything that is not a string (``None``, NaN)
        is treated as "no tone words".

    Returns
    -------
    int
        Sum of all scores; ``0`` when there are none.
    """
    if not isinstance(tone, str):
        return 0
    return sum(int(score) for score in re.findall(r':\s*(-?\d+)', tone))

# Total tone score of every article, derived from its `tone_words` string.
news['text_tone'] = news['tone_words'].map(get_text_tone)
# news['abs_tone'] = news.text_tone.abs()
# news['tone_words_num'] = news.tone_words.apply(lambda words: len(re.findall(r':', words)))

### Save to file

In [6]:
# For every politician, count the articles whose `mentions` field matches the name.
ment_num_dict = {}
for p in politicians:
    # `na=False`: articles without any mentions never match.
    ment_num_dict[p] = int(news.mentions.str.contains(p, na=False).sum())

In [7]:
# Re-order the counts so the most mentioned politicians come first.
ment_num_dict = dict(
    sorted(ment_num_dict.items(), key=lambda item: item[1], reverse=True)
)

In [8]:
# Maps the category labels stored in `mycategory` to the English slugs that
# are used in the exported file names.
cat_dict = {'Економіка': 'economics',
            'Суспільство': 'society',
            'Коронавірус': 'covid',
            'Інциденти': 'incidents',
            'Політика': 'politics',
            'Спорт': 'sport',
            'Життя/lifestyle': 'lifestyle',
            'Світ': 'world',
            'Погода': 'weather',
            'Технології': 'technologies',
            'Шоу-бізнес': 'showbiz',
            'Війна/Донбас': 'war',
            'Карантин': 'quarantine',
            'Закордонна політика': 'affairs',
            'Інше': 'other'}

# Five most frequent categories, skipping 'Шоу-бізнес' (the next category in
# the ranking takes its place). Filtering the ranking once avoids a second
# value_counts() pass and cannot fail when fewer than six categories exist.
top_categories = [
    c for c in news.mycategory.value_counts().index if c != 'Шоу-бізнес'
][:5]
print(top_categories)
# Regex alternation of the selected categories.
top_str = '|'.join(top_categories)

['Інциденти', 'Політика', 'Світ', 'Економіка', 'Закордонна політика']


In [9]:
# Column layout of the per-politician summary tables: two identifying columns
# followed by one "<domain alias>:<keyword>" column per site and sentiment class.
keywords = ['позитивні', 'нейтральні', 'негативні']
domains = list(news.domain_alias.unique())
cols = ['image_name', 'name']
for d in domains:
    for k in keywords:
        cols.append(d + ':' + k)

In [11]:
def get_empty_df(cols=None, top_names=None):
    """Build the summary table skeleton with one row per politician.

    Parameters
    ----------
    cols : list of str, optional
        Column names of the table; must contain ``'name'`` and
        ``'image_name'``. Defaults to the notebook-level ``cols``.
    top_names : list of str, optional
        Politician names, one row each. Every name must consist of at least
        two whitespace-separated words. Defaults to the notebook-level
        ``top_names``.

    Returns
    -------
    pandas.DataFrame
        Frame with ``name`` filled in, ``image_name`` set to
        ``<first letter of first word>_<second word>`` of the lower-cased
        transliterated name, and every other column left empty.
    """
    # Resolve the defaults at call time. Binding them in the signature made
    # the `def` itself fail with NameError when `top_names` had not been
    # created yet (it is only assigned by later cells).
    if cols is None:
        cols = globals()['cols']
    if top_names is None:
        top_names = globals()['top_names']

    df = pd.DataFrame(columns=cols)
    df['name'] = top_names
    for n in top_names:
        split_name = translit(n).lower().split()
        im_name = '_'.join([split_name[0][0], split_name[1]])
        df.loc[df['name'] == n, 'image_name'] = im_name
    return df

In [12]:
def get_news_with_2_ments(news_part, name):
    """Return the index labels of articles that mention `name` at least twice.

    Every occurrence of ``<name>+<number>`` in an article's ``mentions``
    string is extracted and the numbers are summed per article; an article
    qualifies when that sum is greater than 1.

    Parameters
    ----------
    news_part : pandas.DataFrame
        Articles to search; must have a string column ``mentions``.
    name : str
        Politician name. It is inserted into a regular expression as is.

    Returns
    -------
    pandas.Index
        Index labels (taken from `news_part`) of the qualifying articles.
    """
    mentioned = news_part[news_part.mentions.str.contains(name, na=False)]
    counts = mentioned.mentions.str.extractall(name + r'\+(?P<num>\d+)')
    counts['num'] = counts.num.astype(int)
    # extractall() indexes its rows by (article label, match number); grouping
    # on level 0 sums the matches of each article. This replaces the removed
    # DataFrame.sum(level=...) and keeps its first-seen ordering (sort=False).
    totals = counts.groupby(level=0, sort=False).sum()
    return totals[totals.num > 1].index

### Sentiment by news with 2 mentions

Зберігаємо у файл для сайту кількість негативних, позитивних і нейтральних новин про кожного політика на кожному сайті; враховуємо лише новини, у яких політика згадано принаймні двічі.

In [15]:
month = '05'
sentiment_column = 'sentiment'
# Only articles from the selected top categories are analysed.
top_news = news[news.mycategory.isin(top_categories)]
ment_num_dict = {}
ind_dict = {}

# For every politician: index labels of the articles that mention them at
# least twice, and how many such articles there are.
for p in politicians:
    ind_dict[p] = get_news_with_2_ments(top_news, p)
    ment_num_dict[p] = len(ind_dict[p])

# Rank once, after all politicians are counted (previously this ran inside the
# loop above and rebuilt the table on every iteration), and keep the ten most
# mentioned.
ment_num_dict = {k:v for k,v in sorted(ment_num_dict.items(), key=lambda item: item[1], reverse=True)}
top_names = list(ment_num_dict.keys())[:10]
df = get_empty_df(cols, top_names)

for n in top_names:
    for d in domains:
        # Sentiment values of this politician's qualifying articles on one site.
        tones = top_news[(top_news.index.isin(ind_dict[n]))&(top_news.domain_alias==d)][sentiment_column]

        # Positive: > 1, neutral: between -1 and 1 inclusive, negative: < -1.
        df.loc[df['name']==n, ':'.join([d,keywords[0]])] = (tones>1).sum()
        df.loc[df['name']==n, ':'.join([d,keywords[1]])] = ((tones<=1)&(tones>=-1)).sum()
        df.loc[df['name']==n, ':'.join([d,keywords[2]])] = (tones<-1).sum()

# Write the finished table a single time instead of once per politician.
df.to_csv('datasets/{0}/politicians_all_{0}_2021.csv'.format(month), index=False, sep=';')

#### Те саме, тільки окремо для кожної з топ-5 категорій

In [25]:
# Same table as the all-categories export, but built and saved separately for
# each of the selected categories.
for cat in top_categories:
    part = news[news.mycategory == cat]
    ind_dict = {}
    ment_num_dict = {}

    # Articles (by index label) with at least two mentions, per politician.
    for p in politicians:
        ind_dict[p] = get_news_with_2_ments(part, p)
        ment_num_dict[p] = len(ind_dict[p])

    # Ten politicians with the most such articles in this category.
    ment_num_dict = dict(
        sorted(ment_num_dict.items(), key=lambda item: item[1], reverse=True)
    )
    top_names = list(ment_num_dict)[:10]
    df = get_empty_df(cols, top_names)

    for n in top_names:
        for d in domains:
            tones = part.loc[
                part.index.isin(ind_dict[n]) & (part.domain_alias == d),
                sentiment_column,
            ]

            # Positive: > 1, neutral: between -1 and 1 inclusive, negative: < -1.
            df.loc[df['name'] == n, d + ':' + keywords[0]] = (tones > 1).sum()
            df.loc[df['name'] == n, d + ':' + keywords[1]] = tones.between(-1, 1).sum()
            df.loc[df['name'] == n, d + ':' + keywords[2]] = (tones < -1).sum()
#     df['description'] = None
#     df['description'].iloc[0] = description
    df.to_csv('datasets/{0}/politicians_{1}_{0}_2021.csv'.format(month, cat_dict[cat]), index=False, sep=';')

#### Sentiment by sentences-mentions

In [278]:
top_news = news[news.mycategory.isin(top_categories)]

# Start from a clean dict so the ranking does not depend on what an earlier
# cell left in `ment_num_dict`.
ment_num_dict = {}
for p in politicians:
    ment_num_dict[p] = len(top_news[top_news.mentions.str.contains(p, na=False)])

# Rank once, after all politicians are counted (previously this ran inside the
# loop above), and keep the ten most mentioned.
ment_num_dict = {k:v for k,v in sorted(ment_num_dict.items(), key=lambda item: item[1], reverse=True)}
top_names = list(ment_num_dict.keys())[:10]
df = get_empty_df(cols, top_names)

for n in top_names:
    for d in domains:
        # Tone recorded for this politician in `ment_tone`, one value per
        # article of the site; NaN where the politician is not present.
        tones = top_news[(top_news.domain_alias==d)].ment_tone.str.extract(rf"(?:{n}\:)(-?[\d\.]+)", expand=False).astype(float)

        # Column names are joined with ':' exactly as in `cols`; joining with
        # '; ' left the prepared columns empty and appended duplicate ones.
        df.loc[df['name']==n, ':'.join([d,keywords[0]])] = (tones>0).sum()
        df.loc[df['name']==n, ':'.join([d,keywords[1]])] = (tones==0).sum()
        df.loc[df['name']==n, ':'.join([d,keywords[2]])] = (tones<0).sum()

# Write the finished table a single time instead of once per politician.
# NOTE(review): the file name hard-codes month 03 - confirm it is still the intended target.
df.to_csv('politicians_all_03_2021.csv', index=False, sep=';')

In [279]:
# Sentence-level tone table, built and saved separately for each selected category.
for cat in top_categories:
    ment_num_dict = {}
    part = news[news.mycategory==cat]

    # Number of articles in this category that mention each politician.
    for p in politicians:
        ment_num_dict[p] = len(part[part.mentions.str.contains(p, na=False)])
    ment_num_dict = {k:v for k,v in sorted(ment_num_dict.items(), key=lambda item: item[1], reverse=True)}
    top_names = list(ment_num_dict.keys())[:10]
    df = get_empty_df(cols, top_names)
    for n in top_names:
        for d in domains:
            # Tone recorded for this politician in `ment_tone`, one value per
            # article of the site; NaN where the politician is not present.
            tones = part[(part.domain_alias==d)].ment_tone.str.extract(rf"(?:{n}\:)(-?[\d\.]+)", expand=False).astype(float)

            # Column names are joined with ':' exactly as in `cols`; joining
            # with '; ' left the prepared columns empty and appended duplicates.
            df.loc[df['name']==n, ':'.join([d,keywords[0]])] = (tones>0).sum()
            df.loc[df['name']==n, ':'.join([d,keywords[1]])] = (tones==0).sum()
            df.loc[df['name']==n, ':'.join([d,keywords[2]])] = (tones<0).sum()

    # NOTE(review): the file name hard-codes month 03 - confirm it is still the intended target.
    df.to_csv('politicians_'+cat_dict[cat]+'_03_2021.csv', index=False, sep=';')

### Get sentiment of politicians mentions
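#### Old version - ignore (note: `get_sent_tone` and `get_tone_sum` defined below are still called by the `ment_tone` and `tone_by_words` cells earlier in this notebook, so this cell has to be run before them)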

In [None]:
def dict_to_string(tone_dict):
    """Serialize a ``{name: tone}`` mapping as ``name:tone#name:tone#...``.

    Keys must be strings; values are converted with ``str``. An empty
    mapping yields an empty string.
    """
    return '#'.join(
        ':'.join((name, str(value))) for name, value in tone_dict.items()
    )
    
def get_tone_sum(ment, tone):
    """Sum word-level tone per mentioned name over the sentences of an article.

    Parameters
    ----------
    ment : str or missing value
        Mentions of one article, one chunk per sentence, separated by ``§``.
    tone : str or missing value
        Tone words of the same sentences, one chunk per sentence separated by
        ``#``; inside a chunk the entries are separated by ``;`` and the
        integer score is whatever follows the last ``:`` of an entry.

    Returns
    -------
    str or None
        ``None`` when `ment` is missing; otherwise the per-name totals
        serialized by ``dict_to_string``. Every two-word match of
        ``\\w+ \\w+`` in a sentence chunk is treated as a name and receives
        the summed score of that sentence.
    """
    if pd.isnull(ment):
        return None

    sentences = ment.split('§')
    # A missing tone string means that no sentence has scored words.
    tone_chunks = tone.split('#') if isinstance(tone, str) else []

    tone_dict = {}
    for pos, sentence in enumerate(sentences):
        if not sentence:
            continue
        # Sentences without a matching tone chunk count as neutral instead of
        # raising IndexError (same fallback get_sent_tone applies).
        chunk = tone_chunks[pos] if pos < len(tone_chunks) else ''
        if chunk:
            tone_num = sum(int(t.split(':')[-1]) for t in chunk.split(';'))
        else:
            tone_num = 0
        for n in re.findall(r"\w+ \w+", sentence):
            if n in tone_dict:
                tone_dict[n] += tone_num
            else:
                tone_dict[n] = tone_num
    return dict_to_string(tone_dict)

def get_sent_tone(ment, tone):
    """Sum sentence-level tone per mentioned name over the sentences of an article.

    Parameters
    ----------
    ment : str or missing value
        Mentions of one article, one chunk per sentence, separated by ``§``.
    tone : str or missing value
        Tone of the same sentences as numbers separated by ``;``.

    Returns
    -------
    str or None
        ``None`` when `ment` is missing; otherwise the per-name totals
        serialized by ``dict_to_string``. Every two-word match of
        ``\\w+ \\w+`` in a sentence chunk is treated as a name and receives
        that sentence's tone. A sentence whose tone is absent, empty or not a
        number contributes ``0.0``.
    """
    if pd.isnull(ment):
        return None

    sentences = ment.split('§')
    # A missing tone string means every sentence is neutral.
    tone_values = tone.split(';') if isinstance(tone, str) else []

    tone_dict = {}
    for pos, sentence in enumerate(sentences):
        if not sentence:
            continue
        try:
            tone_num = float(tone_values[pos]) if tone_values[pos] else 0.0
        except (IndexError, ValueError):
            # Fewer tone values than sentences, or an unparsable value.
            tone_num = 0.0
        for n in re.findall(r"\w+ \w+", sentence):
            if n in tone_dict:
                tone_dict[n] += tone_num
            else:
                tone_dict[n] = tone_num
    return dict_to_string(tone_dict)

#### Get most frequent emotional words

In [181]:
def get_top_tone_words(name, month, tone, domain=''):
    """Count the tone-word entries of selected articles, most frequent first.

    Articles are taken from the notebook-level ``news`` frame where column
    `name` is greater than 1 and column ``<name>_tone`` is below `tone`.
    With a non-empty `domain` the articles are additionally restricted to
    that ``domain``; otherwise they are restricted to index month `month`.

    Returns a dict mapping each matched ``word:score`` entry to its number of
    occurrences, ordered by descending count.
    """
    # NOTE(review): `month` is ignored when `domain` is given - confirm this is intended.
    if domain:
        selected = news[(news.domain==domain)&(news[name]>1)&(news[name+'_tone']<tone)]
    else:
        selected = news[(news[name]>1)&(news.index.month==month)&(news[name+'_tone']<tone)]

    # NOTE(review): the pattern keeps only the first digit of a score - verify
    # that scores are always single-digit.
    found_per_article = selected.tone_words.str.findall(r'[\w\'\-]+\:\-?\d', flags=re.I)

    counted = Counter()
    for found in found_per_article:
        counted.update(found)
    return dict(counted.most_common())

