In [1]:
import pandas as pd
import re
import unicodedata
import html2text
# from sqlalchemy import create_engine
from langdetect import detect
import dateparser
from news_cleaning_config import alias, strip_patterns, pub_type_patterns

In [None]:
# Path of the news CSV: it is read at the start of the notebook and
# overwritten by the `news.to_csv(news_filepath)` cells further down.
news_filepath = 'data/summer.csv'

In [None]:
# First-time load of the raw export: parse the CSV with explicit quoting
# rules and use the `id` column as the index.
news = pd.read_csv(
    news_filepath,
    sep=',',
    quotechar='"',
    escapechar="\\",
).set_index('id')

In [2]:
# Reload the previously saved CSV (its first column is the index) and make
# `datetime` timezone-aware in the Europe/Kiev zone.
news = pd.read_csv(news_filepath, index_col=0)
news['datetime'] = (
    pd.to_datetime(news['datetime'], utc=True)
    .dt.tz_convert('Europe/Kiev')
)
news.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 477259 entries, 31443 to 33302
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   title         477254 non-null  object 
 1   text          477222 non-null  object 
 2   subtitle      245392 non-null  object 
 3   link          477259 non-null  object 
 4   domain        477259 non-null  object 
 5   datetime      477259 non-null  object 
 6   views         197102 non-null  float64
 7   created_at    477259 non-null  object 
 8   category      377096 non-null  object 
 9   language      477259 non-null  object 
 10  pub_type      477259 non-null  object 
 11  author        329206 non-null  object 
 12  tags          387336 non-null  object 
 13  source        59893 non-null   object 
 14  author_title  27698 non-null   object 
 15  domain_alias  477259 non-null  object 
dtypes: float64(1), object(15)
memory usage: 61.9+ MB


In [None]:
# Look up the configured alias of every domain (raises KeyError for a domain
# missing from `alias`), then keep only the first row for each link.
news['domain_alias'] = [alias[domain] for domain in news['domain']]
news = news.drop_duplicates(subset='link')

### Update espreso dates

In [None]:
# news['datetime'] = pd.to_datetime(news.datetime, utc=True).dt.tz_convert('Europe/Kiev')
# Recover espreso.tv publication times from the date line that opens the article text.
is_espreso = news.domain == 'espreso.tv'
# expand=False always yields a Series; .squeeze() would collapse a single
# matching row to a scalar and break the .apply() below.
date_string = news.loc[is_espreso, 'text'].str.extract(
    r'(^\d{1,2} \w+, 20\d{2}, \d{2}\:\d{2})', expand=False)
# Texts without a leading date give NaN, which dateparser.parse() does not accept.
tmstmp = pd.to_datetime(date_string.apply(
    lambda x: dateparser.parse(x, date_formats=['%d %B, %Y, %H:%M'], languages=['uk'])
    if pd.notna(x) else pd.NaT))
# Wall-clock times that are ambiguous or non-existent around DST switches
# become NaT instead of raising.
tmstmp = tmstmp.dt.tz_localize('Europe/Kiev', ambiguous='NaT', nonexistent='NaT')
# Overwrite only the rows with a successfully parsed date, so that a failed
# parse never replaces an existing timestamp with NaT.
has_parsed_date = is_espreso & news.index.isin(tmstmp.dropna().index)
news['datetime'] = news.datetime.mask(has_parsed_date, tmstmp)

#### Reading from database (old option)

In [84]:
def read_table(user='postgres', password='pgpass', db_name='media_ecosystem', table_name='december', chunksize=5000):
    """Read a table from the PostgreSQL server on localhost into pandas.

    Args:
        user: database user name.
        password: database password.
            NOTE(review): a real-looking default password is hardcoded in the
            signature; prefer passing it in from the environment.
        db_name: name of the database to connect to.
        table_name: name of the table to read.
        chunksize: rows per chunk; any falsy value reads the whole table at once.

    Returns:
        An iterator of DataFrames when `chunksize` is truthy, otherwise a
        single DataFrame.
    """
    # Imported locally because the notebook's top-level sqlalchemy import is
    # commented out, which left `create_engine` undefined on a fresh kernel.
    from urllib.parse import quote_plus
    from sqlalchemy import create_engine

    # Credentials travel in the URL query string, so they must be URL-encoded;
    # otherwise characters such as '&', '#' or '%' corrupt the URL.
    db_url = 'postgresql://localhost/{}?user={}&password={}'.format(
        db_name, quote_plus(user), quote_plus(password))
    sql_engine = create_engine(db_url, echo=False)
    # The connection is intentionally left open: with a chunk size the rows
    # are fetched lazily while the caller iterates over the result.
    conn = sql_engine.connect()
    return pd.read_sql_table(table_name, conn, chunksize=chunksize or None)

    
def get_news(table_chunks, news_file):
    """Post-process table chunks, merge them and save the result as CSV.

    For every chunk the `datetime` column is converted to the Europe/Kiev
    time zone; non-empty chunks get their non-null `text` values passed
    through the text maker's `handle()` and are collected. The running chunk
    number is printed after each chunk.

    NOTE(review): `get_text_maker` is not defined in the visible part of the
    notebook - confirm where it comes from.

    Args:
        table_chunks: iterable of DataFrames with `datetime` and `text` columns.
        news_file: path of the CSV file to write (without the index).

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index.
    """
    text_maker = get_text_maker()

    def to_plain_text(raw):
        return text_maker.handle(raw) if pd.notnull(raw) else None

    df_parts = []
    for k, chunk in enumerate(table_chunks):
        chunk['datetime'] = pd.to_datetime(chunk['datetime'], utc=True).dt.tz_convert('Europe/Kiev')
        if not chunk.empty:
            chunk['text'] = chunk['text'].apply(to_plain_text)
            df_parts.append(chunk)
        print(k)
    df = pd.concat(df_parts).reset_index(drop=True)
    df.to_csv(news_file, index=False)
    return df

#### Removing duplicates by link (same articles but in different languages)

In [8]:
# df = df.drop_duplicates('link')

# Each tuple is (regex capturing the numeric article id from a link,
#                link fragment marking the language version to keep).
# Raw strings keep the regex escapes (\w, \d, \-) from being treated as
# invalid string escape sequences; the resulting values are unchanged.
focus_link_pattern = (r'https://focus.ua/(?:uk/)?[\w\-]+/(\d+)\-.*', '/uk/')
censor_link_pattern = (r'https://censor.net/(?:ua|ru)/\w+/(\d+)/.*', '/ua/')

def remove_duplicates(df, link_pattern_tuple):
    """Drop the extra language versions of articles published under several links.

    The article id is extracted from `df.link` with the regex in
    `link_pattern_tuple[0]`. Rows whose id occurs more than once are dropped
    unless their link contains `link_pattern_tuple[1]`; rows with a unique id,
    or with no extractable id, are always kept.

    Args:
        df: DataFrame with a `link` column. It is not modified.
        link_pattern_tuple: `(id_regex, preferred_link_fragment)`.

    Returns:
        A filtered DataFrame with the same columns as `df`.
    """
    id_pattern, preferred_fragment = link_pattern_tuple
    # Keep the ids in a standalone Series instead of a temporary column, so
    # the caller's frame is left untouched.
    article_id = df.link.str.extract(id_pattern, expand=False)
    is_duplicate = article_id.duplicated(keep=False) & article_id.notna()
    # na=False keeps the mask boolean even if a link is missing.
    is_preferred = df.link.str.contains(preferred_fragment, na=False)
    return df[~is_duplicate | is_preferred]

In [9]:
# Report the frame size before and after dropping the duplicated language
# versions on focus.ua and censor.net.
print(news.shape)
for link_pattern in (focus_link_pattern, censor_link_pattern):
    news = remove_duplicates(news, link_pattern)
print(news.shape)

(494687, 16)
(477310, 16)


#### Get rid of html

In [15]:
def get_text_from_html(text):
    """Convert an HTML fragment to plain text with html2text.

    Links, images and emphasis markers are dropped, output lines are not
    wrapped (`body_width = 0`) and single line breaks are used.

    Args:
        text: HTML string, or a missing value (None / NaN).

    Returns:
        The converted text; a missing value is returned unchanged.
    """
    # The `text` column contains nulls, which HTML2Text.handle() cannot process.
    if pd.isnull(text):
        return text
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    text_maker.unicode_snob = True
    text_maker.body_width = 0
    text_maker.single_line_break = True
    text_maker.ignore_emphasis = True
    return text_maker.handle(text)

In [16]:
%%time
# Convert HTML to plain text only for the rows from position 200000 onwards.
# NOTE(review): presumably the first 200000 rows were already converted in an
# earlier run - confirm before re-running on fresh data.
part = news.iloc[200000:].text.apply(get_text_from_html)

CPU times: user 13min 52s, sys: 13.1 s, total: 14min 5s
Wall time: 14min 14s


In [20]:
# Write the converted texts back by index label. Assigning through .loc
# avoids the chained in-place `news.text.update(...)`, which does not modify
# `news` when pandas Copy-on-Write is active. Null results are skipped,
# exactly as Series.update() skipped them.
converted = part.dropna()
news.loc[converted.index, 'text'] = converted
del part, converted

Index(['id', 'title', 'text', 'subtitle', 'link', 'domain', 'datetime',
       'views', 'created_at', 'category', 'language', 'pub_type', 'author',
       'tags', 'source', 'author_title', 'domain_alias'],
      dtype='object')

In [26]:
# Persist the frame (index included) back to the same CSV it was read from.
news.to_csv(news_filepath)

#### Clean up wrong characters, newlines and latin/cyrillic character mix

In [4]:
# Latin (and digit) look-alikes mapped to the Cyrillic letter they are
# replaced with when found inside Cyrillic words.
lat_to_cyr_map = {
    'a': 'а',
    'c': 'с',
    'e': 'е',
    'i': 'і',
    'l': 'І',
    'o': 'о',
    'u': 'и',
    'p': 'р',
    'n': 'п',
    'm': 'т',
    'x': 'х',
    'y': 'у',
    'k': 'к',
    'b': 'ь',
    'r': 'г',
    'A': 'А',
    'B': 'В',
    'C': 'С',
    'E': 'Е',
    'H': 'Н',
    'I': 'І',
    'K': 'К',
    'M': 'М',
    'O': 'О',
    'P': 'Р',
    'T': 'Т',
    'X': 'Х',
    'Y': 'У',
    "á": "а́",
    "Á": "А́",
    "é": "е́",
    "É": "Е́",
    "í": "і́",
    "Í": "І́",
    "ḯ": "ї́",
    "Ḯ": "Ї́",
    "ó": "о́",
    "Ó": "О́",
    "ú": "и́",
    "ý": "у́",
    "Ý": "У́",
    "0": "о"
}

# Reverse mapping: Cyrillic look-alike -> Latin letter.
# Several Latin keys share one Cyrillic value ('o' and '0'; 'l' and 'I'), so a
# plain inversion lets the last key win and turned Cyrillic 'о' inside Latin
# words into the digit '0'. Digits and case-mismatched keys are therefore not
# used as reverse targets.
cyr_to_lat_map = {}
for k, v in lat_to_cyr_map.items():
    if k.isdigit() or k.isupper() != v.isupper():
        continue
    cyr_to_lat_map[v] = k

# Characters that clean() rewrites to a plain apostrophe (') when they stand
# between a letter from APOSTROPHY_PREFIX and a letter from APOSTROPHY_SUFFIX.
# NOTE(review): '?' and '*' are in the list too - presumably left-overs of
# broken encodings; the right single quote appears twice, which is harmless
# inside a regex character class.
APOSTROPHY_LIKE = ('”',
                   '‟',
                   '"',
                   '‘',
                   '′',
                   '\u0313',
                   '΄',
                   '’',
                   '´',
                   '`',
                   '’',
                   '?',
                   '*',
                   )
# Letters allowed directly before the apostrophe.
APOSTROPHY_PREFIX = 'бвгґдзкмнпрстфхш'
# Letters allowed directly after the apostrophe.
APOSTROPHY_SUFFIX = 'єїюя'


def remove_part_from_text(title, text):
    """Remove every literal occurrence of `title` from `text`.

    Args:
        title: the fragment to remove (a title or subtitle); may be missing.
        text: the text to clean; may be missing.

    Returns:
        `text` without the fragment. `text` is returned unchanged when either
        argument is missing or is not a string.
    """
    # Plain substring removal gives the same result as
    # re.sub(re.escape(title), '', text), without the regex machinery or the
    # bare `except` that hid any failure.
    if isinstance(title, str) and isinstance(text, str):
        text = text.replace(title, '')
    return text


def remove_newlines(text):
    """Collapse redundant line breaks and spaces in `text`.

    In order: a line holding a single space is removed, a carriage return
    (with an optional following dot) becomes a space, runs of newlines become
    one newline, runs of spaces become one space, and the result is stripped.

    Args:
        text: the string to tidy up, or a missing value.

    Returns:
        The tidied string; a missing value is returned unchanged.
    """
    if pd.isna(text):
        return text
    substitutions = (
        (r"\n \n", "\n"),
        (r"\r\.?", " "),
        (r"\n{2,}", "\n"),
        (r" {2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
    
    
def clean(text):
    """Normalize one text value: Unicode form, newlines, apostrophes, script mix.

    Steps, in order: NFKC normalization; literal backslash-n sequences turned
    into real newlines; remove_newlines(); apostrophe-like characters between
    Ukrainian letters replaced with a plain apostrophe; a space inserted after
    sentence-ending punctuation that is directly followed by a capital letter;
    Latin/Cyrillic look-alike characters swapped so that a word uses a single
    script. The order of the substitutions matters because each one works on
    the output of the previous one.

    Args:
        text: the string to clean, or a missing value.

    Returns:
        The cleaned string; a missing value is returned unchanged.
    """
    if pd.notnull(text):
        text = unicodedata.normalize("NFKC", text)
        # turn the two-character sequence backslash + 'n' into a real newline
        text = re.sub(r"\\n", "\n", text)
        text = remove_newlines(text)

        # change strange apostrophe to '
        text = re.sub(r"([{prefix}])[{apostrophy}]([{suffix}])".format(
            prefix=APOSTROPHY_PREFIX, apostrophy=''.join(APOSTROPHY_LIKE), suffix=APOSTROPHY_SUFFIX),
            r"\1'\2", text, flags=re.IGNORECASE)
        # NOTE(review): this matches "&#39" plus optional whitespace but not the
        # ';' of a complete "&#39;" entity, and unlike the substitution above it
        # is case-sensitive - confirm that this is intended.
        text = re.sub(r"([{prefix}])&#39\s?([{suffix}])".format(
            prefix=APOSTROPHY_PREFIX, suffix=APOSTROPHY_SUFFIX), r"\1'\2", text)

        # add space between sentences if needed (with workaround for Цензор.НЕТ and ZN.UA)
        # Group 2 sits inside the negative lookahead and never takes part in a
        # match, so "\2" in the replacement expands to an empty string.
        text = re.sub(r"([\.\?\!])(?!(NET|UA|НЕТ))([А-ЯІЇЄҐA-Z])", r"\1 \2\3", text)

        # clean up latin/cyrillic character mix
        # cases:
        # - latin symbols that look like cyrillic in ukrainian words
        # - cyrillic symbols that look like latin in english words

        # Latin look-alike right after an unambiguously Cyrillic letter -> Cyrillic
        text = re.sub(
            r"([бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ]['’ʼ]?)([aceiopxyunmkbr0ABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])",
            lambda x: x.group(1) + lat_to_cyr_map[x.group(2)], text)

        # Latin look-alike right before an unambiguously Cyrillic letter -> Cyrillic
        text = re.sub(
            r"([aceiopxyaceiopxyunmkbr0ABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ])",
            lambda x: lat_to_cyr_map[x.group(1)] + x.group(2), text)

        # Cyrillic look-alike right after an unambiguously Latin letter -> Latin
        text = re.sub(r"([bdfghjklmnrstuvwzDFGJLNQRSUVWZ]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])",
                      lambda x: x.group(1) + cyr_to_lat_map[x.group(2)], text)

        # Cyrillic look-alike right before an unambiguously Latin letter -> Latin
        text = re.sub(r"([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[bdfghjklmnrstuvwzDFGJLNQRSUVWZ])",
                      lambda x: cyr_to_lat_map[x.group(1)] + x.group(2), text)

        # Latin look-alike surrounded by Cyrillic letters on both sides -> Cyrillic
        text = re.sub(r"([а-яіїєґА-ЯІЇЄҐ]['’ʼ]?)([aceiopxyunmkbr0ABCEHIKMHOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[а-яіїєґА-ЯІЇЄҐ])",
                      lambda x: x.group(1) + lat_to_cyr_map[x.group(2)] + x.group(3), text)

        # Cyrillic look-alike surrounded by Latin letters on both sides -> Latin
        text = re.sub(r"([a-zA-Z]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[a-zA-Z])",
                      lambda x: x.group(1) + cyr_to_lat_map[x.group(2)] + x.group(3), text)

        # stand-alone one-letter Latin word between Cyrillic words -> Cyrillic
        text = re.sub(r"([а-яіїєґ]\W{0,2} )([ayico])( [А-ЯІЇЄҐа-яіїєґ])",
                      lambda x: x.group(1) + lat_to_cyr_map[x.group(2)] + x.group(3), text)

        # Latin look-alike followed by a space and a Cyrillic letter -> Cyrillic
        text = re.sub(r"([AIYBKOl])( [А-ЯІЇЄҐа-яіїєґ])",
                      lambda x: lat_to_cyr_map[x.group(1)] + x.group(2), text)

    return text


def text_cleaning(df):
    """Clean the `text`, `title` and `subtitle` columns of `df` in place.

    Every value goes through clean(); newlines in titles and subtitles become
    spaces; a leading "digits:digits" prefix is removed from the titles of
    rows whose domain is 'https://www.rbc.ua'; finally the title and then the
    subtitle are removed from the article text.

    Args:
        df: DataFrame with `text`, `title`, `subtitle` and `domain` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column in ('text', 'title', 'subtitle'):
        df[column] = df[column].apply(clean)

    # A literal newline with regex=False behaves the same on every pandas
    # version. The former r'\n' only meant "newline" while str.replace
    # defaulted to regex=True (before pandas 2.0).
    for column in ('title', 'subtitle'):
        df[column] = df[column].str.replace('\n', ' ', regex=False)

    # Assign through .loc instead of the chained `df.title.update(...)`, which
    # does not modify `df` when pandas Copy-on-Write is active.
    is_rbc = df.domain == 'https://www.rbc.ua'
    df.loc[is_rbc, 'title'] = df.loc[is_rbc, 'title'].str.replace(r'^\d+\:\d+', '', regex=True)

    # Remove the title first, then the subtitle, from the article text.
    for column in ('title', 'subtitle'):
        df['text'] = [remove_part_from_text(part, text)
                      for part, text in zip(df[column], df['text'])]
    return df

   

In [25]:
# Clean text, title and subtitle; text_cleaning() returns the frame it was given.
news = text_cleaning(news)



#### Print some news samples to spot redundant text patterns (or save a sample to a file)

In [2]:
# Print one randomly chosen article (title, subtitle, text) for every domain.
for domain_name in news.domain.unique():
    print(domain_name, '\n')
    sampled_row = news[news.domain == domain_name].sample(1).iloc[0]
    print('title:', sampled_row['title'])
    print('subtitle:', sampled_row['subtitle'])
    print('text:', sampled_row['text'], '\n')

In [9]:
def get_samples(news, domains = [], sample_size = 40, month=0, lang = '', output_file = '../data/new_samples.csv'):
    """Draw a random sample of articles per domain and save it to a CSV file.

    Args:
        news: DataFrame with `domain`, `datetime` and `language` columns.
        domains: domains to sample from.
        sample_size: maximum number of rows per domain; a domain with fewer
            matching rows contributes all of them.
        month: calendar month (1-12) to restrict the sample to; 0 means no
            restriction.
        lang: language code to restrict the sample to; '' means no restriction.
        output_file: path of the CSV file to write (without the index).

    Returns:
        DataFrame with the sampled rows; empty (same columns as `news`) when
        nothing matched.
    """
    selection = pd.Series(True, index=news.index)
    if lang:
        # The column is called `language`; the former `news.lang` lookup
        # raised AttributeError as soon as a language was requested.
        selection = selection & (news.language == lang)
    if month:
        # month=0 used to be compared with the real month and so excluded
        # every row; it now means "all months", like the empty `lang`.
        selection = selection & (news.datetime.dt.month == month)

    news_samples = []
    for d in domains:
        candidates = news[selection & (news.domain == d)]
        if candidates.empty:
            continue
        # Cap the sample at the available rows, instead of relying on a bare
        # `except` that silently skipped domains with too few articles.
        news_samples.append(candidates.sample(min(sample_size, len(candidates))))

    # pd.concat() raises on an empty list, so fall back to an empty slice.
    news_samples = pd.concat(news_samples) if news_samples else news.iloc[0:0]
    news_samples.to_csv(output_file, index=False)
    return news_samples

#### Strip redundant text from news

In [5]:
def mystrip(df):
    """Strip the configured redundant fragments from `df.text`, in place.

    For every `(domain, patterns)` entry of `strip_patterns`, the rows whose
    `domain` matches the key (used as a regular expression) get each pattern
    removed from their text. Every pattern entry is indexed as
    `(regex, flags)`. Afterwards remove_newlines() is applied to all texts.
    Each domain key is printed as it is processed.

    Args:
        df: DataFrame with `domain` and `text` columns; modified in place.

    Returns:
        None.
    """
    for domain, patterns in strip_patterns.items():
        print(domain)
        # na=False keeps the mask boolean even if a domain value is missing.
        domain_mask = df.domain.str.contains(domain, na=False)
        for part_to_strip in patterns:
            current = df.loc[domain_mask, 'text']
            # regex=True is explicit: since pandas 2.0 str.replace defaults to
            # literal matching, which would treat the patterns as plain text.
            stripped = current.str.replace(
                part_to_strip[0], "", flags=part_to_strip[1], regex=True)
            # Keep the original value wherever the replacement produced a
            # null, mirroring Series.update(), and assign through .loc rather
            # than the chained `df.text.update(...)`.
            df.loc[domain_mask, 'text'] = stripped.where(stripped.notna(), current)
    df['text'] = df.text.apply(remove_newlines)


In [6]:
# Strip the per-domain patterns from news.text in place; the function prints
# every domain key it processes and returns nothing.
mystrip(news)




  


apostrophe.ua
https://www.rbc.ua
https://www.unian.ua
ukrinform.ua
https://censor.net.ua
www.unn.com.ua
https://politeka.net/uk
www.radiosvoboda.org
https://dt.ua/
https://hromadske.ua/
https://www.obozrevatel.com
https://www.segodnya.ua/ua
focus.ua
112.ua
https://www.liga.net
espreso.tv
strana.ua
tsn.ua
https://hromadske.radio
znaj.ua
fakty.com.ua
https://www.epravda.com.ua
https://www.pravda.com.ua
https://ukr.lb.ua
https://ukranews.com
zik.ua
https://ua.korrespondent.net/
vgolos.com.ua
glavcom.ua
24tv.ua
nv.ua
suspilne.media
babel.ua
bykvu.com
golos.ua
vesti.ua
fakty.ua
zaxid.net
kp.ua
telegraf.com.ua
today.ua
gordonua.com
.


#### Set language based on character counts

In [28]:
# ru = ['https://ukranews.com', 'focus.ua', 'strana.ua', \
#       'https://censor.net.ua', 'nv.ua', 'telegraf.com.ua', \
#       'vesti.ua', 'golos.ua', 'kp.ua']

# Domains whose articles set_language() labels as Russian by domain.
ru = [
    'https://ukranews.com',
    'strana.ua',
    'nv.ua',
    'telegraf.com.ua',
    'vesti.ua',
    'golos.ua',
    'kp.ua',
]

def set_language(news, ru_domains=None):
    """Fill `news.language` in place from letter counts, domain and title.

    Rules, applied in this order (later rules overwrite earlier ones unless
    they are limited to rows whose language is still missing):

    1. text has at least one Ukrainian-only letter        -> 'uk'
    2. domain starts with one of `ru_domains`             -> 'ru'
    3. text has at least 5 Russian-only letters           -> 'ru'
    4. text has at least 10 Ukrainian-only letters        -> 'uk'
    5. still missing and domain is https://www.liga.net   -> 'ru'
    6. still missing: at least 3 Ukrainian / Russian letters in the text
    7. still missing: at least 1 Ukrainian / Russian letter in the title
    8. still missing and has text: langdetect says 'en'   -> 'en'

    Args:
        news: DataFrame with `text`, `title`, `domain` and `language` columns;
            modified in place.
        ru_domains: domains treated as Russian-language; defaults to the
            module-level `ru` list.

    Returns:
        None.
    """
    if ru_domains is None:
        ru_domains = ru

    uk_letters = r"[іїІЇЄҐґє]"
    ru_letters = r"[ЫыЁёЭэЪъ]"
    # Counting the letters is equivalent to the former DOTALL searches of the
    # form "([letters].*){n,}", but runs in linear time and does not trigger
    # pandas' "pattern has match groups" warning. Missing values count as 0.
    uk_in_text = news.text.str.count(uk_letters).fillna(0)
    ru_in_text = news.text.str.count(ru_letters).fillna(0)
    uk_in_title = news.title.str.count(uk_letters).fillna(0)
    ru_in_title = news.title.str.count(ru_letters).fillna(0)

    def assign(condition, language):
        # .loc assignment replaces the chained `mask(..., inplace=True)`, which
        # does not modify `news` when pandas Copy-on-Write is active.
        news.loc[condition, 'language'] = language

    def undecided():
        # Re-evaluated on every call because each rule changes the column.
        return news.language.isna()

    assign(uk_in_text >= 1, 'uk')
    if ru_domains:
        # Escape the domains so that '.' only matches a literal dot.
        ru_pattern = "|".join(re.escape(domain) for domain in ru_domains)
        assign(news.domain.str.match(ru_pattern, na=False), 'ru')
    assign(ru_in_text >= 5, 'ru')
    assign(uk_in_text >= 10, 'uk')

    assign((news.domain == 'https://www.liga.net') & undecided(), 'ru')

    assign(undecided() & (uk_in_text >= 3), 'uk')
    assign(undecided() & (ru_in_text >= 3), 'ru')

    assign(undecided() & (uk_in_title >= 1), 'uk')
    assign(undecided() & (ru_in_title >= 1), 'ru')

    remaining = news.loc[undecided() & news.text.notna(), 'text']
    for i, text in remaining.items():
        try:
            if detect(text) == 'en':
                news.at[i, 'language'] = 'en'
        except Exception:
            # Best effort: texts the detector cannot handle stay unlabelled.
            continue

In [30]:
# Fills news.language in place; the function returns nothing.
set_language(news)

  return func(self, *args, **kwargs)


#### Check news where language is None and set the language by hand or delete them

In [None]:
# Rows whose language is still missing and has to be set or dropped by hand.
news[news.language.isna()]

In [None]:
'''Some sample patterns'''

# news = news[news.title!='тест']
# news.loc[news.language.isna()&(news.domain!='www.unn.com.ua'), 'language'] = 'ru'
# news.loc[news.language.isna()&(news.domain=='112.ua'), 'language'] = 'uk'
# news = news[news.language.notna()]

#### Set publication type based on category, tags or text

In [29]:
def get_publication_type(df, pub_type_patterns):
    """Set `df.pub_type` in place from regex patterns matched against other columns.

    `pub_type_patterns` maps a key to `{domain: [(regex, pub_type), ...]}`.
    The column to search is the second '_'-separated part of the key. Every
    row whose value in that column matches the regex (case-insensitively) gets
    the corresponding publication type; later patterns overwrite earlier ones.

    NOTE(review): the domain keys are not used to restrict the rows - every
    pattern is applied to all domains, exactly as before. Confirm whether
    per-domain matching was intended.

    Args:
        df: DataFrame with a `pub_type` column and the target columns.
        pub_type_patterns: the nested pattern configuration described above.

    Returns:
        The same DataFrame object, modified in place.
    """
    for key, values in pub_type_patterns.items():
        target_column = key.split('_')[1]
        for patterns in values.values():
            for pat in patterns:
                matches = df[target_column].str.contains(pat[0], flags=re.I, na=False)
                # .loc assignment replaces the chained `mask(..., inplace=True)`,
                # which does not modify `df` when pandas Copy-on-Write is active.
                df.loc[matches, 'pub_type'] = pat[1]

    return df
            

In [31]:
# The second argument must be the pattern configuration imported from
# news_cleaning_config; the call used to pass the function itself, which has
# no .items() and fails with AttributeError.
news = get_publication_type(news, pub_type_patterns)

In [83]:
# Number of articles per publication type.
news.pub_type.value_counts()

news           459073
publication     10804
blog             7433
Name: pub_type, dtype: int64

In [8]:
# Save the cleaned frame (index included), overwriting the input CSV.
news.to_csv(news_filepath)

Index(['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'pub_type', 'author', 'tags',
       'source', 'author_title', 'domain_alias'],
      dtype='object')