In [1]:
cd /content/drive/MyDrive/Colab Notebooks/json2news

/content/drive/MyDrive/Colab Notebooks/json2news


In [2]:
!pip install dpath fasttext loguru



In [3]:
import os
import pandas as pd
import datetime as dt
import dpath
import re
import torch
import fasttext as fasttext
from loguru import logger

from transformers import MBartTokenizer, MBartForConditionalGeneration, AutoTokenizer, T5ForConditionalGeneration, \
    AutoModel

# Choose the torch device once: use CUDA when torch reports it as available,
# otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Substrings that get_news() replaces with a single space in every message.
blacklist = [
    '*Власти считают иноагентом',
    '*Минюст признал иноагентами',
    '*Минюст объявил иноагентом',
    '18+',
    '@bbcrussian',
    '@interfaxonline',
    '@vedomosti',
]

In [None]:
def get_news(content_list: list) -> str:
    """Join the text fragments of one message into a single cleaned string.

    Args:
        content_list: Iterable of mappings that each provide a ``'text'`` key
            (the caller passes ``news_dict['text_entities']``).

    Returns:
        The concatenated text with the two fixed disclaimer passages removed,
        newlines turned into spaces, every ``blacklist`` entry blanked out,
        runs of two or more whitespace characters collapsed to one space, and
        no leading or trailing whitespace.

    Raises:
        KeyError: If an element of ``content_list`` has no ``'text'`` key.
    """
    foreign_agent_notice = (
        '\nДАННОЕ СООБЩЕНИЕ (МАТЕРИАЛ) СОЗДАНО И (ИЛИ) РАСПРОСТРАНЕНО ИНОСТРАННЫМ СРЕДСТВОМ МАССОВОЙ ИНФОРМАЦИИ, '
        'ВЫПОЛНЯЮЩИМ ФУНКЦИИ ИНОСТРАННОГО АГЕНТА, И (ИЛИ) РОССИЙСКИМ ЮРИДИЧЕСКИМ ЛИЦОМ, ВЫПОЛНЯЮЩИМ ФУНКЦИИ '
        'ИНОСТРАННОГО АГЕНТА\n\n'
    )
    wartime_notice = (
        'Во время войны оперативно проверить информацию, которую распространяют даже официальные представители '
        'конфликтующих сторон, невозможно.'
    )

    text = ''.join(el['text'] for el in content_list)
    text = text.replace(foreign_agent_notice, '').replace(wartime_notice, '').replace('\n', ' ')

    for label in blacklist:
        text = text.replace(label, ' ')

    # Strip only after the blacklist pass: each removed label leaves a space
    # behind, so stripping earlier would let a label at the very start or end
    # of the message leak a stray leading/trailing space into the result.
    return re.sub(r"\s{2,}", ' ', text).strip()


def collect_news_attrs(news_dict: dict) -> tuple:
    """Extract the fields of one message record that the pipeline keeps.

    Args:
        news_dict: One message record; must provide ``'id'``, an ISO-format
            ``'date'`` string and ``'text_entities'``.

    Returns:
        A tuple ``(url, news, date, links)``: ``url`` is the raw ``'id'``
        value, ``news`` is the cleaned text produced by ``get_news``, ``date``
        is a ``datetime.datetime`` and ``links`` is whatever ``dpath.get``
        finds at ``text/*/href`` or, when that lookup fails, the ``'id'``.

    Raises:
        KeyError: If ``'id'``, ``'date'`` or ``'text_entities'`` is missing.
        ValueError: If ``'date'`` is not a valid ISO-format string.
    """
    url = news_dict['id']
    date = dt.datetime.fromisoformat(news_dict['date'])
    try:
        links = dpath.get(news_dict, "text/*/href")
    except Exception:
        # Best-effort lookup: any failure (no match, several matches, ...)
        # falls back to the message id. `Exception` instead of a bare
        # `except:` so that KeyboardInterrupt/SystemExit still propagate and a
        # long run can be interrupted.
        links = url
    news = get_news(news_dict['text_entities'])
    return url, news, date, links


def get_df(df: pd.DataFrame, agency_name: str) -> pd.DataFrame:
    """Build the per-message news frame from a raw export frame.

    The input frame is left untouched (a new frame is returned), so the
    function can be re-run on the same raw frame.

    Args:
        df: Raw frame with the columns ``'name'``, ``'type'``, ``'id'`` and
            ``'messages'``; each ``'messages'`` cell is passed to
            ``collect_news_attrs``.
        agency_name: Prefix put in front of every message id to form ``'url'``.

    Returns:
        A new frame that has the four raw columns removed and the columns
        ``'url'``, ``'news'``, ``'date'`` and ``'links'`` added. An empty
        input yields an empty frame with those columns.

    Raises:
        KeyError: If one of the four raw columns is missing.
    """
    attrs = [collect_news_attrs(message) for message in df['messages']]
    # zip(*[]) produces nothing to unpack, so an empty frame needs explicit
    # empty columns instead of crashing.
    urls, news, dates, links = zip(*attrs) if attrs else ((), (), (), ())

    result = df.drop(columns=['name', 'type', 'id', 'messages'])
    result['url'] = [agency_name + str(url) for url in urls]
    result['news'] = list(news)
    result['date'] = list(dates)
    result['links'] = list(links)
    return result


def get_model(model_name: str) -> tuple:
    """Load the tokenizer and model registered for ``model_name``.

    Args:
        model_name: One of the two checkpoint names listed in the registry
            below; it is also what gets passed to ``from_pretrained``.

    Returns:
        ``(tokenizer, model, column_name, result_column)``: the loaded
        tokenizer, the loaded model (moved to the module-level ``device``),
        the name of the column to read text from and the name of the column
        to write generated text to.

    Raises:
        KeyError: If ``model_name`` is not in the registry.
    """
    model_params = {
        "IlyaGusev/mbart_ru_sum_gazeta": dict(
            tokenizer=MBartTokenizer,
            model=MBartForConditionalGeneration,
            column_name='news',
            result_column='resume',
        ),
        "IlyaGusev/rut5_base_headline_gen_telegram": dict(
            tokenizer=AutoTokenizer,
            model=T5ForConditionalGeneration,
            column_name='resume',
            result_column='title',
        ),
    }
    spec = model_params[model_name]

    tokenizer = spec['tokenizer'].from_pretrained(model_name)
    model = spec['model'].from_pretrained(model_name)
    model.to(device)
    return tokenizer, model, spec['column_name'], spec['result_column']


def get_summary(df: pd.DataFrame, model_name: str, agency_name: str, batch_size: int = 16) -> pd.DataFrame:
    """Generate text for every row with ``model_name`` and store it in ``df``.

    The source and destination columns come from ``get_model``. The frame is
    modified in place (the result column is added), pickled to
    ``source/{agency_name}_compressed.pkl`` with gzip compression, and
    returned.

    Args:
        df: Frame that contains the source column for ``model_name``.
        model_name: Checkpoint name understood by ``get_model``.
        agency_name: Used to build the pickle file name.
        batch_size: Number of texts sent to the model per step.

    Returns:
        The same ``df`` object with the result column filled in.
    """
    tokenizer, model, column_name, result_column = get_model(model_name)

    parents_list = df[column_name].tolist()
    summary_list = []

    # Iterate over the real batch offsets only. The previous
    # `len // batch_size + 1` count scheduled an extra, empty batch whenever
    # the row count was a multiple of batch_size (or zero), and that empty
    # list was then handed to the tokenizer and to generate().
    batch_starts = range(0, len(parents_list), batch_size)
    epochs_amount = len(batch_starts)

    for epoch, start in enumerate(batch_starts, start=1):
        logger.info(f'Эпоха {epoch}/{epochs_amount}')
        current_list = parents_list[start:start + batch_size]

        # Inputs longer than 600 tokens are truncated; shorter ones are padded
        # to the longest item of the batch.
        input_ids = tokenizer(
            current_list,
            max_length=600,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )["input_ids"].to(device)

        output_ids = model.generate(
            input_ids=input_ids,
            no_repeat_ngram_size=4)

        summary_list.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))

    df[result_column] = summary_list
    # Make sure the target directory exists so a finished (and slow)
    # generation run is not lost to a missing folder.
    os.makedirs('source', exist_ok=True)
    df.to_pickle(f'source/{agency_name}_compressed.pkl', compression='gzip')
    logger.info(f'Обработка {model_name} завершена успешно')
    return df


def get_category(df: pd.DataFrame) -> pd.DataFrame:
    """Classify every news text and drop the rows that are not real news.

    Loads the fastText model from ``models/cat_model.ftz``, predicts one label
    per ``'news'`` value and keeps only rows whose category is neither
    ``'not_news'`` nor ``'other'``.

    Args:
        df: Frame with a ``'news'`` text column. It is not modified.

    Returns:
        An independent copy of the kept rows with a ``'category'`` column.
        Returning a copy (rather than a filtered slice) lets later steps add
        columns without pandas' SettingWithCopyWarning.
    """
    model_class = fasttext.load_model("models/cat_model.ftz")

    def predict_category(text: str) -> str:
        # Take the first predicted label and keep what follows its second
        # "__" separator (presumably labels look like "__label__<category>" -
        # confirm against the model).
        return model_class.predict(text)[0][0].split('__', 2)[-1]

    categories = df['news'].apply(predict_category)
    keep = ~categories.isin(['not_news', 'other'])

    result = df.loc[keep].copy()
    # Assign positionally so a non-unique index cannot break alignment.
    result['category'] = categories[keep].to_numpy()
    logger.info(f'Классификация завершена, категории успешно присвоены')
    return result


def make_me_embs(sentences: list) -> list:
    """Embed a batch of sentences with ``cointegrated/LaBSE-en-ru``.

    Args:
        sentences: Texts to embed. Each is truncated to 64 tokens.

    Returns:
        One embedding per sentence, as a list of lists of floats: the model's
        ``pooler_output`` normalised with ``torch.nn.functional.normalize``
        (default arguments). An empty input returns an empty list.
    """
    if len(sentences) == 0:
        # Nothing to encode: skip loading the model and do not hand an empty
        # batch to the tokenizer.
        return []

    # NOTE(review): tokenizer and model are loaded on every call; callers that
    # embed many batches pay that cost each time.
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
    model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru").to(device)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.tolist()


def get_embs(df: pd.DataFrame, agency_name: str, batch_size: int = 500) -> pd.DataFrame:
    """Embed every ``'news'`` text and store the vectors in ``df['embs']``.

    The frame is modified in place, pickled to
    ``source/super_final_{agency_name}.pkl`` with gzip compression, and
    returned.

    Args:
        df: Frame with a ``'news'`` text column.
        agency_name: Used to build the pickle file name.
        batch_size: Number of texts passed to ``make_me_embs`` per step.

    Returns:
        The same ``df`` object with the ``'embs'`` column added.
    """
    list_embs = []

    # Iterate over the real batch offsets only; `len(df) // batch_size + 1`
    # produced an extra, empty batch whenever the row count was a multiple of
    # batch_size (or zero).
    batch_starts = range(0, len(df), batch_size)
    epochs_amount = len(batch_starts)

    for epoch, start in enumerate(batch_starts, start=1):
        logger.info(f'Эпоха {epoch}/{epochs_amount}')
        # .iloc makes the positional slicing explicit: the index has gaps
        # after the category filter, so label-based slicing would be wrong.
        current_list = df['news'].iloc[start:start + batch_size].tolist()
        list_embs.extend(make_me_embs(current_list))

    df['embs'] = list_embs
    # Make sure the target directory exists before writing the final result.
    os.makedirs('source', exist_ok=True)
    df.to_pickle(f'source/super_final_{agency_name}.pkl', compression='gzip')
    logger.info(f'Векторизация завершена, эмбеддинги записаны')
    return df


def main(json_news_file: str) -> pd.DataFrame:
    """Run the whole pipeline for one JSON export located in ``data/``.

    Steps: read ``data/{json_news_file}``, build the news frame, classify and
    filter it, run both generation models (summary, then headline) and finally
    compute embeddings. The intermediate and final frames are pickled by
    ``get_summary`` and ``get_embs``.

    Args:
        json_news_file: File name inside ``data/``. It is also used, as is, as
            the agency name for url prefixes and pickle file names.

    Returns:
        The final frame returned by ``get_embs``.
    """
    logger.info(f'Начинается обработка json-массива новостей {json_news_file}')
    df_draft = pd.read_json(f'data/{json_news_file}')
    df = get_df(df_draft, json_news_file)
    logger.info(f'Новости записаны в датафрейм, начинается процесс классификации')
    df = get_category(df)
    logger.info(f'Начинается процесс суммаризации')
    # Order matters: the headline model reads the 'resume' column that the
    # summarisation model writes.
    model_names = ['IlyaGusev/mbart_ru_sum_gazeta', 'IlyaGusev/rut5_base_headline_gen_telegram']
    for model_name in model_names:
        df = get_summary(df, model_name=model_name, agency_name=json_news_file)
        torch.cuda.empty_cache()
    df = get_embs(df, json_news_file)
    # Report the path get_embs actually writes; the previous message named
    # data/super_final_compressed.pkl, which is never created.
    logger.info(
        f'Обработка {json_news_file} успешно завершена, итоговый датафрейм сохранён как '
        f'source/super_final_{json_news_file}.pkl')
    return df

if __name__ == '__main__':
    # Process only the *.json exports: os.listdir() also returns every other
    # entry of data/ (sub-folders, stray files), which pd.read_json inside
    # main() cannot parse. Sorted so the processing order is deterministic.
    agencies_news_list = sorted(
        file for file in os.listdir("data") if file.endswith('.json'))
    for agency_news in agencies_news_list:
        main(agency_news)

[32m2024-03-01 18:15:56.889[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m135[0m - [1mНачинается обработка json-массива новостей bbcrussian.json[0m
[32m2024-03-01 18:15:59.593[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m138[0m - [1mНовости записаны в датафрейм, начинается процесс классификации[0m
[32m2024-03-01 18:16:00.881[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_category[0m:[36m101[0m - [1mКлассификация завершена, категории успешно присвоены[0m
[32m2024-03-01 18:16:01.135[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m140[0m - [1mНачинается процесс суммаризации[0m
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recom

tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/977M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
[32m2024-03-01 19:03:55.182[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 1/545[0m
[32m2024-03-01 19:03:57.055[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 2/545[0m
[32m2024-03-01 19:03:58.208[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 3/545[0m
[32m2024-03-01 19:03:59.316[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 4/545[0m
[32m2024-03-01 19:04:00.677[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 5/545[0m
[32m2024-03-01 19:04:01.980[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 6/545[0m
[32m2024-03-01 19:04:03.384[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_summary[0m:[36m73[0m - [1mЭпоха 7/545[0m
[32m2024-03-01 19:04:04.792[0m | [1mINFO    [0m | [36m__main

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

[32m2024-03-01 19:18:03.244[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 2/18[0m
[32m2024-03-01 19:18:04.541[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 3/18[0m
[32m2024-03-01 19:18:05.721[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 4/18[0m
[32m2024-03-01 19:18:07.186[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 5/18[0m
[32m2024-03-01 19:18:08.377[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 6/18[0m
[32m2024-03-01 19:18:09.570[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 7/18[0m
[32m2024-03-01 19:18:11.014[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 8/18[0m
[32m2024-03-01 19:18:12.241[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_embs[0m:[36m123[0m - [1mЭпоха 9/18[0m
[32m202