In [1]:
!pip install loguru asyncpg psycopg2 nest_asyncio

Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asyncpg
  Downloading asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: loguru, asyncpg
Successfully installed asyncpg-0.29.0 loguru-0.7.2


In [2]:
import torch

from loguru import logger
import nest_asyncio

from transformers import MBartTokenizer, MBartForConditionalGeneration


import asyncio
import asyncpg


device = "cuda" if torch.cuda.is_available() else "cpu"

DB_USER = 'DB_USER'
DB_NAME = 'DB_NAME'
DB_PASS = "DB_PASS"
DB_HOST = "DB_HOST"
DB_PORT = 5432


# Параметры подключения к базе данных PostgreSQL
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:5432/{DB_NAME}"

# Размер батча для обработки модели суммаризации
BATCH_SIZE = 16

In [3]:
def get_model(model_name: str = "IlyaGusev/mbart_ru_sum_gazeta") -> tuple:
    model_params = {
        "IlyaGusev/mbart_ru_sum_gazeta":
        {
            'tokenizer': MBartTokenizer,
            'model': MBartForConditionalGeneration,
            "column_name": 'news',
            'result_column': 'resume'
        },
    }

    tokenizer = model_params[model_name]['tokenizer'].from_pretrained(model_name)
    model = model_params[model_name]['model'].from_pretrained(model_name)
    model.to(device)
    # column_name = model_params[model_name]['column_name']
    # result_column = model_params[model_name]['result_column']
    return tokenizer, model

tokenizer, model = get_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/406 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.47G [00:00<?, ?B/s]

In [4]:
nest_asyncio.apply()

def summarize_batch(news_texts) -> list:

    input_ids = tokenizer(
        news_texts,
        max_length=600,
        truncation=True,
        padding=True,
        return_tensors="pt", )["input_ids"].to(device)

    output_ids = model.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4)

    summary = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # logger.info(f'Обработка завершена успешно')
    return summary



async def get_news_without_resume(conn):
    query = """
        SELECT url, news
        FROM newsilcom
        WHERE date::date >= '2023-06-17' and resume IS NULL
        ORDER BY 1 desc
        LIMIT $1
    """
    return await conn.fetch(query, BATCH_SIZE)

async def update_news_resume(conn, news_resumes):
    query = """
        UPDATE newsilcom
        SET resume = $1
        WHERE url = $2
    """
    await conn.executemany(query, news_resumes)

async def process_news_batch(pool):
    async with pool.acquire() as conn:
        news_batch = await get_news_without_resume(conn)
        news_texts = [row['news'] for row in news_batch]
        resumes = summarize_batch(news_texts)
        news_resumes = [(resume, row['url']) for resume, row in zip(resumes, news_batch)]
        await update_news_resume(conn, news_resumes)

async def main():
    pool = await asyncpg.create_pool(DATABASE_URL)
    i = 1
    while True:
        # logger.info(f'Начинается обработка {i} транша')
        # try:
        await process_news_batch(pool)
        await asyncio.sleep(1)  # Задержка между итерациями
        logger.info(f'Завершено выполнение батча {i},  обработано всего {i*BATCH_SIZE} новостей')
        # except ValueError:
        #   logger.info(f'Ошибка в обработке батча {i}')
        i += 1
    await pool.close()


if __name__ == '__main__':
    asyncio.run(main())

[32m2024-07-01 14:29:06.285[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 1,  обработано всего 16 новостей[0m
[32m2024-07-01 14:29:29.357[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 2,  обработано всего 32 новостей[0m
[32m2024-07-01 14:29:52.489[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 3,  обработано всего 48 новостей[0m
[32m2024-07-01 14:30:16.343[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 4,  обработано всего 64 новостей[0m
[32m2024-07-01 14:30:39.373[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 5,  обработано всего 80 новостей[0m
[32m2024-07-01 14:31:02.614[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mЗавершено выполнение батча 6,  обработано всего 96 новос

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []