In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from tqdm import tqdm
from transformers import MBart50TokenizerFast
import pandas as pd
from modules.benchmark import GeneratedHeadlinesBenchmark
from modules.new_model import My_MBart
import numpy as np
from collections import defaultdict
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
bench = GeneratedHeadlinesBenchmark()

Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
test_data = pd.read_csv('data/new_test_dataset.csv')

In [6]:
test_data.shape

(3804, 3)

In [9]:
def average_metrics(results_list):
    """Average a list of per-sample metric dicts key by key.

    Each element of ``results_list`` is a dict whose values are either plain
    numbers or one-level nested dicts of numbers. Values found under the same
    key (or the same key / sub-key pair) are averaged with ``np.mean``.

    Args:
        results_list: Iterable of metric dicts sharing the same layout.

    Returns:
        A dict with the same layout as the inputs, holding the mean of every
        metric. Keys appear in first-seen order. An empty input yields ``{}``.
    """
    collected = {}

    # Pass 1: gather every observed value under its key (and sub-key).
    for record in results_list:
        for name, metric in record.items():
            if not isinstance(metric, dict):
                # Scalar metric: one flat list of samples per key.
                collected.setdefault(name, []).append(metric)
                continue
            # Nested metric group: one list of samples per sub-key.
            group = collected.setdefault(name, {})
            for sub_name, sub_metric in metric.items():
                if sub_name not in group:
                    group[sub_name] = []
                group[sub_name].append(sub_metric)

    # Pass 2: collapse every list of samples into its mean.
    averaged = {}
    for name, samples in collected.items():
        if isinstance(samples, dict):
            averaged[name] = {
                sub_name: np.mean(sub_samples)
                for sub_name, sub_samples in samples.items()
            }
        else:
            averaged[name] = np.mean(samples)

    return averaged


In [10]:
def dict_to_markdown(data, model_name):
    """Render aggregated metrics as a one-row Markdown table.

    Args:
        data: Mapping of metric name -> ``{'mean': float, 'std': float}``.
            Column order follows the mapping's iteration order.
        model_name: Label written into the first ("Model") column.

    Returns:
        A string with three newline-terminated lines: header row, delimiter
        row and a single data row formatted as ``mean +- std`` with six
        decimal places.

    Raises:
        KeyError: If an entry of ``data`` lacks ``'mean'`` or ``'std'``.
    """
    # str() keeps the join below working even if a metric key is not a string.
    headers = [str(key) for key in data]
    values = [f"{stat['mean']:.6f} +- {stat['std']:.6f}" for stat in data.values()]

    # Delimiter cells mirror the header width but are never empty, so a very
    # short (or empty) metric name cannot produce an invalid delimiter row.
    dividers = ["-" * max(len(header), 3) for header in headers]

    markdown_table = "| Model | " + " | ".join(headers) + " |\n"
    markdown_table += "| ----- | " + " | ".join(dividers) + " |\n"
    # The model cell is padded on both sides like every other cell
    # (previously the closing pipe was glued to the name: "| baseline| ...").
    markdown_table += "| " + model_name + " | " + " | ".join(values) + " |\n"

    return markdown_table

In [11]:
def mean_std(metrics_list):
    """Compute mean and standard deviation of every metric across runs.

    Each element of ``metrics_list`` must contain the keys ``'Rouge'`` and
    ``'CS_CR'`` (dicts of sub-metrics) plus the scalar keys ``'Meteor'`` and
    ``'Cider'``. Sub-metrics are flattened into the top level of the result.

    Args:
        metrics_list: Iterable of per-run metric dicts with the layout above.

    Returns:
        Dict mapping each flattened metric name to
        ``{'mean': float, 'std': float}``. ``np.std`` is called with its
        defaults, so a single run always reports a std of 0.0.
    """
    samples = defaultdict(list)

    # Flatten every run into per-metric sample lists, keeping the
    # Rouge -> Meteor -> Cider -> CS_CR column order.
    for run in metrics_list:
        for name, score in run['Rouge'].items():
            samples[name].append(score)
        for name in ('Meteor', 'Cider'):
            samples[name].append(run[name])
        for name, score in run['CS_CR'].items():
            samples[name].append(score)

    # Reduce each sample list to plain-float summary statistics.
    summary = {}
    for name, scores in samples.items():
        scores_arr = np.array(scores)
        mean_value = float(np.mean(scores_arr))
        std_value = float(np.std(scores_arr))
        summary[name] = {'mean': mean_value, 'std': std_value}
    return summary

In [None]:
seeds = [42]

In [15]:
# Load the cluster centroids as a plain NumPy array: the 'cluster_id' column is
# dropped so that only the vector components remain. Later cells index this
# array by row position using the 'cluster' column of the test set.
cluster_centers = (
    pd.read_csv('./data/cluster_centers_new.csv')
    .drop('cluster_id', axis=1)
    .values
)

In [16]:
def cluster_metadata(cluster_id, keywords=None):
    """Build a one-line textual description of a cluster.

    Args:
        cluster_id: Key used to look up the cluster's keywords.
        keywords: Optional mapping (or sequence) from cluster id to an
            iterable of keyword strings. When omitted, the notebook-level
            ``cluster_keyword`` global is used.

    Returns:
        A string of the form ``"Кластер: <id> | Ключевые слова: <kw1>, <kw2>"``.

    Raises:
        NameError: If ``keywords`` is omitted and ``cluster_keyword`` has not
            been defined in the kernel.
    """
    if keywords is None:
        # NOTE(review): `cluster_keyword` is not defined in any visible cell of
        # this notebook, so this fallback relies on state from elsewhere —
        # pass `keywords` explicitly or define the global before calling.
        keywords = cluster_keyword
    cluster_info = f"Кластер: {cluster_id} | Ключевые слова: {', '.join(keywords[cluster_id])}"
    return cluster_info

In [None]:
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")

## Default Baseline

In [31]:
# Evaluate the baseline checkpoint of every seed on the whole test set and keep
# one averaged metrics dict per seed.
baseline_seed_metrics = []
for seed in seeds:
    model = My_MBart.from_pretrained(f"models/ft-bart-headline-generation_baseline_seed{seed}")
    model = model.to('cuda')
    results = []
    # zip() has no length, so the total is passed explicitly to give tqdm a
    # real progress bar and ETA instead of a bare iteration counter.
    for text, title in tqdm(zip(test_data['text'], test_data['title']), total=len(test_data)):
        # Tokenize once. An earlier duplicate call (max_length=1024) was
        # overwritten immediately and only wasted time.
        inputs = tokenizer(
            [text],
            max_length=600,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        output = model.generate(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
            max_length=128,
        )

        headline = tokenizer.decode(output[0], skip_special_tokens=True)

        results.append(bench.calculate_metrics(text, title, headline))
    average_results = average_metrics(results)
    baseline_seed_metrics.append(average_results)
        

3804it [17:59,  3.53it/s]


In [34]:
mean_std(baseline_seed_metrics)

{'ROUGE-1': {'mean': 0.16395110958435435, 'std': 0.0},
 'ROUGE-2': {'mean': 0.0782541440047328, 'std': 0.0},
 'ROUGE-L': {'mean': 0.15795206982475365, 'std': 0.0},
 'Meteor': {'mean': 0.1425000390236755, 'std': 0.0},
 'Cider': {'mean': 0.11578667793528173, 'std': 0.0},
 'Cosine Similarity': {'mean': 0.6182661373109472, 'std': 0.0},
 'Conseptual Relevance': {'mean': 0.9492792911008197, 'std': 0.0}}

In [35]:
print(dict_to_markdown(mean_std(baseline_seed_metrics), 'baseline'))

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Meteor | Cider | Cosine Similarity | Conseptual Relevance |
| ----- | ------- | ------- | ------- | ------ | ----- | ----------------- | -------------------- |
| baseline| 0.163951 +- 0.000000 | 0.078254 +- 0.000000 | 0.157952 +- 0.000000 | 0.142500 +- 0.000000 | 0.115787 +- 0.000000 | 0.618266 +- 0.000000 | 0.949279 +- 0.000000 |



## Inject cluster

### Inject cluster type 0

In [37]:
# Evaluate the "add_cluster_0" checkpoint of every seed on the whole test set
# and keep one averaged metrics dict per seed.
seed_metrics = []
for seed in seeds:
    model = My_MBart.from_pretrained(f"models/ft-bart-headline-generation_add_cluster_0_seed{seed}")
    model = model.to('cuda')
    results = []
    # zip() has no length, so the total is passed explicitly to give tqdm a
    # real progress bar and ETA instead of a bare iteration counter.
    for text, cluster, title in tqdm(
        zip(test_data['text'], test_data['cluster'], test_data['title']),
        total=len(test_data),
    ):
        # Tokenize once. An earlier duplicate call (max_length=1024) was
        # overwritten immediately and only wasted time.
        inputs = tokenizer(
            [text],
            max_length=600,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Build the (1, dim) float32 centroid tensor straight from the NumPy
        # row. torch.Tensor([ndarray]) goes through a Python list of arrays,
        # which PyTorch warns is extremely slow.
        # Assumes the cluster id equals the row position in cluster_centers — TODO confirm.
        meta_embs = torch.as_tensor(cluster_centers[cluster], dtype=torch.float32).unsqueeze(0)

        output = model.generate(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
            meta_embs=meta_embs.to('cuda'),
            max_length=128,
        )

        headline = tokenizer.decode(output[0], skip_special_tokens=True)

        results.append(bench.calculate_metrics(text, title, headline))
    average_results = average_metrics(results)
    seed_metrics.append(average_results)
        

  meta_embs=torch.Tensor([cluster_centers[cluster]]).to('cuda'),
3804it [18:11,  3.49it/s]


In [38]:
mean_std(seed_metrics)

{'ROUGE-1': {'mean': 0.15068339673212125, 'std': 0.0},
 'ROUGE-2': {'mean': 0.06754238706297275, 'std': 0.0},
 'ROUGE-L': {'mean': 0.14554023039671127, 'std': 0.0},
 'Meteor': {'mean': 0.12777954331616564, 'std': 0.0},
 'Cider': {'mean': 0.10057767049087663, 'std': 0.0},
 'Cosine Similarity': {'mean': 0.6027991571762333, 'std': 0.0},
 'Conseptual Relevance': {'mean': 0.9345790706864956, 'std': 0.0}}

In [39]:
print(dict_to_markdown(mean_std(seed_metrics), 'add_cluster_type_0'))

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Meteor | Cider | Cosine Similarity | Conseptual Relevance |
| ----- | ------- | ------- | ------- | ------ | ----- | ----------------- | -------------------- |
| add_cluster_type_0| 0.150683 +- 0.000000 | 0.067542 +- 0.000000 | 0.145540 +- 0.000000 | 0.127780 +- 0.000000 | 0.100578 +- 0.000000 | 0.602799 +- 0.000000 | 0.934579 +- 0.000000 |



### Inject cluster type 1

In [19]:
# Evaluate the "add_cluster_1" checkpoint of every seed on the whole test set
# and keep one averaged metrics dict per seed.
seed_metrics_1 = []
for seed in seeds:
    model = My_MBart.from_pretrained(f"models/ft-bart-headline-generation_add_cluster_1_seed{seed}")
    model = model.to('cuda')
    results = []
    # zip() has no length, so the total is passed explicitly to give tqdm a
    # real progress bar and ETA instead of a bare iteration counter.
    for text, cluster, title in tqdm(
        zip(test_data['text'], test_data['cluster'], test_data['title']),
        total=len(test_data),
    ):
        # Tokenize once. An earlier duplicate call (max_length=1024) was
        # overwritten immediately and only wasted time.
        inputs = tokenizer(
            [text],
            max_length=600,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Build the (1, dim) float32 centroid tensor straight from the NumPy
        # row. torch.Tensor([ndarray]) goes through a Python list of arrays,
        # which PyTorch warns is extremely slow.
        # Assumes the cluster id equals the row position in cluster_centers — TODO confirm.
        meta_embs = torch.as_tensor(cluster_centers[cluster], dtype=torch.float32).unsqueeze(0)

        output = model.generate(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
            meta_embs=meta_embs.to('cuda'),
            max_length=128,
        )

        headline = tokenizer.decode(output[0], skip_special_tokens=True)

        results.append(bench.calculate_metrics(text, title, headline))
    average_results = average_metrics(results)
    seed_metrics_1.append(average_results)
        

  meta_embs=torch.Tensor([cluster_centers[cluster]]).to('cuda'),
3804it [17:06,  3.71it/s]


In [20]:
mean_std(seed_metrics_1)

{'ROUGE-1': {'mean': 0.18111516029655053, 'std': 0.0},
 'ROUGE-2': {'mean': 0.09009259794078926, 'std': 0.0},
 'ROUGE-L': {'mean': 0.17393819945544725, 'std': 0.0},
 'Meteor': {'mean': 0.16055247105241818, 'std': 0.0},
 'Cider': {'mean': 0.13565889822174085, 'std': 0.0},
 'Cosine Similarity': {'mean': 0.6338736785793154, 'std': 0.0},
 'Conseptual Relevance': {'mean': 0.9587906478409188, 'std': 0.0}}

In [21]:
print(dict_to_markdown(mean_std(seed_metrics_1), 'add_cluster_type_1'))

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Meteor | Cider | Cosine Similarity | Conseptual Relevance |
| ----- | ------- | ------- | ------- | ------ | ----- | ----------------- | -------------------- |
| add_cluster_type_1| 0.181115 +- 0.000000 | 0.090093 +- 0.000000 | 0.173938 +- 0.000000 | 0.160552 +- 0.000000 | 0.135659 +- 0.000000 | 0.633874 +- 0.000000 | 0.958791 +- 0.000000 |



### Inject cluster type 2

In [22]:
# Evaluate the "add_cluster_2" checkpoint of every seed on the whole test set
# and keep one averaged metrics dict per seed.
seed_metrics_2 = []
for seed in seeds:
    model = My_MBart.from_pretrained(f"models/ft-bart-headline-generation_add_cluster_2_seed{seed}")
    model = model.to('cuda')
    results = []
    # zip() has no length, so the total is passed explicitly to give tqdm a
    # real progress bar and ETA instead of a bare iteration counter.
    for text, cluster, title in tqdm(
        zip(test_data['text'], test_data['cluster'], test_data['title']),
        total=len(test_data),
    ):
        # Tokenize once. An earlier duplicate call (max_length=1024) was
        # overwritten immediately and only wasted time.
        inputs = tokenizer(
            [text],
            max_length=600,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Build the (1, dim) float32 centroid tensor straight from the NumPy
        # row. torch.Tensor([ndarray]) goes through a Python list of arrays,
        # which PyTorch warns is extremely slow.
        # Assumes the cluster id equals the row position in cluster_centers — TODO confirm.
        meta_embs = torch.as_tensor(cluster_centers[cluster], dtype=torch.float32).unsqueeze(0)

        output = model.generate(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
            meta_embs=meta_embs.to('cuda'),
            max_length=128,
        )

        headline = tokenizer.decode(output[0], skip_special_tokens=True)

        results.append(bench.calculate_metrics(text, title, headline))
    average_results = average_metrics(results)
    seed_metrics_2.append(average_results)
        

3804it [19:24,  3.27it/s]


In [24]:
mean_std(seed_metrics_2)

{'ROUGE-1': {'mean': 0.18546602598337764, 'std': 0.0},
 'ROUGE-2': {'mean': 0.09422155396520057, 'std': 0.0},
 'ROUGE-L': {'mean': 0.17835019975468192, 'std': 0.0},
 'Meteor': {'mean': 0.1635646489625219, 'std': 0.0},
 'Cider': {'mean': 0.14005738312127403, 'std': 0.0},
 'Cosine Similarity': {'mean': 0.6372775763320246, 'std': 0.0},
 'Conseptual Relevance': {'mean': 0.9606065979546671, 'std': 0.0}}

In [25]:
print(dict_to_markdown(mean_std(seed_metrics_2), 'add_cluster_type_2'))

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Meteor | Cider | Cosine Similarity | Conseptual Relevance |
| ----- | ------- | ------- | ------- | ------ | ----- | ----------------- | -------------------- |
| add_cluster_type_2| 0.185466 +- 0.000000 | 0.094222 +- 0.000000 | 0.178350 +- 0.000000 | 0.163565 +- 0.000000 | 0.140057 +- 0.000000 | 0.637278 +- 0.000000 | 0.960607 +- 0.000000 |



### Inject cluster type 3

In [28]:
# Evaluate the "add_cluster_3" checkpoint of every seed on the whole test set
# and keep one averaged metrics dict per seed.
seed_metrics_3 = []
for seed in seeds:
    model = My_MBart.from_pretrained(f"models/ft-bart-headline-generation_add_cluster_3_seed{seed}")
    model = model.to('cuda')
    results = []
    # zip() has no length, so the total is passed explicitly to give tqdm a
    # real progress bar and ETA instead of a bare iteration counter.
    for text, cluster, title in tqdm(
        zip(test_data['text'], test_data['cluster'], test_data['title']),
        total=len(test_data),
    ):
        # Tokenize once. An earlier duplicate call (max_length=1024) was
        # overwritten immediately and only wasted time.
        inputs = tokenizer(
            [text],
            max_length=600,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Build the (1, dim) float32 centroid tensor straight from the NumPy
        # row. torch.Tensor([ndarray]) goes through a Python list of arrays,
        # which PyTorch warns is extremely slow.
        # Assumes the cluster id equals the row position in cluster_centers — TODO confirm.
        meta_embs = torch.as_tensor(cluster_centers[cluster], dtype=torch.float32).unsqueeze(0)

        output = model.generate(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
            meta_embs=meta_embs.to('cuda'),
            max_length=128,
        )

        headline = tokenizer.decode(output[0], skip_special_tokens=True)

        results.append(bench.calculate_metrics(text, title, headline))
    average_results = average_metrics(results)
    seed_metrics_3.append(average_results)
        

3804it [18:12,  3.48it/s]


In [29]:
mean_std(seed_metrics_3)

{'ROUGE-1': {'mean': 0.1676581930189058, 'std': 0.0},
 'ROUGE-2': {'mean': 0.08088196856463964, 'std': 0.0},
 'ROUGE-L': {'mean': 0.1614964434334131, 'std': 0.0},
 'Meteor': {'mean': 0.14532582525254809, 'std': 0.0},
 'Cider': {'mean': 0.11850514268053758, 'std': 0.0},
 'Cosine Similarity': {'mean': 0.6196152599906445, 'std': 0.0},
 'Conseptual Relevance': {'mean': 0.9494767407316153, 'std': 0.0}}

In [31]:
print(dict_to_markdown(mean_std(seed_metrics_3), 'add_cluster_type_3'))

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Meteor | Cider | Cosine Similarity | Conseptual Relevance |
| ----- | ------- | ------- | ------- | ------ | ----- | ----------------- | -------------------- |
| add_cluster_type_3| 0.167658 +- 0.000000 | 0.080882 +- 0.000000 | 0.161496 +- 0.000000 | 0.145326 +- 0.000000 | 0.118505 +- 0.000000 | 0.619615 +- 0.000000 | 0.949477 +- 0.000000 |



## Ручные тесты

In [19]:
# Load the seed-42 checkpoint of every model variant and move each one to the
# GPU. All five models are kept resident at once so gen_titles can compare
# their outputs side by side.
baseline_model = My_MBart.from_pretrained(
    "models/ft-bart-headline-generation_baseline_seed42"
).to('cuda')

add_cluster_0_model = My_MBart.from_pretrained(
    "models/ft-bart-headline-generation_add_cluster_0_seed42"
).to('cuda')

add_cluster_1_model = My_MBart.from_pretrained(
    "models/ft-bart-headline-generation_add_cluster_1_seed42"
).to('cuda')

add_cluster_2_model = My_MBart.from_pretrained(
    "models/ft-bart-headline-generation_add_cluster_2_seed42"
).to('cuda')

add_cluster_3_model = My_MBart.from_pretrained(
    "models/ft-bart-headline-generation_add_cluster_3_seed42"
).to('cuda')

In [20]:
test_data

Unnamed: 0,title,text,cluster
0,«Слабее умом»: в РПЦ рассказали об отношении к...,Большинство мужчин умнее женщин — такое мнение...,19
1,Стажировка в JetBrains и как мне почти удалось...,"Как и многие молодые разработчики, когда появл...",8
2,Угроза Меркель: мигранты раскололи правительст...,В блоке партий «Христианско-демократический со...,0
3,"3 слова, досуг и свободные номера",Banjo в Google Play цена: бесплатно После успе...,19
4,Газ — по расписанию,Обострение российско-украинского конфликта ста...,6
...,...,...,...
3799,Евросоюзу показали «синий язык»,Европейский союз пожаловался в ВТО на ряд прот...,6
3800,С моцартианской легкостью по красной дорожке,Само появление имени Брука в афише NET многих ...,1
3801,Гонят из России? Алсу в мечтах об украинском т...,Популярная российская певица Алсу сделала гром...,14
3802,Одноразовый преемник,Экс-кандидат в президенты Южной Осетии Анатоли...,0


In [51]:
def gen_titles(idx):
    """Print the headline each loaded model generates for one test article.

    Relies on notebook-level state: ``test_data``, ``tokenizer``,
    ``cluster_centers`` and the five models loaded on the GPU
    (``baseline_model`` and ``add_cluster_{0..3}_model``).

    Args:
        idx: Positional row index into ``test_data``.

    Returns:
        None. The headlines are written to stdout.
    """
    row = test_data.iloc[idx]

    # NOTE(review): the evaluation cells tokenize with max_length=600, while
    # this manual check uses 1024 — confirm which one matches training.
    encoded = tokenizer(
        [row.text],
        max_length=1024,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Move the inputs to the GPU once and share them across all models.
    input_ids = encoded['input_ids'].to('cuda')
    attention_mask = encoded['attention_mask'].to('cuda')

    # (1, dim) float32 centroid of the article's cluster. Built directly from
    # the NumPy row, because torch.Tensor([ndarray]) goes through a slow
    # list-of-arrays path that PyTorch warns about.
    # Assumes the cluster id equals the row position in cluster_centers — TODO confirm.
    meta_embs = (
        torch.as_tensor(cluster_centers[row.cluster], dtype=torch.float32)
        .unsqueeze(0)
        .to('cuda')
    )

    baseline_output_ids = baseline_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128
    )[0]

    # The cluster-aware variants take the same inputs plus the centroid.
    cluster_models = (
        add_cluster_0_model,
        add_cluster_1_model,
        add_cluster_2_model,
        add_cluster_3_model,
    )
    cluster_output_ids = [
        cluster_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            meta_embs=meta_embs,
            max_length=128
        )[0]
        for cluster_model in cluster_models
    ]

    # Print only after every generation succeeded, so output is all-or-nothing.
    print("Generated headlines:")
    print("\tBaseline:", tokenizer.decode(baseline_output_ids, skip_special_tokens=True))
    for type_idx, output_ids in enumerate(cluster_output_ids):
        print(f"\tType {type_idx}:", tokenizer.decode(output_ids, skip_special_tokens=True))


In [297]:
idx = 1831
gen_titles(idx)
print("True headline:", test_data.iloc[idx].title)

Generated headlines:
	Baseline: Адоболи обманул боссов
	Type 0: «У него есть чутье»: как Адоболи обманул боссов
	Type 1: Убийца UBS попал под домашний арест
	Type 2: Убийца UBS попал под домашний арест
	Type 3: Адоболи обманул боссов
True headline: «Хаос и несчастье для себя и для всех»
