In [2]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel, BertTokenizer
from datasets import Dataset
import pandas as pd
import numpy as np
from rouge import Rouge

In [3]:
# loading the headline reconstruction model
model = EncoderDecoderModel.from_pretrained('../saved-models/bert2bert/headline-reconstruction')

In [4]:
data = pd.read_csv('../data/export-articles-de-2021-07-15T16:18:00.716875+00:00')
data = data[['HEADLINE', 'CONTENT']]

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

In [14]:
# use model to reconstruct the headlines
original = data['HEADLINE'].head(10).tolist()

original_tokens = tokenizer(original, padding='max_length', truncation=True, max_length=256, add_special_tokens=True, return_tensors="pt")

reconstructed_tokens = model.generate(original_tokens.input_ids, attention_mask=original_tokens.attention_mask, max_length=256)
reconstructed = tokenizer.batch_decode(reconstructed_tokens, skip_special_tokens=True)

In [15]:
# evaluate the rouge scores of headline reconstructions
rouge = Rouge()
rouge.get_scores(original, reconstructed, avg=True)

{'rouge-1': {'r': 0.48312168902648783,
  'p': 0.8517016317016317,
  'f': 0.6133223463745406},
 'rouge-2': {'r': 0.21344056733701916,
  'p': 0.7941666666666667,
  'f': 0.33519357700307983},
 'rouge-l': {'r': 0.48312168902648783,
  'p': 0.8517016317016317,
  'f': 0.6133223463745406}}

In [11]:
# evaluate the same model for application on content reconstruction
# model was trained on headline reconstruction
# using content of the articles, which is cut after 256 tokens
original = data['CONTENT'].head(10).tolist()

original_tokens = tokenizer(original, padding='max_length', truncation=True, max_length=256, add_special_tokens=True, return_tensors="pt")

reconstructed_tokens = model.generate(original_tokens.input_ids, attention_mask=original_tokens.attention_mask, max_length=256)
reconstructed = tokenizer.batch_decode(reconstructed_tokens, skip_special_tokens=True)

rouge.get_scores(original, reconstructed, avg=True)

NameError: name 'rouge' is not defined

In [16]:
# loading the content reconstruction model
model = EncoderDecoderModel.from_pretrained('../saved-models/bert2bert/content-reconstruction/v2')
original = data['CONTENT'].head(10).tolist()

original_tokens = tokenizer(original, padding='max_length', truncation=True, max_length=512, add_special_tokens=True, return_tensors="pt")

reconstructed_tokens = model.generate(original_tokens.input_ids, attention_mask=original_tokens.attention_mask)
reconstructed = tokenizer.batch_decode(reconstructed_tokens, skip_special_tokens=True)

rouge.get_scores(original, reconstructed, avg=True)

{'rouge-1': {'r': 0.8656564231223868,
  'p': 0.8808085940088899,
  'f': 0.8683466386017885},
 'rouge-2': {'r': 0.7098209878434618,
  'p': 0.8402234341968073,
  'f': 0.7524495306424844},
 'rouge-l': {'r': 0.8656564231223868,
  'p': 0.8808085940088899,
  'f': 0.8683466386017885}}

In [20]:
reconstructed_tokens.size()

torch.Size([10, 512])