In [1]:
from transformers import BertTokenizer
import datasets
import numpy as np
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

import os
os.environ["WAND_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!rm -rf ~/.cache/huggingface/datasets

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
!pip install -U datasets




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
dataset_multi_news = datasets.load_dataset("alexfabbri/multi_news", data_dir="content/datset",trust_remote_code=True)

In [5]:
print(dataset_multi_news['train'].description)
print(dataset_multi_news['train'].features)
print(dataset_multi_news['train'][0])


Multi-News, consists of news articles and human-written summaries
of these articles from the site newser.com.
Each summary is professionally written by editors and
includes links to the original articles cited.

There are two features:
  - document: text of news articles seperated by special token "|||||".
  - summary: news summary.

{'document': Value(dtype='string', id=None), 'summary': Value(dtype='string', id=None)}
{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs ad

In [6]:
print(dataset_multi_news['train'][0]['summary'])

– The unemployment rate dropped to 8.2% last month, but the economy only added 120,000 jobs, when 203,000 new jobs had been predicted, according to today's jobs report. Reaction on the Wall Street Journal's MarketBeat Blog was swift: "Woah!!! Bad number." The unemployment rate, however, is better news; it had been expected to hold steady at 8.3%. But the AP notes that the dip is mostly due to more Americans giving up on seeking employment.


In [7]:
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')

In [8]:
#tokenize the data

def tokenize(data):
  input = tokenizer(data['document'],padding='max_length',truncation=True,max_length=512)
  output = tokenizer(data['summary'],padding ='max_length', truncation=True,max_length=128)
  input['labels'] = output['input_ids']
  #return input,output
  return input


#result_in, result_out = tokenize(dataset_multi_news['train'][0])
#print(len(result_in))
#print(result_out)



In [9]:
type(dataset_multi_news)

datasets.dataset_dict.DatasetDict

In [10]:
dataset_multi_news_tokenize = dataset_multi_news.map(tokenize,batched=True)

Map: 100%|██████████| 44972/44972 [23:40<00:00, 31.67 examples/s]
Map: 100%|██████████| 5622/5622 [02:30<00:00, 37.47 examples/s]
Map: 100%|██████████| 5622/5622 [02:41<00:00, 34.76 examples/s]


In [16]:
train_dataset = dataset_multi_news_tokenize['train'].select(range(100))
validation_dataset = dataset_multi_news_tokenize['validation'].select(range(10))

In [17]:
train_dataset

Dataset({
    features: ['document', 'summary', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [18]:
from transformers import EncoderDecoderModel
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased','bert-base-uncased')
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.bos_token_id = tokenizer.cls_token_id

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [19]:
from transformers import TrainingArguments,Trainer

training_args = TrainingArguments(
    output_dir ='./results',
    #evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = 5e-5,
    per_device_train_batch_size= 4,
    per_device_eval_batch_size= 4,
    num_train_epochs =3,
    weight_decay=0.01,
    report_to="none"
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset
)

#trainer.train()

In [20]:
#Eval

import evaluate
rouge_metric = evaluate.load("rouge")

def compute_metrics(model,dataset):
    predictions = []
    references = []

    for example in dataset.select(range(10)):
        input_ids = tokenizer(example['document'],return_tensors = 'pt',padding='max_length').input_ids
        output = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(output[0],skip_special_tokens=False)
        predictions.append(summary)
        references.append(example['summary'])

        result = rouge_metric.compute(predictions=predictions,references=references)

print(f"Validation before fine tuning: {compute_metrics(model,validation_dataset)}")

trainer.train()

print(f"Validation after fine tuning: {compute_metrics(model,validation_dataset)}")

ValueError: `decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.