In [3]:
from transformers import AutoTokenizer, EncoderDecoderModel, TrainingArguments, Trainer
import evaluate # Using evaluate library, which is the successor to datasets.load_metric

# --- 1. Tokenizer Initialization ---
# Load the tokenizer and confirm it exposes the special tokens that the model
# configuration further down relies on (CLS as decoder start, SEP as EOS, PAD).
tokenizer_name = 'bert-base-uncased'
try:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    print("Please ensure you have a working internet connection and the tokenizer name is correct.")
    # A bare exit() reports status 0 (success) and only exists when the `site`
    # module injected it; SystemExit(1) signals the failure and keeps the cause.
    raise SystemExit(1) from e

# Verify essential token IDs are present before they are copied into the model config.
if tokenizer.cls_token_id is None:
    raise ValueError(f"Tokenizer ({tokenizer_name}) is missing a CLS token ID. Please check tokenizer configuration.")
if tokenizer.pad_token_id is None:
    # A tokenizer without a pad token would need one added explicitly, e.g.
    #   tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # followed by resizing the model embeddings if the vocabulary grew.
    # That is not done automatically here; fail loudly instead.
    raise ValueError(f"Tokenizer ({tokenizer_name}) is missing a PAD token ID.")
if tokenizer.sep_token_id is None:
    raise ValueError(f"Tokenizer ({tokenizer_name}) is missing a SEP token ID (often used as EOS).")

print(f"Tokenizer successfully loaded: {tokenizer_name}")
print(f"CLS token: {tokenizer.cls_token}, ID: {tokenizer.cls_token_id}")
print(f"PAD token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
print(f"SEP token: {tokenizer.sep_token}, ID: {tokenizer.sep_token_id}")
print(f"Special tokens map: {tokenizer.special_tokens_map}")


# --- 2. Model Initialization and Configuration ---
# Build an encoder-decoder model whose encoder and decoder both start from the
# same pretrained checkpoint.
model_name = 'bert-base-uncased'
try:
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have a working internet connection and the model names are correct.")
    # A bare exit() reports status 0 (success) and only exists when the `site`
    # module injected it; SystemExit(1) signals the failure and keeps the cause.
    raise SystemExit(1) from e

# Configure the model for generation.
# These settings are crucial for EncoderDecoderModel: they are copied from the
# tokenizer so that generation starts, stops and pads with the right token IDs.
model.config.decoder_start_token_id = tokenizer.cls_token_id  # CLS starts decoding
model.config.eos_token_id = tokenizer.sep_token_id  # SEP ends a generated sequence
model.config.pad_token_id = tokenizer.pad_token_id  # PAD fills the remainder

# The output logits come from the decoder's LM head, so the top-level vocab size
# mirrors the decoder's (identical to the encoder's here: same checkpoint).
model.config.vocab_size = model.config.decoder.vocab_size

# Optional generation defaults used by model.generate() when not overridden.
# NOTE(review): newer transformers releases prefer model.generation_config for
# these values - confirm against the installed version.
model.config.max_length = 128  # Default max length for generation
model.config.early_stopping = True
model.config.num_beams = 4
# model.config.no_repeat_ngram_size = 3  # Example: to prevent repetition

print("Model successfully loaded and configured for generation.")
print(f"Model decoder_start_token_id: {model.config.decoder_start_token_id}")
print(f"Model eos_token_id: {model.config.eos_token_id}")
print(f"Model pad_token_id: {model.config.pad_token_id}")

import datasets
# Load the Multi-News dataset through its Hub loading script
# (trust_remote_code=True allows that script to run).
# NOTE(review): data_dir "content/datset" looks like a typo for "dataset" - confirm the intended path.
dataset_multi_news = datasets.load_dataset("alexfabbri/multi_news", data_dir="content/datset",trust_remote_code=True)
def tokenize(data):
    """Tokenize documents and summaries into encoder inputs plus decoder labels.

    Args:
        data: A mapping with 'document' and 'summary' entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single example (plain strings) is handled as well.

    Returns:
        The tokenizer output for the documents (padded/truncated to 512 tokens)
        with an extra 'labels' entry holding the summary token IDs
        (padded/truncated to 128 tokens). Padding positions in 'labels' are set
        to -100 so that they are ignored by the training loss;
        compute_metrics_corrected maps -100 back to the pad token before decoding.
    """
    model_inputs = tokenizer(data['document'], padding='max_length', truncation=True, max_length=512)
    targets = tokenizer(data['summary'], padding='max_length', truncation=True, max_length=128)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replace pad tokens with the ignore index used by the loss.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets['input_ids']
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token IDs per summary.
        model_inputs['labels'] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single example: a flat list of token IDs.
        model_inputs['labels'] = _mask_padding(label_ids)
    return model_inputs
# Tokenize every split in batches; the raw 'document'/'summary' columns are kept
# alongside the new token columns (evaluate_generation reads them later).
# NOTE(review): map() runs over the full dataset although only 110 rows are used
# below; selecting first would avoid that work - confirm nothing else needs the full result.
dataset_multi_news_tokenize = dataset_multi_news.map(tokenize,batched=True)
# Keep a small slice for a quick run: first 100 training rows, first 10 validation rows.
train_dataset = dataset_multi_news_tokenize['train'].select(range(100))
validation_dataset = dataset_multi_news_tokenize['validation'].select(range(10))

# Minimal tokenization for compute_metrics (your actual preprocessing will be more complex for training)
def tokenize_for_compute_metrics(examples):
    """Tokenize only the 'document' field, returning PyTorch tensors.

    Intended for feeding model.generate(); no 'labels' are produced, so the
    result is not sufficient for training.

    Args:
        examples: A mapping with a 'document' entry.

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, as 'pt' tensors.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    documents = examples['document']
    return tokenizer(documents, **tokenizer_options)


# --- 4. Training Arguments ---
# Small settings meant for a quick smoke run on the reduced datasets; scale the
# batch sizes and epoch count up for real training.
training_args = TrainingArguments(
    # Where checkpoints go, and how often they are written.
    output_dir='./results_corrected',
    save_strategy='epoch',
    # Schedule and batch sizes.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Optimizer settings.
    learning_rate=5e-5,
    weight_decay=0.01,
    # Disable external experiment trackers.
    report_to="none",
    # evaluation_strategy='epoch',  # enable once a proper eval_dataset is passed to Trainer
    # predict_with_generate=True,   # NOTE(review): this option belongs to Seq2SeqTrainingArguments,
    #                               # not TrainingArguments - confirm before enabling
)

# --- 5. Metric Computation Function ---
# Load the ROUGE metric once at module level; compute_metrics_corrected and
# evaluate_generation below both call rouge_metric.compute(...).
rouge_metric = evaluate.load("rouge")

def compute_metrics_corrected(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Meant to be passed as ``compute_metrics`` to a Trainer whose evaluation loop
    is enabled.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be an
            array of token IDs, an array of per-token logits (3-D), or a tuple
            whose first element is one of those. ``labels`` holds token IDs with
            -100 marking positions to ignore.

    Returns:
        A dict with every ROUGE score scaled to 0-100 plus ``"gen_len"`` (mean
        number of non-pad tokens per prediction), all rounded to 4 decimals.
    """
    # numpy is not imported at the top of this file, so bring it in locally.
    import numpy as np

    predictions, labels = eval_pred

    # Models can return several outputs; the first one carries the predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # A 3-D array is assumed to be logits over the vocabulary: pick the best token.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id

    # -100 is an ignore marker, not a real token ID, so it cannot be decoded.
    # It can show up in predictions as well as in labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a list of strings for predictions and references.
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Report scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Mean generated length, counted in non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Standalone function to check generation (like your original one)
def evaluate_generation(model_to_eval, dataset_to_eval, num_examples=2):
    """Generate summaries for a few dataset rows and score them with ROUGE.

    Args:
        model_to_eval: Model exposing ``generate``, ``config``, ``device``,
            ``training``, ``eval()`` and ``train()``.
        dataset_to_eval: Dataset supporting ``len()`` and ``select()``, whose
            rows carry raw 'document' and 'summary' strings.
        num_examples: Maximum number of rows to evaluate (capped at the dataset size).

    Returns:
        A dict of ROUGE scores scaled to 0-100, or
        ``{"message": "Not enough data to compute ROUGE."}`` when nothing was generated.

    Raises:
        Exception: Whatever ``model_to_eval.generate`` raises, re-raised after
            diagnostic details are printed.
    """
    predictions_gen = []
    references_gen = []

    sample_count = min(num_examples, len(dataset_to_eval))
    # Inputs must live on the same device as the model (e.g. after GPU training).
    device = model_to_eval.device
    # Generation should run without dropout; remember the mode to restore it afterwards.
    was_training = model_to_eval.training
    model_to_eval.eval()

    print(f"\nGenerating summaries for {num_examples} examples from the dataset...")
    try:
        for i, example in enumerate(dataset_to_eval.select(range(sample_count))):
            # Tokenize the input document and move the tensors next to the model.
            inputs = tokenizer(
                example['document'],
                return_tensors='pt',
                padding='max_length',
                truncation=True,
                max_length=512,
            ).to(device)
            input_ids = inputs.input_ids

            # Generate a summary. max_length, num_beams etc. come from the model's
            # config unless overridden here. The attention mask is passed explicitly
            # because the input is padded up to max_length.
            try:
                output_ids = model_to_eval.generate(input_ids, attention_mask=inputs.attention_mask)
            except Exception as e:
                print(f"Error during model.generate: {e}")
                print(f"Input IDs shape: {input_ids.shape}")
                print(f"Model config: decoder_start_token_id={model_to_eval.config.decoder_start_token_id}, bos_token_id={model_to_eval.config.bos_token_id}")
                raise  # bare raise keeps the original traceback

            summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            predictions_gen.append(summary)
            references_gen.append(example['summary'])

            if i < 2:  # Print first few examples
                print(f"Document  : {example['document']}")
                print(f"Reference : {example['summary']}")
                print(f"Generated : {summary}\n")
    finally:
        if was_training:
            model_to_eval.train()

    # Score all generated samples together, once, after the loop.
    if predictions_gen and references_gen:
        result = rouge_metric.compute(predictions=predictions_gen, references=references_gen, use_stemmer=True)
        # Make results more readable
        result_percent = {key: value * 100 for key, value in result.items()}
        return result_percent
    else:
        return {"message": "Not enough data to compute ROUGE."}


# --- 6. Trainer Initialization ---
# Training needs datasets that carry 'input_ids', 'attention_mask', and 'labels'.
# train_dataset and validation_dataset above come from dataset_multi_news.map(tokenize, ...),
# which adds those columns while keeping the raw 'document' and 'summary' text, so they are
# not dummy data; they are only a small slice (100 training / 10 validation rows).
# The Trainer will likely complain if 'labels' are missing or not correctly formatted,
# so re-check the preprocessing before a real run.
# If you only want to test generation, you can skip trainer.train() and just use evaluate_generation.
# For now, the trainer parts below stay commented out.

print("\n--- Evaluation before fine-tuning (using standalone generation function) ---")
# evaluate_generation reads the raw 'document' and 'summary' fields of each row,
# so validation_dataset must still carry them.
initial_metrics = evaluate_generation(
    model_to_eval=model,
    dataset_to_eval=validation_dataset,
    num_examples=2,
)
print(f"Validation ROUGE before fine-tuning (on {len(validation_dataset.select(range(2)))} examples): {initial_metrics}")

# --- Training (Commented out if datasets are not fully preprocessed) ---
# print("\n--- Starting Training (requires fully preprocessed datasets) ---")
# print("NOTE: If using dummy datasets, training might fail or produce meaningless results.")
# print("Ensure 'train_dataset' and 'validation_dataset' are properly tokenized with 'input_ids', 'attention_mask', and 'labels'.")

# To run training, you would need to uncomment the following and ensure datasets are ready:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset, # Must be preprocessed with 'input_ids', 'attention_mask', 'labels'
#     eval_dataset=validation_dataset, # Must be preprocessed similarly
#     tokenizer=tokenizer,
#     # compute_metrics=compute_metrics_corrected, # Use this if evaluation_strategy is set
# )
#
# try:
#     # trainer.train() # Uncomment to run training
#     print("Training would be run here if uncommented and datasets were ready.") 
# except Exception as e:
#     print(f"Error during training: {e}")
#     print("This might be due to dataset format. Ensure 'input_ids', 'attention_mask', 'labels' are present.")


# --- Evaluation after fine-tuning (Placeholder) ---
# print("\n--- Evaluation after fine-tuning (using standalone generation function) ---")
# This would be called after trainer.train() completes
# final_metrics = evaluate_generation(model, validation_dataset, num_examples=2)
# print(f"Validation ROUGE after fine-tuning (on {len(validation_dataset.select(range(2)))} examples): {final_metrics}")

# Closing notice: one call, two lines of output.
print(
    "\nScript finished. If training was skipped, model is in its initial state.",
    "To perform actual training, ensure datasets are fully preprocessed and uncomment the trainer.train() call.",
    sep="\n",
)



Tokenizer successfully loaded: bert-base-uncased
CLS token: [CLS], ID: 101
PAD token: [PAD], ID: 0
SEP token: [SEP], ID: 102
Special tokens map: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Model successfully loaded and configured for generation.
Model decoder_start_token_id: 101
Model eos_token_id: 102
Model pad_token_id: 0


Map: 100%|██████████| 5622/5622 [00:11<00:00, 501.87 examples/s]



--- Evaluation before fine-tuning (using standalone generation function) ---

Generating summaries for 2 examples from the dataset...




Document  : Whether a sign of a good read; or a comment on the 'pulp' nature of some genres of fiction, the Oxfam second-hand book charts have remained in The Da Vinci Code author's favour for the past four years. 
 
 Dan Brown has topped Oxfam's 'most donated' list again, his fourth consecutive year. Having sold more than 80 million copies of The Da Vinci Code and had all four of his novels on the New York Times bestseller list in the same week, it's hardly surprising that Brown's hefty tomes are being donated to charity by readers keen to make some room on their shelves. 
 
 Another cult crime writer responsible to heavy-weight hardbacks, Stieg Larsson, is Oxfam's 'most sold' author for the second time in a row. Both the 'most donated' and 'most sold' lists are dominated by crime fiction, trilogies and fantasy, with JK Rowling the only female author listed in either of the Top Fives. 
 
 Click here or on "View Gallery" to see both charts in pictures ||||| A woman reads a copy of the 