In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Compute device: CUDA when torch reports one, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint locations: the locally saved fine-tuned weights and the stock GPT-2 hub id.
fine_tuned_model_path = './fine-tuned-gpt2'
original_model_path = 'gpt2'

# Comparison data ('comparisons' config of openai/summarize_from_feedback).
dataset = load_dataset('openai/summarize_from_feedback', 'comparisons')

# Each model is paired with the tokenizer stored alongside its own checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)

original_model = GPT2LMHeadModel.from_pretrained(original_model_path)
original_tokenizer = GPT2Tokenizer.from_pretrained(original_model_path)

# Place both models on the chosen device. The final expression is left bare,
# so the notebook displays the baseline model's module tree as the cell output.
fine_tuned_model.to(device)
original_model.to(device)


  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
def generate_summary(model, tokenizer, text, max_length=45, max_input_tokens=512):
    """Generate a short TL;DR-style summary of ``text`` with a causal language model.

    The instruction prompt followed by ``text`` is tokenized without padding,
    truncated to ``max_input_tokens``, and continued by the model using beam
    search. Only the newly generated tokens are decoded, so the returned string
    does not repeat the prompt.

    Args:
        model: Causal LM exposing ``generate`` and ``device``
            (e.g. ``GPT2LMHeadModel``).
        tokenizer: Tokenizer matching ``model``. It is called on the prompt and
            must provide ``decode`` and ``eos_token_id``.
        text: The document to summarize.
        max_length: Maximum number of tokens to generate for the summary.
        max_input_tokens: Maximum number of prompt tokens fed to the model;
            longer prompts are truncated by the tokenizer.
            ``max_input_tokens + max_length`` must fit in the model's context
            window (1024 positions for GPT-2).

    Returns:
        str: The generated continuation with special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = "summarize this following text in at most 30 words in the form a TLDR: ###\n" + text

    # A single sequence needs no padding, and GPT-2's tokenizer ships without
    # a pad token, so asking for padding would raise.
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_tokens,
    )

    # Follow the model's own device instead of depending on a notebook global.
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    # max_new_tokens budgets the summary independently of the prompt length;
    # a total-length cap equal to the prompt length leaves no room to generate.
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_length,
        num_beams=4,
        early_stopping=True,
        # GPT-2 has no pad token; reuse EOS so finished beams can be padded.
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output starts with the prompt tokens; keep only the continuation.
    prompt_token_count = input_ids.shape[1]
    generated_ids = output_ids[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Materialize the validation split of openai/summarize_from_feedback as a list
# of records, then pull out the post body that each record refers to.
val_data = list(dataset['validation'])
val_text = [record['info']['post'] for record in val_data]

# Summarize every validation post twice: once with the fine-tuned model and
# once with the stock GPT-2 baseline, each using its own tokenizer.
fine_tuned_summaries = [
    generate_summary(fine_tuned_model, tokenizer, post)
    for post in val_text
]
original_summaries = [
    generate_summary(original_model, original_tokenizer, post)
    for post in val_text
]



In [None]:
from rouge_score import rouge_scorer
import bert_score

def evaluate_summaries(reference_summaries, generated_summaries):
    """Score generated summaries against references with ROUGE and BERTScore.

    Args:
        reference_summaries: Reference texts, aligned by position with
            ``generated_summaries``.
        generated_summaries: Model outputs to be scored.

    Returns:
        tuple: ``(rouge_scores, P, R, F1)`` where ``rouge_scores`` is a list
        with one ``RougeScorer.score`` result (rouge1, rouge2, rougeL) per
        reference/generated pair, and ``P``, ``R``, ``F1`` are the values
        returned by ``bert_score.score`` for the whole batch.
    """
    # ROUGE: one scorer reused for every (reference, generated) pair.
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    rouge_scores = []
    for target, prediction in zip(reference_summaries, generated_summaries):
        rouge_scores.append(rouge.score(target, prediction))

    # BERTScore: candidates first, references second, rescaled with the baseline.
    precision, recall, f1 = bert_score.score(
        generated_summaries,
        reference_summaries,
        lang="en",
        rescale_with_baseline=True,
    )

    return rouge_scores, precision, recall, f1

# Build the reference set: for every validation comparison keep the text of the
# summary selected by `choice` (index 1 when choice == 1, otherwise index 0).
reference_summaries = [
    sample['summaries'][1 if sample['choice'] == 1 else 0]['text']
    for sample in val_data
]
# Singular alias kept so any cell that still uses the old name keeps working.
reference_summary = reference_summaries

# Pairs are matched by position, so all three lists must be the same length;
# a mismatch would otherwise be silently truncated by zip() during ROUGE scoring.
assert len(reference_summaries) == len(fine_tuned_summaries) == len(original_summaries), \
    "reference and generated summary lists must be aligned one-to-one"

# Evaluate fine-tuned model summaries
fine_tuned_rouge_scores, fine_tuned_P, fine_tuned_R, fine_tuned_F1 = evaluate_summaries(reference_summaries, fine_tuned_summaries)

# Evaluate original model summaries
original_rouge_scores, original_P, original_R, original_F1 = evaluate_summaries(reference_summaries, original_summaries)

# ROUGE results, one line per validation text: fine-tuned model first, then baseline.
print("Fine-Tuned Model ROUGE Scores:")
for position, rouge_result in enumerate(fine_tuned_rouge_scores, start=1):
    print(f"Text {position}:", rouge_result)

print("\nOriginal Model ROUGE Scores:")
for position, rouge_result in enumerate(original_rouge_scores, start=1):
    print(f"Text {position}:", rouge_result)

# BERTScore results, one line per generated summary, in the same model order.
print("\nFine-Tuned Model BERTScores:")
for position, p, r, f1 in zip(
    range(1, len(fine_tuned_summaries) + 1), fine_tuned_P, fine_tuned_R, fine_tuned_F1
):
    print(f"Text {position}: Precision={p.item():.4f}, Recall={r.item():.4f}, F1={f1.item():.4f}")

print("\nOriginal Model BERTScores:")
for position, p, r, f1 in zip(
    range(1, len(original_summaries) + 1), original_P, original_R, original_F1
):
    print(f"Text {position}: Precision={p.item():.4f}, Recall={r.item():.4f}, F1={f1.item():.4f}")
