In [1]:
import html

# for first LLM
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput

# for second
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

from IPython.display import display, HTML
import torch
from tqdm.notebook import tqdm

# Run on CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint used for both the tokenizer and the model.
model_name = "facebook/bart-base"
# model_name = "facebook/bart-large" # Recommend this one if your computer is okay with larger models

# Load the tokenizer and the seq2seq model from the same checkpoint,
# then move the model's weights onto the selected device.
tokenizer_bart = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

In [2]:
def add_noise_with_snr(encoder_output, target_snr_db):
    """
    Corrupt an encoder output with random normal noise scaled to a requested SNR.

    The signal power is the mean of the squared entries taken over the whole
    tensor; the noise is scaled so that signal_power / noise_power equals the
    target SNR (converted from dB to a linear ratio).

    Parameters:
    - encoder_output: torch.Tensor, the encoder's output (last_hidden_state).
    - target_snr_db: float, the desired signal-to-noise ratio in dB.

    Returns:
    - torch.Tensor with the same shape as encoder_output: the input plus noise.
    """
    # dB -> linear power ratio
    snr_ratio = 10 ** (target_snr_db / 10)

    # Average power of the clean signal (a single scalar for the whole tensor)
    mean_square = encoder_output.pow(2).mean()

    # Standard deviation that gives the noise the required power
    noise_std = (mean_square / snr_ratio).sqrt()

    return encoder_output + noise_std * torch.randn_like(encoder_output)

In [5]:
# original_text = """
# Beginners BBQ Class Taking Place in Missoula! 
# Do you want to get better at making delicious BBQ?
# You will have the opportunity, put this on your calendar now. 
# Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. 
# He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. 
# He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. 
# The cost to be in the class is $35 per person, and for spectators it is free. 
# Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
# """
# input_text = """
# Beginners BBQ Class <mask> in Missoula! 
# Do you want to <mask> making delicious BBQ?
# You will have the opportunity, put this on your calendar now. 
# Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. 
# He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. 
# He will teach you <mask> compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. 
# The <mask> the class is $35 per person, and for spectators it is free. 
# Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
# """.replace("\n", "")

# --- Inputs -------------------------------------------------------------------
# Reference passage without masks; it is only displayed for comparison with the
# model's completion and is never fed to the model.
original_text = """
Welcome to our online coding bootcamp program! 
Whether you're a complete beginner or looking to improve your programming skills, this course is designed for you. 
Throughout the course, you will learn essential coding languages such as Python and JavaScript. 
Our instructors will guide you through interactive projects and provide real-time feedback. 
Each student will receive a certificate of completion at the end of the program. 
The total cost for the bootcamp is $150, which includes all learning materials.
"""
# The same passage with several spans replaced by <mask>. Newlines are stripped
# so the tokenizer receives one continuous paragraph.
input_text = """
Welcome to our online <mask> bootcamp program! 
Whether you're a complete beginner or looking to <mask> your programming skills, this course is designed for you. 
Throughout the course, you will learn essential <mask> such as Python and JavaScript. 
Our instructors will guide you through interactive projects and provide real-time <mask>. 
Each student will receive a certificate of completion at the end of the <mask>. 
The total cost for the bootcamp is $150, which <mask> all learning materials.
""".replace("\n", "")

# --- Settings -----------------------------------------------------------------
# Generation below samples (do_sample=True); seeding makes a fresh
# "Restart & Run All" reproduce the same completion.
GENERATION_SEED = 42
# Set to a float (e.g. -6) to add noise to the encoder output (latent
# representation) before decoding; None decodes from the clean encoder output.
NOISE_SNR_DB = None
# Inline CSS shared by every paragraph rendered in this cell.
PARAGRAPH_STYLE = "font-size:15px; font-family:\"Comic Sans MS\", cursive;"


def show_paragraph(label, inner_html):
    """Print `label`, render `inner_html` inside a styled <p>, then print a spacer.

    `inner_html` is inserted verbatim, so callers must escape any plain text
    (see html.escape) before passing it in.
    """
    print(label)
    display(HTML(f"<p style='{PARAGRAPH_STYLE}'>{inner_html}</p>"))
    print('\n')


# --- Encode -------------------------------------------------------------------
torch.manual_seed(GENERATION_SEED)

input_ids = tokenizer_bart(input_text, return_tensors="pt").input_ids.to(device)

# Run only the encoder so its output can be inspected or perturbed before decoding.
with torch.no_grad():
    encoder_outputs = model.model.encoder(input_ids=input_ids)

if NOISE_SNR_DB is not None:
    noisy_hidden_state = add_noise_with_snr(encoder_outputs.last_hidden_state, NOISE_SNR_DB)
    encoder_outputs = BaseModelOutput(last_hidden_state=noisy_hidden_state)

# --- Decode -------------------------------------------------------------------
baseline_outputs = model.generate(
    input_ids=None,                   # No input tokens are provided here, as we're feeding encoder outputs directly
    encoder_outputs=encoder_outputs,  # Encoded representations from the encoder
    max_length=300,                   # Set maximum length for the generated text sequence
    min_length=10,                    # Set minimum length for the generated text sequence
    do_sample=True,                   # Enables sampling for diverse outputs, rather than greedy decoding
    num_beams=15,                     # Number of beams kept during the search
    temperature=0.15,                 # Low temperature to control randomness, resulting in less varied output
    early_stopping=True
)

# Decode the generated token ids back into text using the tokenizer
baseline_text = tokenizer_bart.decode(baseline_outputs[0], skip_special_tokens=True)

# --- Display ------------------------------------------------------------------
# Every piece of text is HTML-escaped before being embedded in markup; this
# matters most for the generated text, whose content is not under our control.
show_paragraph('Original Text:', html.escape(original_text, quote=False))

# Highlight the <mask> tokens in the masked input text. Escaping first turns
# "<mask>" into "&lt;mask&gt;", which is then wrapped in a coloured span.
# (The variable keeps its historical name even though it holds the masked text.)
escaped_input_text = html.escape(input_text, quote=False)
highlighted_original_text = escaped_input_text.replace(
    "&lt;mask&gt;",
    "<span style='color:blue; font-weight:bold;'>&lt;mask&gt;</span>",
)
show_paragraph('Masked Text:', highlighted_original_text)

show_paragraph('Completed Text:', html.escape(baseline_text, quote=False))

Original Text:




Masked Text:




Completed Text:






## A good set of params
```python
baseline_outputs = model.generate(
    input_ids=None,                   # No input tokens are provided here, as we're feeding encoder outputs directly
    encoder_outputs=encoder_outputs,  # Encoded representations from the encoder
    max_length=300,                   # Set maximum length for the generated text sequence
    min_length=100,                    # Set minimum length for the generated text sequence
    do_sample=True,                   # Enables sampling for diverse outputs, rather than greedy decoding
    num_beams=15,
    temperature=0.15,                 # temperature=1.5 also gave good results
    early_stopping=True
)
```

10_samples.ipynb:
model.generate(input_ids=None, encoder_outputs=modified_encoder_outputs, max_length=300, min_length=100, 
                                      num_beams=5, early_stopping=True)

Greedy Search:
output = model.generate(
    input_ids=input_ids,
    max_length=50,
    num_beams=1,  # Greedy Search uses only one beam
    do_sample=False  # Turn off sampling
)

Beam Search:
output = model.generate(
    input_ids=input_ids,
    max_length=50,
    num_beams=5,  # Set the number of beams
    do_sample=False  # Disable sampling for deterministic Beam Search
)

In [None]:
# To do: Does greedy search (and similar decoding strategies) suffer from incomplete generations at high temperature?
# BART-large