In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
from IPython.display import display, HTML
import torch
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from utils import *

In [2]:
# Automatically choose (prefer NVIDIA GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify model name
model_name = "facebook/bart-base"
tokenizer_bart = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Specify model name
summarizer_name = "Falconsai/text_summarization"
tokenizer_sum = T5Tokenizer.from_pretrained(summarizer_name)
summarizer = T5ForConditionalGeneration.from_pretrained(summarizer_name).to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
def add_noise_with_snr_new(encoder_output, noise_type='gaussian', target_snr_db=3, dropout_rate=0.4, sp_thresh=0.4):
    """
    Add noise to the encoder output based on a target SNR in dB.
    
    Parameters:
    - encoder_output: torch.Tensor, the encoder's output (last_hidden_state).
    - noise_type: string, determines what kind of noise is added.
    - target_snr_db: float, the desired signal-to-noise ratio in dB for awgn and dropout.
    - dropout_rate: float, range: [0,1], default rate for dropout noise (not used here).
    - sp_thresh: float, range: [0,1], determines the threshold for salt-and-pepper noise.
    
    Returns:
    - noisy_encoder_output: torch.Tensor, encoder output with added noise.
    """
    signal_power = torch.mean(encoder_output ** 2)
    target_snr = 10 ** (target_snr_db / 10)
    noise_power = signal_power / target_snr

    if noise_type.lower() == 'gaussian':
        # Generate Gaussian noise
        noise = torch.randn_like(encoder_output) * torch.sqrt(noise_power)
        return encoder_output + noise

    elif noise_type.lower() == 'dropout':
        # Compute dropout probability p based on SNR
        signal_power = torch.mean(encoder_output ** 2)
        target_snr_linear = 10 ** (target_snr_db / 10)
        noise_power = signal_power / target_snr_linear
        p = 1 / target_snr_linear

        # Create a mask with elements set to zero with probability p
        random_tensor = torch.rand_like(encoder_output)
        mask = random_tensor >= p  # Retain elements with probability (1 - p)

        # Apply the mask to the encoder output without scaling
        noisy_encoder_output = encoder_output * mask.float()
        return noisy_encoder_output

    elif noise_type.lower() == 'saltpepper':
        mask = torch.rand_like(encoder_output) < sp_thresh  # The greater the sp_thresh, more noise is added
        salt = torch.max(encoder_output)
        pepper = torch.min(encoder_output)
        noise = torch.where(torch.rand_like(encoder_output) < 0.5, salt, pepper)
        noised_enc_output = torch.where(mask, noise, encoder_output)
        return noised_enc_output

    else:
        raise ValueError("Unsupported Noise Type. Choose between 'gaussian', 'dropout', 'saltpepper'.")


In [4]:
original_text = """
Beginners BBQ Class Taking Place in Missoula! 
Do you want to get better at making delicious BBQ?
You will have the opportunity, put this on your calendar now. 
Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. 
He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. 
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. 
The cost to be in the class is $35 per person, and for spectators it is free. 
Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
""".replace("\n", "")

In [5]:
input_text = masking(original_text, 0.05)
input_text

'Beginners BBQ Class Taking Place in Missoula! Do you want to get better at making delicious BBQ?You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will <mask> teaching a beginner level class for everyone who wants to get better with their culinary <mask> He will teach you everything you need <mask> know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and <mask> plus smoker and fire information. The <mask> to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each <mask> that is prepared.'

In [13]:
input_ids = tokenizer_bart(input_text, return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    encoder_outputs = model.model.encoder(input_ids=input_ids)

target_snr = 50
# add noise
noisy_encoder_output = add_noise_with_snr_new(
    encoder_output = encoder_outputs.last_hidden_state,
    noise_type = "gaussian",
    target_snr_db = target_snr,
    dropout_rate = 0,
    sp_thresh = 0
)
modified_encoder_outputs = BaseModelOutput(last_hidden_state=noisy_encoder_output)
print(noisy_encoder_output)

# first LLM
noisy_outputs = model.generate(input_ids=None, encoder_outputs=modified_encoder_outputs, max_length=300, min_length=100, 
                            num_beams=15, do_sample=True, temperature=1.5, early_stopping=True)
noisy_text = tokenizer_bart.decode(noisy_outputs[0], skip_special_tokens=True)
print(noisy_text)

# second LLM
noisy_ids = tokenizer_sum('Summarize:' + noisy_text, return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    noisy_encoder_outputs = summarizer.encoder(input_ids=noisy_ids)
noisy_sum_output = summarizer.generate(input_ids=None, encoder_outputs=noisy_encoder_outputs, max_length=70, output_hidden_states=True, 
                                       num_beams=15, return_dict_in_generate=True, do_sample=True, temperature=0.1)
noisy_summary = tokenizer_sum.decode(noisy_sum_output.sequences[0], skip_special_tokens=True)
print(noisy_summary)

tensor([[[-3.3971e-02,  7.4056e-03, -2.3095e-03,  ...,  1.1116e-02,
          -3.7404e-04, -8.5138e-03],
         [ 9.9033e-02,  7.6711e-02, -4.8507e-03,  ..., -4.1736e-01,
          -4.1664e-01,  1.8059e-01],
         [-1.1457e-01,  9.9002e-02, -2.5445e-01,  ..., -3.9337e-01,
           9.2368e-03, -4.2570e-01],
         ...,
         [-2.4610e-01,  1.4487e-01, -3.4231e-02,  ..., -2.2883e-01,
          -5.8550e-03,  1.7179e-02],
         [ 2.2376e-01,  8.4899e-03,  2.1352e-01,  ...,  1.9566e-02,
          -3.0446e-01,  5.8538e-02],
         [ 1.8972e-01, -2.7188e-02,  4.6234e-01,  ...,  1.6946e-02,
          -3.5427e-01,  2.8271e-01]]], device='cuda:0')
Beginners BBQ Class Taking Place in Missoula! Do you want to get better at making delicious BBQ?You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get good with 

In [7]:
noisy_encoder_output.shape


torch.Size([1, 159, 768])

In [9]:
# rand_encoder_output = 2 * torch.rand([1, 100, 1024]) - 1
# rand_encoder_output = torch.ones([1, 100, 1024])
# rand_encoder_output = - torch.ones([1, 100, 1024])
rand_encoder_output = torch.zeros([1, 100, 1024])
rand_encoder_output = rand_encoder_output.cuda()
modified_encoder_outputs = BaseModelOutput(last_hidden_state=rand_encoder_output)
print(rand_encoder_output)

# first LLM
noisy_outputs = model.generate(input_ids=None, encoder_outputs=modified_encoder_outputs, max_length=300, min_length=100, 
                            num_beams=15, do_sample=True, temperature=1.5, early_stopping=True)
noisy_text = tokenizer_bart.decode(noisy_outputs[0], skip_special_tokens=True)
print(noisy_text)

# second LLM
noisy_ids = tokenizer_sum(noisy_text, return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    noisy_encoder_outputs = summarizer.encoder(input_ids=noisy_ids)
noisy_sum_output = summarizer.generate(input_ids=None, encoder_outputs=noisy_encoder_outputs, max_length=70, output_hidden_states=True,
                                    return_dict_in_generate=True, do_sample=True, temperature=0.1)
noisy_summary = tokenizer_sum.decode(noisy_sum_output.sequences[0], skip_special_tokens=True)
print(noisy_summary)

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')
The second is that there is no such thing as a free lunch. The third is that it is extremely expensive.Finally, there is the fourth. The fourth is that I don't believe in free lunches. I believe in a healthy diet.I'm not a fan of the term "free lunch." I think it's a misnomer. It's more like "free" than "free."The fifth is the fact that I'm not sure what to call it. I just know it's not free.The sixth is that my children are growing up. I'm getting older. I can't help it.I've been doing this for years. Now, I've got to go.The seventh is that we're going to be able to talk.I'll be honest. I didn't want to. I was afraid. I wanted to.But I did.And now.It's time.Here's the thing. We've been here before.In fact, we'v