In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput
from IPython.display import display, HTML
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Load the pretrained BART-large tokenizer and conditional-generation model.
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
# Place the model weights on the device selected earlier in the notebook.
model = model.to(device)

In [4]:
# Unmasked reference passage; it is shown next to the model outputs at the end
# of the notebook.
original_text = """
Welcome to our online coding bootcamp program! 
Whether you're a complete beginner or looking to improve your programming skills, this course is designed for you. 
Throughout the course, you will learn essential coding languages such as Python and JavaScript. 
Our instructors will guide you through interactive projects and provide real-time feedback. 
Each student will receive a certificate of completion at the end of the program. 
The total cost for the bootcamp is $150, which includes all learning materials.
"""

# The same passage with one <mask> placeholder per sentence.
# The newlines are stripped so the model sees a single line of text; every line
# but the last ends with a space, which keeps the sentences separated.
input_text = """
Welcome to our online <mask> bootcamp program! 
Whether you're a complete beginner or looking to <mask> your programming skills, this course is designed for you. 
Throughout the course, you will learn essential <mask> such as Python and JavaScript. 
Our instructors will guide you through interactive projects and provide real-time <mask>. 
Each student will receive a certificate of completion at the end of the <mask>. 
The total cost for the bootcamp is $150, which <mask> all learning materials.
""".replace("\n", "")

In [5]:
# return_tensors returns the tokenizer output as pytorch tensors
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
print(input_ids.shape)

torch.Size([1, 94])


In [6]:
# Run only the encoder half of the model on the masked input.
# no_grad: this is inference, so no autograd graph is needed.
with torch.no_grad():
    encoder_outputs = model.model.encoder(input_ids=input_ids)

# Shape of the encoder's final hidden states.
print(encoder_outputs.last_hidden_state.shape)

torch.Size([1, 94, 1024])


### No Noise Added to the Encoder Output

In [8]:
# Sampling is enabled below, so seed the global RNG first; otherwise the
# generated tokens change on every Restart & Run All.
torch.manual_seed(0)

# Decode from the clean, pre-computed encoder states (no input_ids needed).
no_noise_outputs = model.generate(
    input_ids = None,
    encoder_outputs = encoder_outputs,
    max_length = 200,
    min_length = 10,
    do_sample = True,
    temperature = 0.1
)

print(no_noise_outputs[0])

tensor([    2,     0, 25194,     7,    84,   804,  8326,  9759, 21669,   586,
          328,  5994,    47,   214,    10,  1498, 37239,    50,   546,     7,
         1477,   110,  8326,  2417,     6,    42,   768,    16,  1887,    13,
           47,     4, 13231,     5,   768,     6,    47,    40,  1532,  4499,
         8326,  2417,   215,    25, 31886,     8, 18434,     4,  1541, 25508,
           40,  4704,    47,   149, 10813,  1377,     8,   694,   588,    12,
          958,     4,  4028,  1294,    40,  1325,    10, 10921,     9,  5687,
           23,     5,   253,     9,     5,     4,    20,   746,   701,    13,
            5,  9759, 21669,    16,    68,  6115,     6,    61,  1171, 12263,
            6,  2799,     6,     8,    70,  2239,  3183,     4,     2],
       device='cuda:0')


In [9]:
# Convert the first generated sequence back to text, dropping special tokens.
no_noise_text_output = tokenizer.decode(no_noise_outputs[0], skip_special_tokens=True)
print(no_noise_text_output)

Welcome to our online programming bootcamp program! Whether you're a complete beginner or looking to improve your programming skills, this course is designed for you. Throughout the course, you will learn essential programming skills such as Python and JavaScript. Our instructors will guide you through interactive projects and provide real-time. Each student will receive a certificate of completion at the end of the. The total cost for the bootcamp is $150, which includes tuition, books, and all learning materials.


### Simple AWGN addition

In [10]:
def add_wgn(latent, snr_dB):
    """Return ``latent`` corrupted by additive white Gaussian noise.

    The signal power is the mean of the squared entries taken over the whole
    tensor; the noise variance is that power divided by the linear SNR.

    Args:
        latent: Tensor to corrupt.
        snr_dB: Target signal-to-noise ratio in decibels.

    Returns:
        A new tensor shaped like ``latent`` holding ``latent`` plus the noise.
    """
    # Decibels -> linear power ratio.
    linear_snr = 10 ** (snr_dB / 10)

    # Standard deviation of the noise that yields the requested SNR.
    mean_square = torch.mean(latent ** 2)
    noise_std = torch.sqrt(mean_square / linear_snr)

    return latent + torch.randn_like(latent) * noise_std

In [16]:
# Both the injected noise and the token sampling draw from the global RNG;
# seed it so this cell gives the same result on every Restart & Run All.
torch.manual_seed(0)

target_snr = 5  # target signal-to-noise ratio, in dB
noise_encoder_output = add_wgn(encoder_outputs.last_hidden_state, target_snr)
# Wrap the noised hidden states in a model-output object for generate().
base_model_noised_encoder_output = BaseModelOutput(last_hidden_state=noise_encoder_output)

noise_outputs = model.generate(
    input_ids = None,
    encoder_outputs = base_model_noised_encoder_output,
    max_length = 200,
    min_length = 10,
    do_sample = True,
    temperature = 0.1
)

print(noise_outputs[0])

tensor([    2,     0, 25194,     7,    84,   804, 25776,  9759, 21669,   586,
          328,  5994,    47,   214,    10,  1498, 37239,    50,   546,     7,
         1477,   110,  8326,  2417,     6,    42,   768,    16,  1887,    13,
           47,     4,  1541, 25508,    40,  4704,    47,   149, 10813,  1377,
            8,   694,   588,    12,   958,     4, 13231,     5,   768,     6,
           47,    40,  1532,  4499, 25776,  2417,   215,    25, 31886,     8,
        18434,     4,  4028,  1294,    40,  1325,    10, 10921,     9,  5687,
           23,     5,   253,     9,     5,   768,     4,    20,   746,   701,
           13,     5,  9759, 21669,    16,    68,  6115,     6,    61,  1171,
            5,   701,     9,  3183,     8,    70,  2239,  3183,     4,     2],
       device='cuda:0')


In [17]:
# Convert the first sequence generated from the noised encoder states back to
# text, dropping special tokens.
noise_text_output = tokenizer.decode(noise_outputs[0], skip_special_tokens=True)
print(noise_text_output)

Welcome to our online coding bootcamp program! Whether you're a complete beginner or looking to improve your programming skills, this course is designed for you. Our instructors will guide you through interactive projects and provide real-time. Throughout the course, you will learn essential coding skills such as Python and JavaScript. Each student will receive a certificate of completion at the end of the course. The total cost for the bootcamp is $150, which includes the cost of materials and all learning materials.


### New `BartForConditionalGeneration` subclass for noising the encoder output

In [7]:
import torch
import torch.nn.functional as F

In [8]:
class NoisyBART(BartForConditionalGeneration):
    """BART for conditional generation with noise injected into the encoder output.

    ``forward`` runs the encoder, corrupts its final hidden states with the
    requested noise model, and then defers to the parent class for decoding,
    so the usual language-modelling output (logits, optional loss) is returned.
    """

    def __init__(self, config):
        # `super` must be called; `super.__init__(config)` raises a TypeError.
        super().__init__(config)

    def add_noise(self, encoder_output, noise_type='gaussian', target_snr_db=3):
        """Corrupt ``encoder_output`` so that it has roughly the target SNR.

        Args:
            encoder_output: Float tensor of encoder hidden states.
            noise_type: One of 'gaussian', 'dropout', 'laplacian' or
                'saltpepper' (case-insensitive).
            target_snr_db: Target signal-to-noise ratio in decibels.

        Returns:
            A tensor shaped like ``encoder_output`` with the noise applied.

        Raises:
            ValueError: If ``noise_type`` is not one of the supported names.
        """
        kind = noise_type.lower()

        # Signal power is the mean square over the whole tensor.
        signal_power = torch.mean(encoder_output ** 2)
        target_snr = 10 ** (target_snr_db / 10)  # dB -> linear power ratio
        noise_power = signal_power / target_snr

        if kind == 'gaussian':
            # randn_like draws from N(0, 1); rand_like would draw uniform
            # values in [0, 1), which are neither zero-mean nor unit-variance.
            noise = torch.randn_like(encoder_output) * torch.sqrt(noise_power)
            return encoder_output + noise

        if kind == 'dropout':
            # With F.dropout's 1 / (1 - p) rescaling, the expected error power
            # per element is x^2 * p / (1 - p), i.e. SNR = (1 - p) / p, which
            # gives p = 1 / (1 + SNR). Higher SNR therefore drops fewer units.
            dropout_rate = 1 / (1 + target_snr)
            # training=True: the corruption must be applied even when the
            # module is in eval mode, like every other noise type here.
            return F.dropout(encoder_output, p=dropout_rate, training=True)

        if kind == 'laplacian':
            # Laplace(0, b) has variance 2 * b^2, so b = sqrt(noise_power / 2).
            scale = torch.sqrt(noise_power / 2)
            noise = torch.distributions.Laplace(0, scale).sample(encoder_output.shape)
            # Follow the input's device/dtype rather than a notebook global.
            noise = noise.to(device=encoder_output.device, dtype=encoder_output.dtype)
            return encoder_output + noise

        if kind == 'saltpepper':
            # Each element is replaced with probability 1 / (1 + SNR) by either
            # the tensor maximum ("salt") or minimum ("pepper"), equally likely.
            corrupt = torch.rand_like(encoder_output) < (1 / (1 + target_snr))
            use_salt = torch.rand_like(encoder_output) < 0.5
            salt = torch.max(encoder_output)
            pepper = torch.min(encoder_output)
            impulses = torch.where(use_salt, salt, pepper)
            return torch.where(corrupt, impulses, encoder_output)

        raise ValueError("Unsupported Noise Type. Choose between 'gaussian', 'dropout', 'laplacian', 'saltpepper'.")

    def forward(self, input_ids=None, attention_mask=None, noise_type='gaussian', target_snr_db=3, **kwargs):
        """Encode, add noise to the encoder states, then decode via the parent class.

        Args:
            input_ids: Token ids for the encoder. May be None when
                ``encoder_outputs`` is supplied in ``kwargs``.
            attention_mask: Encoder padding mask.
            noise_type: Noise model passed to ``add_noise``.
            target_snr_db: Target SNR in decibels passed to ``add_noise``.
            **kwargs: Forwarded unchanged to
                ``BartForConditionalGeneration.forward`` (for example
                ``decoder_input_ids``, ``labels``, ``past_key_values``).

        Returns:
            Whatever ``BartForConditionalGeneration.forward`` returns.
        """
        encoder_outputs = kwargs.pop('encoder_outputs', None)

        if encoder_outputs is None:
            clean_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
            noisy_hidden = self.add_noise(clean_outputs[0], noise_type, target_snr_db)
            encoder_outputs = BaseModelOutput(last_hidden_state=noisy_hidden)
        # Otherwise the caller supplied encoder states (possibly already
        # noised, or cached by a decoding loop); use them as-is so noise is
        # never stacked or re-sampled across repeated forward calls.

        # The parent forward skips its own encoder pass when encoder_outputs is
        # given, builds decoder inputs, uses attention_mask as the
        # cross-attention mask, and applies the LM head.
        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            **kwargs,
        )

### Displaying the Text Outputs

In [18]:
# Show the reference passage and both generated passages, each under its own
# label and rendered with the same paragraph styling.
for heading, passage in (
    ('Original Text:', f" {original_text}"),
    ('Without Noise:', no_noise_text_output),
    (f'With {target_snr} dB Noise:', noise_text_output),
):
    print(heading)
    display(HTML(f"<p style='font-size:15px; font-family:\"Comic Sans MS\", cursive;'>{passage}</p>"))
    print('\n')

Original Text:




Without Noise:




With 5 dB Noise:






**Note: Masked tokens at the end of a sentence are not being filled in, whereas masked tokens in the middle of a sentence are.**