## 1. Import Libraries

In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput
from IPython.display import display, HTML
import torch


## 2. Choose Device (GPU / CPU)

tensor([1.], device='mps:0')


In [39]:
# Automatically choose (prefer NVIDIA GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # OR choose device manually, be sure to comment other codes relevant to `device`
# device = torch.device("cuda")
# device = torch.device("cpu")


## 3. Load BART model and tokenizer

In [3]:
# Specify model name
# model_name = "facebook/bart-base"
model_name = "facebook/bart-large" # Recommend this one if your computer is okay with larger models

tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)


## 4. A function adding AWGN noise to latent representation

In [4]:
def add_noise_with_snr(encoder_output, target_snr_db):
    """
    Add noise to the encoder output based on a target SNR in dB.
    
    Parameters:
    - encoder_output: torch.Tensor, the encoder's output (last_hidden_state).
    - target_snr_db: float, the desired signal-to-noise ratio in dB.
    
    Returns:
    - noisy_encoder_output: torch.Tensor, encoder output with added noise.
    """
    # Convert SNR from dB to linear scale
    target_snr_linear = 10 ** (target_snr_db / 10)
    
    # Calculate power of the signal
    signal_power = torch.mean(encoder_output ** 2)
    
    # Calculate required noise power for the target SNR
    noise_power = signal_power / target_snr_linear
    noise = torch.randn_like(encoder_output) * torch.sqrt(noise_power)
    
    # Add noise to the encoder output
    noisy_encoder_output = encoder_output + noise
    return noisy_encoder_output


## 5. Three example encoder inputs for `fill in the blank` task

`original_text` contains the complete text <br>
`input_text` contains the masked text

In [5]:
original_text = """
Beginners BBQ Class Taking Place in Missoula! 
Do you want to get better at making delicious BBQ?
You will have the opportunity, put this on your calendar now. 
Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. 
He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. 
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. 
The cost to be in the class is $35 per person, and for spectators it is free. 
Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
"""
input_text = """
Beginners BBQ Class <mask> in Missoula! 
Do you want to <mask> making delicious BBQ?
You will have the opportunity, put this on your calendar now. 
Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. 
He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. 
He will teach you <mask> compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. 
The <mask> the class is $35 per person, and for spectators it is free. 
Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
""".replace("\n", "")

# original_text = """
# This November, embark on an exciting hiking adventure! 
# Explore the scenic mountain trails with an experienced guide, who will show you the best routes and hidden viewpoints. 
# This journey is suitable for all levels, from beginners to advanced hikers. 
# The hike covers approximately 10 miles and includes multiple rest stops with breathtaking views. 
# Participants should bring water, snacks, and comfortable hiking shoes. 
# The cost of the trip is $60, which includes a map and a group photo.
# """
# input_text = """
# This November, embark on an exciting <mask> adventure! 
# Explore the scenic mountain trails with an experienced guide, who will show you the best routes and hidden <mask>. 
# This journey is suitable for all levels, from beginners to advanced <mask>. 
# The hike covers approximately 10 miles and includes multiple rest stops with breathtaking <mask>. 
# Participants should bring water, snacks, and comfortable hiking shoes. 
# The <mask> is $60, which includes a map and a group photo.
# """.replace("\n", "")

# original_text = """
# Welcome to our online coding bootcamp program! 
# Whether you're a complete beginner or looking to improve your programming skills, this course is designed for you. 
# Throughout the course, you will learn essential coding languages such as Python and JavaScript. 
# Our instructors will guide you through interactive projects and provide real-time feedback. 
# Each student will receive a certificate of completion at the end of the program. 
# The total cost for the bootcamp is $150, which includes all learning materials.
# """
# input_text = """
# Welcome to our online <mask> bootcamp program! 
# Whether you're a complete beginner or looking to <mask> your programming skills, this course is designed for you. 
# Throughout the course, you will learn essential <mask> such as Python and JavaScript. 
# Our instructors will guide you through interactive projects and provide real-time <mask>. 
# Each student will receive a certificate of completion at the end of the <mask>. 
# The total cost for the bootcamp is $150, which <mask> all learning materials.
# """.replace("\n", "")

## 6. Pass the `input_text` through the LLM

### 6.1 Tokenize the `input_text` to tokens (integer numbers)

In [18]:
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)


### 6.2 Get encoder output

In [8]:
with torch.no_grad():
    encoder_outputs = model.model.encoder(input_ids=input_ids)

### 6.3.1 Case 1: clean latent reprenstation (without noise)

In [9]:
# # Generate output with the clean encoder output (latent reprenstation)
baseline_outputs = model.generate(
    input_ids=None,                   # No input tokens are provided here, as we're feeding encoder outputs directly
    encoder_outputs=encoder_outputs,  # Encoded representations from the encoder
    max_length=200,                   # Set maximum length for the generated text sequence
    min_length=10,                    # Set minimum length for the generated text sequence
    do_sample=True,                   # Enables sampling for diverse outputs, rather than greedy decoding
    temperature=0.1                   # Low temperature to control randomness, resulting in less varied output
)


# Decode the decoder output using tokenizer
baseline_text = tokenizer.decode(baseline_outputs[0], skip_special_tokens=True)


### 6.3.2 Case 2: noisy latent reprenstation

In [10]:
# Add noise with a target SNR and generate noisy output (latent reprenstation)
target_snr = 3  # Set target SNR
noisy_encoder_output = add_noise_with_snr(encoder_outputs.last_hidden_state, target_snr) # Add noise
modified_encoder_outputs = BaseModelOutput(last_hidden_state=noisy_encoder_output) # There are slight differences between `noisy_encoder_output` and `modified_encoder_outputs`, print them for more info

# Generate output with the noisy encoder output
noisy_outputs = model.generate(
    input_ids=None,                            # No input tokens are provided here, as we're feeding encoder outputs directly
    encoder_outputs=modified_encoder_outputs,  # Encoded representations from the encoder
    max_length=200,                            # Set maximum length for the generated text sequence
    min_length=10,                             # Set minimum length for the generated text sequence
    do_sample=True,                            # Enables sampling for diverse outputs, rather than greedy decoding
    temperature=0.1                            # Low temperature to control randomness, resulting in less varied output
)
# Decode the decoder output using tokenizer
noisy_text = tokenizer.decode(noisy_outputs[0], skip_special_tokens=True)


## 7. Display original texts and both outputs 

In [29]:
print('Original Text:')
display(HTML(f"<p style='font-size:15px; font-family:\"Comic Sans MS\", cursive;'> {original_text}</p>"))
print('\n')

print('Without Noise:')
display(HTML(f"<p style='font-size:15px; font-family:\"Comic Sans MS\", cursive;'>{baseline_text}</p>"))
print('\n')


print(f'With Noise (SNR = {target_snr} dB):')
display(HTML(f"<p style='font-size:15px; font-family:\"Comic Sans MS\", cursive;'>{noisy_text}</p>"))
print('\n')

Original Text:




Without Noise:




With Noise (SNR = 3 dB):






In [14]:
print(baseline_text)

Beginners BBQ Class at Lonestar Smoke Rangers in Missoula!Do you love BBQ? Do you want to learn more about making delicious BBQ?You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ


## You can also display some of the variables you find interesting

In [15]:
# `input_text` after tokenization
print(input_ids)

# its shape
print(input_ids.shape)

tensor([[    0, 48290,  7130, 22658,  4210, 50264,    11,  4523,  5156,   102,
           328,  1832,    47,   236,     7, 50264,   442, 10964, 22658,   116,
          1185,    40,    33,     5,   945,     6,   342,    42,    15,   110,
          7127,   122,     4,   296,     6,   772,   820,  1187,  1962,   623,
          4210, 22658, 10078,     6,  3621,  4317,   857,    31,   226, 28180,
           271, 21389,  5706,     4,    91,    40,    28,  5307,    10, 37239,
           672,  1380,    13,   961,    54,  1072,     7,   120,   357,    19,
            49, 16820,  2417,     4,    91,    40,  6396,    47, 50264,  3511,
            11,    10,   229,  8949, 22658,  1465,     6,   217,  7373,     6,
         13204,     6, 31583,     6,  4884,  4230,     8, 10723,  7059,     6,
          2704, 40345,     8,   668,   335,     4,    20, 50264,     5,  1380,
            16,    68,  2022,   228,   621,     6,     8,    13, 17596,    24,
            16,   481,     4, 30411,    11,     5,  

In [12]:
# latent representation (context vectors)
print(encoder_outputs)

# its shape
print(encoder_outputs.last_hidden_state.shape)

BaseModelOutput(last_hidden_state=tensor([[[ 1.7068e-03,  3.7224e-02,  2.8920e-02,  ...,  3.5600e-03,
           7.9942e-03, -3.4197e-04],
         [ 1.6971e-01,  1.2020e-01,  3.8075e-02,  ..., -1.9527e-01,
          -5.2274e-02,  2.9820e-02],
         [ 5.0318e-02,  6.8451e-02,  5.2653e-01,  ...,  2.0947e-02,
          -2.6133e-01,  1.0611e-01],
         ...,
         [-3.0117e-02, -5.0357e-01, -2.2870e-01,  ..., -1.3845e-01,
          -9.2816e-02,  6.2812e-02],
         [ 7.6866e-04,  1.6385e-02,  6.8835e-03,  ...,  4.4163e-03,
          -5.5813e-03, -1.0422e-03],
         [ 1.1516e-01,  2.9140e-01,  6.6931e-02,  ..., -3.7684e-02,
          -8.0240e-02,  4.4079e-02]],

        [[ 1.7068e-03,  3.7224e-02,  2.8920e-02,  ...,  3.5600e-03,
           7.9942e-03, -3.4197e-04],
         [ 1.6971e-01,  1.2020e-01,  3.8075e-02,  ..., -1.9527e-01,
          -5.2274e-02,  2.9820e-02],
         [ 5.0318e-02,  6.8451e-02,  5.2653e-01,  ...,  2.0947e-02,
          -2.6133e-01,  1.0611e-01],
     

In [14]:
# The output of decoder before tokenization
print(baseline_outputs)

# its shape
print(baseline_outputs.shape)

tensor([[    2,     0, 48290,  7130, 22658,  4210,    23,   226, 28180,   271,
         21389,  5706,    11,  4523,  5156,   102,   328,  8275,    47,   657,
         22658,   116,  1832,    47,   236,     7,  1532,    55,    59,   442,
         10964, 22658,   116,  1185,    40,    33,     5,   945,     6,   342,
            42,    15,   110,  7127,   122,     4,   296,     6,   772,   820,
          1187,  1962,   623,  4210, 22658,     2]])
torch.Size([1, 56])


## How `tokenizer` works
Since we will be likely use the `Falconsai/text_summarization` model as our text summarizer. It is better to explore the tokenizer for T5 model

In [14]:
# load tokenizer for T5
from transformers import AutoTokenizer
import torch

tokenizer_Fal = AutoTokenizer.from_pretrained("Falconsai/text_summarization")

In [15]:
text = "I am a Master's student in Information and Networking Engineering at KTH."

# Tokenize the above text
tokenized_text = tokenizer_Fal(text, return_tensors="pt")

# By decoding the tokenized text you reconstruct the original text
decoded_text = tokenizer_Fal.decode(tokenized_text['input_ids'][0], skip_special_tokens=True)

In [16]:
print(text)
print(tokenized_text['input_ids'][0])
print(decoded_text)

I am a Master's student in Information and Networking Engineering at KTH.
tensor([  27,  183,    3,    9, 3226,   31,    7, 1236,   16, 2784,   11, 3426,
          53, 5623,   44,  480, 4611,    5,    1])
I am a Master's student in Information and Networking Engineering at KTH.


In [17]:
# Changing the values of tokenized_text with change the decoded text
tokenized_text['input_ids'][0][0] = 6
tokenized_text['input_ids'][0][1] = 66
tokenized_text['input_ids'][0][-1] = 666
decoded_text_1 = tokenizer_Fal.decode(tokenized_text['input_ids'][0], skip_special_tokens=True)
print(decoded_text_1)

, all a Master's student in Information and Networking Engineering at KTH. below


## Text Similarity

In [27]:
def get_embedding(text):
     #function to get the embeddings
    
    inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        
        outputs = model.model.encoder(input_ids=inputs)
    # Mean pooling over the sequence length dimension
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.cpu()

def cosine_sim(embedding1, embedding2):
    
    # Cosine similarity
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
    return similarity.item()


#### Cosine similarity

In [47]:
embedding1 = get_embedding(original_text)
embedding2 = get_embedding(noisy_text)

similarity_score = cosine_sim(embedding1, embedding2)
print(similarity_score)



0.8178556561470032
