In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# For viewing and manipulating data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing the necessary libraries

import re
import math
import torch
import spacy
import string
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gensim.downloader as api
import matplotlib.pyplot as plt

# Getting particular functions from these libraries 
from scipy import stats
from torch import Tensor
from sklearn.utils import resample
from gensim.models import KeyedVectors # >> alternative to gensim.downloader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import random_split, DataLoader, TensorDataset, Dataset

# Using the NLTK to tokenize the text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
# Every visible GPU is listed by index and name for reference.
if not torch.cuda.is_available():
    print("CUDA is not available.")
    device = torch.device("cpu")
else:
    print("CUDA is available!")
    gpu_total = torch.cuda.device_count()
    for i in range(gpu_total):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")

    device = torch.device("cuda:0")

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


def find_input_files(root='/kaggle/input'):
    """Walk `root` and locate the reviews CSV and the word2vec vectors file.

    Args:
        root (str): Directory to search recursively.

    Returns:
        tuple[str, str]: ``(reviews_file, w2v_file)``. Each entry is the path of
        the last matching file that was visited, or ``''`` when none matched.
    """
    reviews_path = ''
    vectors_path = ''
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            file_name = os.path.join(dirname, filename)
            if file_name.endswith('.csv'):
                reviews_path = file_name
            # str.endswith accepts a tuple of suffixes. The previous
            # `endswith('.bin') or ('.gz')` was always truthy, so every
            # non-CSV file was mistaken for the vectors file.
            elif file_name.endswith(('.bin', '.gz')):
                vectors_path = file_name
            else:
                print(f'Found unexpected file: {file_name}')
    return reviews_path, vectors_path


reviews_file, w2v_file = find_input_files()

print(f'Amazon reviews file: {reviews_file}')
print(f'Google news word to vec file: {w2v_file}')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
VERBOSE = False
def printv(text):
    """Print `text` behind a 'VERBOSE:' prefix, but only while VERBOSE is truthy."""
    if not VERBOSE:
        return
    print('VERBOSE:', text)

def showV(text):
    '''Always print `text` behind a 'VERBOSE:' prefix (not gated by any flag).'''
    message = ' '.join(('VERBOSE:', str(text)))
    print(message)

DEV = True
def printd(text):
    """Print `text` behind a 'DEV:' prefix, but only while DEV is truthy."""
    if not DEV:
        return
    print('DEV:', text)

def showD(text):
    '''Unconditional DEV output: always print `text` behind a 'DEV:' prefix.

    The prefix used to be 'VERBOSE:', copied from showV, which made DEV
    messages indistinguishable from verbose ones in the cell output.
    '''
    print('DEV:', text)
    return

showCellCompletion = False
def showC(text):
    """Announce that a cell finished ('Cell complete: <text>') when showCellCompletion is truthy."""
    if not showCellCompletion:
        return
    print('Cell complete:', text)

import subprocess
showNv = False
accelerator = False

def printNv():
    """Print the output of `nvidia-smi`, but only when both showNv and accelerator are truthy."""
    if showNv and accelerator:
        completed = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
        report = completed.stdout.decode('utf-8')
        print(report)

showMemoryAllocation = True
def printM():
    """Print the value of torch.cuda.memory_allocated() when showMemoryAllocation is truthy."""
    if showMemoryAllocation:
        allocated = torch.cuda.memory_allocated()
        print(f"Total allocated memory: {allocated} bytes")

In [None]:
pip3 install manim

from manim import *

class AttentionScoresPlot(Scene):
    """Manim scene that animates a bar chart of five sample attention scores."""

    def construct(self):
        # Bar name -> bar height, in display order
        labelled_scores = {
            'Score 1': 0.2,
            'Score 2': 0.4,
            'Score 3': 0.6,
            'Score 4': 0.8,
            'Score 5': 0.5,
        }

        bar_chart = BarChart(
            values=list(labelled_scores.values()),
            bar_names=list(labelled_scores.keys()),
            y_range=[0, 1, 0.2],
            y_length=5,
            x_length=10,
        )
        self.play(Create(bar_chart))
        self.wait()
        
!manim -qm AttentionScoresPlot

from IPython.display import Video
Video('media/videos/scene/480p15/AttentionScoresPlot.mp4', embed=True)

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Read the Amazon reviews CSV (path discovered earlier as `reviews_file`) into a DataFrame.
# A hard-coded path such as '/kaggle/input/amazon-product-reviews/Reviews.csv' was used before 3/13.
data = pd.read_csv(reviews_file)
showD('Amazon reviews loaded into Panda')

# Optional overview: row count, column names and a preview of the first rows
printv(f'Number of reviews: {len(data["Score"])}')
printv(f'Column names -\n {data.columns}\n')
printv(f'First five rows -\n{data.head()}')

# Report, column by column, whether any value is missing
for column in data.columns:
    has_gaps = data[column].isnull().values.any()
    status = "has empty entries" if has_gaps else "has no empty entries"
    print(f"Column '{column}' {status} or missing data.")

# Count how many reviews carry each star rating, ordered by rating value
rating_counts = data['Score'].value_counts().sort_index()

# Bar chart of the raw (not yet balanced) distribution; rot=0 keeps the tick labels horizontal
ax = rating_counts.plot(kind = 'bar', rot = 0)

ax.set_title("Ratings Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Number of Occurrences")

# These counts describe the full dataset. The message previously said
# "balanced data", which only exists after the balancing step further down.
for rating, count in rating_counts.items():
    print(f"{count:,} samples from full data with rating {rating}\n")

plt.show() #<< show the rating in each of the 5 categories

# Number of reviews drawn from each rating category (25000 was used on larger hardware;
# 100 is tailored for CPU capacity). Every category must hold at least this many rows.
balanced_data_size = 100

# Column that holds the rating and defines the classes to balance
sort_column = 'Score'

# Sort the data by the rating values
sorted_data = data.sort_values(by = sort_column)

# Draw `balanced_data_size` random reviews from every rating class.
# GroupBy.sample replaces groupby(...).apply(lambda x: x.sample(...)), which raises a
# DeprecationWarning because apply operates on the grouping column. It returns the
# sampled rows group by group with the 'Score' column intact.
# reset_index(drop=True) then replaces the original row labels with 0..n-1 and
# discards the old labels instead of keeping them as a column.
balanced_data = (
    sorted_data
    .groupby(sort_column)
    .sample(n = balanced_data_size)
    .reset_index(drop = True)
)

printv(f"The number of reviews equally distributed across all ratings is {len(balanced_data['Score'])}")

# Tally the reviews per rating in the balanced sample
rating_counts = balanced_data['Score'].value_counts()

# Draw the tallies as a bar chart and label it
ax = rating_counts.plot(kind='bar')
ax.set(
    title="Ratings Distribution After Balancing",
    xlabel="Rating",
    ylabel="Number of Samples",
)

# Keep the x-axis tick labels horizontal
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)

# In DEV mode, also list the per-rating counts as text
for rating, count in (rating_counts.items() if DEV else ()):
    print(f"{count:,} samples from balanced data with rating {rating}\n")

plt.show()

# Load spaCy's "en_core_web_sm" English pipeline; it is handed to tokenizer() below
# to split each cleaned review into tokens.
nlp = spacy.load("en_core_web_sm")

# Compile the regular expressions once so every review reuses them
html_tags = re.compile("<.*?>")
punct_and_symbols = re.compile(r'[^\w\s\']+')

# custom stop words to remove
custom_stop_words = ['a', 'of']

def tokenizer(text, nlp):
    """
    Clean a review and split it into tokens.

    The text is lower-cased, HTML tags are replaced by a space, characters
    that are not word characters, whitespace or apostrophes are removed, and
    runs of whitespace are collapsed before the text is passed to `nlp`.

    text (str): The text string to tokenize.
    nlp: Callable (e.g. the spaCy language model) that returns an iterable of
         tokens exposing `.text` and `.is_punct`.

    Returns:
        list: Token strings, without punctuation tokens and without the
        entries of `custom_stop_words`.
    """
    text = text.lower()  # Convert text to lowercase

    # Replace tags with a space rather than nothing: "great<br />product"
    # must stay two words instead of fusing into "greatproduct".
    text = html_tags.sub(" ", text)
    text = punct_and_symbols.sub("", text)  # Remove punctuation and symbols

    # Collapse whitespace runs (including those left by removed tags) so the
    # model is not handed stretches of blanks or newlines to tokenize.
    text = " ".join(text.split())

    # Tokenize the text
    doc = nlp(text)

    # Drop punctuation tokens and custom stop words
    tokens = [token.text for token in doc if not token.is_punct
              and token.text not in custom_stop_words]

    return tokens

# Tokenize every review in the 'Text' column of the balanced DataFrame and keep
# the resulting token lists in a new 'CleanedReview' column.
balanced_data['CleanedReview'] = balanced_data['Text'].apply(tokenizer, args=(nlp,))

# Show that cell has finished executing
showC(f'{tokenizer} defined, and then used to create CleanedReview column')

# Labels (ratings) and inputs (token lists) used further down
rating = balanced_data['Score']
tokenized_review = balanced_data['CleanedReview']
showD('specify the columns that will be used to train the classifier')

# Preview the first review: raw text, its tokens, and the token count
first_tokens = tokenized_review[0]
for shown in (balanced_data['Text'][0], first_tokens, len(first_tokens)):
    print(shown, '\n')
print(tokenized_review.shape)

def analyze_review_tokens(reviews):
    """Print mean, median and mode of the per-review token counts and plot their histogram.

    reviews: iterable of token sequences; only the length of each one is used.
    """
    token_counts = list(map(len, reviews))

    # Compute all three statistics before anything is printed
    averages = (
        ("Mean", np.mean(token_counts)),
        ("Median", np.median(token_counts)),
    )
    mode = stats.mode(token_counts)

    for label, value in averages:
        print(f"{label}: {value:.2f}")
    print(f"Mode: {mode}")

    # Histogram of the counts in 20 bins
    plt.figure(figsize=(8, 6))
    plt.hist(token_counts, bins=20, edgecolor='black')
    plt.title('Histogram of Token Counts per Review')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

analyze_review_tokens(tokenized_review)

# >> 3/29/24 Took less than a minute to run without the accelerator.
# Load the Word2Vec vectors from the local file found during input discovery
# (binary word2vec format). The commented line is the gensim-downloader alternative.
#w2v = api.load('word2vec-google-news-300')
w2v = KeyedVectors.load_word2vec_format (w2v_file, binary=True)

# Define the maximum sequence length (adjust as needed)
#>> Will increasing max_sequence_length impact performance?
max_sequence_length = 100

showD(f'{w2v} can map words onto vectors with 300 dimensions')


#>> 3/29/24 Completed in less than 30 seconds using 1347 as max_sequence_length

# tokenized_review holds one list of tokens per review.
# Collect the token count of every review, longest first. The earlier
# `lengths[-1:0:-1]` reversal stopped before index 0 and silently dropped
# one entry (the shortest review); sorting in reverse keeps all of them.
lengths = sorted((len(review_tokens) for review_tokens in tokenized_review), reverse=True)
showD(f'Lengths of 100 longest reviews: {lengths[0:100]}')

max_sequence_length = 100 #<< 3/29/24 1347 was the longest review length in sample

# One (max_sequence_length x 300) matrix per review, initialised to zeros.
# torch.zeros is called without a device, so the tensor is created on the
# default device; it is moved to `device` in a later step.
padded_reviews = torch.zeros((len(tokenized_review), max_sequence_length, 300))

out_words = {}   # word without a vector -> number of occurrences
words_in = 0     # tokens that had a Word2Vec vector
words_out = 0    # tokens that did not

# Padding and embedding conversion loop: reviews longer than
# max_sequence_length are truncated; shorter ones keep trailing zero rows.
for i, review_tokens in enumerate(tokenized_review):
    review_length = min(len(review_tokens), max_sequence_length)
    for j in range(review_length):
        word = review_tokens[j]
        if word in w2v:
            words_in += 1
            # Use Word2Vec vector if available
            padded_reviews[i, j, :] = torch.tensor(w2v[word])
        else:
            # Unknown words keep their all-zero row and are tallied for reporting
            words_out += 1
            out_words[word] = out_words.get(word, 0) + 1

printv(f'{words_in} words found with vector representations, {words_out} without')
printv(f'Number of unique words without vector representations: {len(out_words)}')
printd('Sample of words without vector representations')
kwords = list(out_words.keys())
# Show every 100th unknown word. Slicing replaces range(0, len(kwords)-1, 100),
# which printed nothing when exactly one unknown word existed.
for kword in kwords[::100]:
    printd(f'{kword} - {out_words[kword]}')
# Max pooling over the sequence dimension is kept here for reference but is disabled:
# review_embeddings = torch.max(padded_reviews, dim=1)[0]
# (review_embeddings would hold one aggregated Word2Vec embedding per review.)

showC("Created zero-padded, standard length reviews")

print(padded_reviews[1])

# Each padded review is a 100 x 300 tensor, zero padded where the review is
# shorter than the standard length. Move inputs and rating labels to `device`.
text_embeddings_tensors = padded_reviews.to(device)
rating_labels_tensors = torch.tensor(rating.values).to(device)

# Pair each review tensor with its rating
dataset = TensorDataset(text_embeddings_tensors, rating_labels_tensors)
showC(f'{dataset} defined')

# Shapes first, then devices, of both tensors
for shown_tensor in (text_embeddings_tensors, rating_labels_tensors):
    print(shown_tensor.shape)
for shown_tensor in (text_embeddings_tensors, rating_labels_tensors):
    print(shown_tensor.device)

print(dataset)

# Shuffled mini-batches of 32 for training
data_loader = DataLoader(dataset, shuffle = True, batch_size = 32)

printM() # print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")
printNv() #!nvidia-smi
print('\n\n\n\n\n\n\n\n\n\n')
if showNv and accelerator:
    print(torch.cuda.memory_summary())

# Hyperparameters

In [None]:
# Hyperparameters shared by the attention and encoder cells below
d_model = 300  # Model width; should match the embedding dimension of our word embeddings
seq_len = 100  # Maximum sequence length
input_size = d_model  # Set equal to d_model

num_layers = 1 # 4 # Number of encoder layers (4 is the alternative value tried)
h       = 1 # 10   # Number of attention heads (10 is the alternative value tried)
d_ffn   = 2048 # Dimension of the feed-forward layer

dropout = 0.0 # 0.1  # Dropout probability; adjust if needed
eps     = 1e-05 # Epsilon value to prevent the standard deviation from becoming zero
learning_rate = 0.01

"""
To ensure compatibility, it's important to choose the 
number of attention heads (h) such that d_model is 
evenly divisible by h in the multi-head attention 
module's self.d_k. This allows for a clean distribution 
of the model dimensionality across the attention heads.
"""

# Example with h = 10: d_model / h = 300 / 10 = 30 dimensions per head

# Pytorch's Scaled Dot Product Attention

In [None]:
# FROM TORCH.NN.FUNCTIONAL.SCALED_DOT_PRODUCT_ATTENTION DOCS


# Efficient implementation equivalent to the following:
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
    """Reference (non-fused) scaled dot-product attention.

    Computes ``softmax(query @ key^T * scale + bias) @ value`` over the last
    two dimensions; leading dimensions are treated as batch dimensions.

    Args:
        query: Tensor of shape (..., L, E).
        key: Tensor of shape (..., S, E).
        value: Tensor of shape (..., S, Ev).
        attn_mask: Optional mask broadcastable to (..., L, S). A boolean mask
            keeps positions that are True; any other dtype is added to the
            scores.
        dropout_p: Dropout probability applied to the attention weights. It is
            always applied (``train=True``), so pass 0.0 for inference.
        is_causal: When True, position i may only attend to positions <= i.
            Must not be combined with ``attn_mask``.
        scale: Optional scaling factor; defaults to ``1 / sqrt(E)``.

    Returns:
        torch.Tensor: The attended values, shape (..., L, Ev).
    """
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale

    # Build the additive bias on the same device as the inputs; a CPU-only bias
    # cannot be added to CUDA scores.
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    if is_causal:
        assert attn_mask is None
        temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            # Turn the boolean mask into an additive one (0 where kept, -inf where dropped)
            attn_mask = torch.zeros(
                attn_mask.shape, dtype=query.dtype, device=query.device
            ).masked_fill_(attn_mask.logical_not(), float("-inf"))
        # Out-of-place add so a batched mask can broadcast against the (L, S) bias;
        # the in-place `+=` raised for masks with leading batch dimensions.
        attn_bias = attn_bias + attn_mask

    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight = attn_weight + attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
    return attn_weight @ value

In [None]:
# Three independent 300 -> 300 linear projections, created in the order key, query, value
W_k, W_q, W_v = (nn.Linear(300, 300).to(device) for _ in range(3))

# Project the review embeddings into key, query and value tensors,
# each of shape (batch_size, max_len, emb)
key_tensor, query_tensor, value_tensor = (
    projection(text_embeddings_tensors) for projection in (W_k, W_q, W_v)
)

In [None]:
# scaled_dot_product_attention returns a single tensor (the attended values),
# so it cannot be unpacked into (weights, output).
attn_output = scaled_dot_product_attention(
    query_tensor, key_tensor, value_tensor)

# The attention weights are not exposed by that function; recompute them with the
# same formula (softmax of the scaled query-key scores, no mask, no dropout).
attn_weights = torch.softmax(
    query_tensor @ key_tensor.transpose(-2, -1) / math.sqrt(query_tensor.size(-1)),
    dim=-1)

print(key_tensor[0])
print(query_tensor[0])
print(value_tensor[0])

print(text_embeddings_tensors[0])
print(attn_weights[0])
print(attn_output[0])


print("Attention Weights (shape:", attn_weights.shape, ")")
print("Attention Output (shape:", attn_output.shape, ")")

GPT 4 Turbo
    
You can use hooks to retrieve the attention weights from PyTorch's `F.scaled_dot_product_attention` function before they are multiplied by the value tensor, but it requires a slightly different approach since `F.scaled_dot_product_attention` is a function, not a module. PyTorch hooks can only be directly attached to instances of `nn.Module`. To work around this, you can encapsulate the `F.scaled_dot_product_attention` function within a custom module and then attach a hook to this module.

Here’s how you can do it:

### Step 1: Create a Custom Module
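First, you need to create a custom module that re-implements the computation performed by `F.scaled_dot_product_attention`. The fused function itself returns only the attention output and never exposes the weights, so the module computes the scaled scores and the softmax explicitly and returns the weights alongside the output.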

```python
import torch
import torch.nn.functional as F
import torch.nn as nn

class CustomScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, attn_mask=None):
        # Compute the raw attention scores
        attn_output_weights = torch.matmul(query, key.transpose(-2, -1))
        attn_output_weights = attn_output_weights / torch.sqrt(torch.tensor(key.size(-1), dtype=torch.float32))
        
        if attn_mask is not None:
            attn_output_weights += attn_mask
        
        # Apply softmax to get the probabilities
        attn_output_weights = F.softmax(attn_output_weights, dim=-1)

        # This is where you can use a hook to capture attn_output_weights before they are used to weight the values
        attn_output = torch.matmul(attn_output_weights, value)

        return attn_output, attn_output_weights
```

### Step 2: Define a Hook Function
Next, define a hook function that captures the attention weights.

```python
def attention_weights_hook(module, input, output):
    # output[1] contains the attention weights
    print("Attention Weights:", output[1].detach())
```

### Step 3: Instantiate the Custom Module and Register the Hook
Create an instance of your custom module and register the hook.

```python
# Create an instance of the custom attention module
attention_module = CustomScaledDotProductAttention()

# Register the hook
hook = attention_module.register_forward_hook(attention_weights_hook)

# Example tensors for query, key, and value
query = torch.rand(10, 32, 64)  # (sequence_length, batch_size, d_model)
key = torch.rand(10, 32, 64)
value = torch.rand(10, 32, 64)

# Optionally, define an attention mask
attn_mask = None  # Example: torch.zeros(10, 10)

# Forward pass through the custom module
output = attention_module(query, key, value, attn_mask)
```

### Step 4: Use and Remove the Hook
After using the hook to capture the attention weights during the forward pass, remember to remove it to avoid memory leaks.

```python
# Cleanup: remove the hook to prevent memory leaks
hook.remove()
```

### Conclusion
By re-implementing the computation of `F.scaled_dot_product_attention` inside a custom module, you gain the flexibility of using hooks to capture intermediate data such as the attention weights. This method leverages the modular nature of PyTorch and allows for detailed inspection and manipulation of the data flowing through neural networks. This can be particularly useful for research, debugging, and enhancing model interpretability.

In [None]:
class CustomScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention that returns the attention weights with the output."""

    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, attn_mask=None):
        """Return ``(attn_output, attn_output_weights)``.

        Scores are ``query @ key^T`` divided by the square root of the size of
        key's last dimension; ``attn_mask`` (if given) is added to the scores
        before the softmax over the last dimension.
        """
        # Scaling term: sqrt of key's last-dimension size, as a float32 tensor
        scale = torch.sqrt(torch.tensor(key.size(-1), dtype=torch.float32))
        attn_output_weights = torch.matmul(query, key.transpose(-2, -1)) / scale

        if attn_mask is not None:
            attn_output_weights += attn_mask

        # Normalise the scores into probabilities
        attn_output_weights = F.softmax(attn_output_weights, dim=-1)

        # A forward hook on this module sees the weights as the second item of the returned tuple
        return torch.matmul(attn_output_weights, value), attn_output_weights

def attention_weights_hook(module, input, output):
    """Forward hook: print the second element of the module's output, detached from autograd."""
    # The hooked module returns (attn_output, attn_output_weights)
    weights = output[1]
    print("Attention Weights:", weights.detach())

# Create an instance of the custom attention module
attention_module = CustomScaledDotProductAttention()

# Register the hook; it prints the attention weights on every forward pass
hook = attention_module.register_forward_hook(attention_weights_hook)

# Example tensors for query, key, and value.
# The module attends over the last two dimensions, so with this shape the
# leading 10 acts as the batch dimension, 32 as the sequence length and 64 as
# the feature size (the weights come out as 10 x 32 x 32).
query = torch.rand(10, 32, 64)
key = torch.rand(10, 32, 64)
value = torch.rand(10, 32, 64)

# Optionally, define an attention mask that is added to the scores
attn_mask = None  # Example: torch.zeros(32, 32)

# Forward pass through the custom module; returns (attn_output, attn_output_weights)
output = attention_module(query, key, value, attn_mask)

# Cleanup: remove the hook so later forward passes no longer print
hook.remove()

GPT 4

You cannot use hooks to directly retrieve the attention weights from `torch.nn.functional.scaled_dot_product_attention` because it is a functional call and not a module. Hooks can only be registered on `nn.Module` instances, where you have a `forward` method that defines the computation.

However, if you wish to capture the attention weights from this function, you would typically need to write a custom wrapper module around the `scaled_dot_product_attention` call and then register a hook on that. Here's how you can do it:

### Step 1: Create a Custom Module
First, create a custom module that wraps the `scaled_dot_product_attention` function. This allows you to register hooks on this module.

```python
import torch
import torch.nn.functional as F
import torch.nn as nn

class ScaledDotProductAttention(nn.Module):
    def forward(self, query, key, value, mask=None, dropout=None):
        # Call the F.scaled_dot_product_attention function
        attn_output, attn_output_weights = F.scaled_dot_product_attention(query, key, value, mask, dropout)

        # Instead of returning the output, save the attn_output_weights as a member variable
        self.attn_output_weights = attn_output_weights

        # Return the attention output (which is the result of attention weights on value tensor)
        return attn_output

# Now you can instantiate this module and register a hook on it
attention_module = ScaledDotProductAttention()
```

### Step 2: Register a Hook
After creating the custom module, you can register a hook that will allow you to access the attention weights.

```python
def get_attention_hook(attention_scores_list):
    def hook(module, input, output):
        # Save the attention weights, which are stored as a member variable in the module
        attention_scores_list.append(module.attn_output_weights.detach())
    return hook

# List to store the attention weights
attention_scores_list = []

# Register the hook
hook = attention_module.register_forward_hook(get_attention_hook(attention_scores_list))
```

### Step 3: Use the Custom Module in Your Forward Pass
Now you can use the custom module in your forward pass, and the hook will capture the attention weights.

```python
# Example tensors for query, key, and value
query = torch.rand(10, 32, 512)
key = torch.rand(10, 32, 512)
value = torch.rand(10, 32, 512)

# Perform a forward pass
output = attention_module(query, key, value)

# Check the captured attention weights
print(attention_scores_list)

# Clean up: remove the hook
hook.remove()
```

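In this example, after the forward pass, the `attention_scores_list` is meant to contain the attention weights that were used before being multiplied with the `value` tensor. The hook captures these weights via the member variable `attn_output_weights` that we added to the `ScaledDotProductAttention` module. Note, however, that `torch.nn.functional.scaled_dot_product_attention` returns only the attention output tensor (not an `(output, weights)` tuple) and expects `dropout_p` to be a float, so the wrapper as written above fails when called. To store the weights, the module has to compute `softmax(Q K^T / sqrt(d_k))` itself before multiplying by `value`.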

Keep in mind that this approach requires you to replace direct calls to `torch.nn.functional.scaled_dot_product_attention` with an instance of your custom module. This allows you to leverage hooks and capture the raw attention weights for further analysis or visualization.

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention that keeps its attention weights for inspection.

    `F.scaled_dot_product_attention` returns only the attended values, so it
    cannot be unpacked into (output, weights). This module therefore computes
    the weights explicitly and stores them on `self.attn_output_weights`,
    where a forward hook can read them.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        """Attend over the last two dimensions of the inputs.

        Args:
            query: Tensor of shape (..., L, E).
            key: Tensor of shape (..., S, E).
            value: Tensor of shape (..., S, Ev).
            mask: Optional mask broadcastable to (..., L, S). A boolean mask
                keeps positions that are True; any other dtype is added to
                the scores.
            dropout: Optional dropout probability for the attention weights
                (None or 0 disables it). It is only active while the module
                is in training mode.

        Returns:
            torch.Tensor: The attention output, shape (..., L, Ev).
        """
        scale = query.size(-1) ** -0.5
        scores = torch.matmul(query, key.transpose(-2, -1)) * scale

        if mask is not None:
            if mask.dtype == torch.bool:
                scores = scores.masked_fill(mask.logical_not(), float("-inf"))
            else:
                scores = scores + mask

        attn_output_weights = torch.softmax(scores, dim=-1)
        if dropout:
            attn_output_weights = F.dropout(attn_output_weights, p=dropout, training=self.training)

        # Save the weights as a member variable so a forward hook can collect them
        self.attn_output_weights = attn_output_weights

        # Return the attention output (the weights applied to the value tensor)
        return torch.matmul(attn_output_weights, value)

def get_attention_hook(attention_scores_list):
    """Build a forward hook that appends the hooked module's stored weights to `attention_scores_list`."""
    def hook(module, input, output):
        # The module keeps its latest weights in `attn_output_weights`;
        # store a copy that is detached from the autograd graph.
        detached_weights = module.attn_output_weights.detach()
        attention_scores_list.append(detached_weights)
    return hook

# Instantiate the module defined in this cell. Without this line `attention_module`
# would still be the instance left over from an earlier cell, which does not
# store `attn_output_weights`, so the hook below would fail.
attention_module = ScaledDotProductAttention()

# List to store the attention weights
attention_scores_list = []

# Register the hook
hook = attention_module.register_forward_hook(get_attention_hook(attention_scores_list))

### Step 3: Use the Custom Module in Your Forward Pass

# Example tensors for query, key, and value
query = torch.rand(10, 32, 512)
key = torch.rand(10, 32, 512)
value = torch.rand(10, 32, 512)

try:
    # Perform a forward pass
    output = attention_module(query, key, value)

    # Check the captured attention weights
    print(attention_scores_list)
finally:
    # Clean up: remove the hook even if the forward pass raised
    hook.remove()

# Our Scaled Dot Product Attention

Mute our tensor generators and use tensors from above implementation

In [None]:
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        h (int): Number of attention heads.
        d_model (int): Model width; must be evenly divisible by `h`.
        dropout (float): Dropout probability applied to the attention
            probabilities. Defaults to the module-level `dropout` value at
            class-definition time.

    Raises:
        ValueError: If `h` is not positive or `d_model` is not divisible by `h`.
    """

    def __init__(self, h, d_model, dropout = dropout):
        super().__init__()

        # Each head gets d_model // h dimensions; reject an uneven split up front
        # instead of failing later inside the reshape in forward().
        if h <= 0 or d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be evenly divisible by the number of heads ({h})"
            )

        # Number of attention heads
        self.h = h

        # Dimensionality of each attention head
        self.d_k = d_model // h

        # Linear layers for query, key, value, and output projections (in that order)
        self.linears = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)])

        # Dropout layer
        self.dropout = nn.Dropout(p = dropout)

    @staticmethod
    def scaled_dot_product_attention(query, key, value, mask = None, dropout = None):
        """Return ``(attended_values, attention_probabilities)``.

        Scores are ``query @ key^T / sqrt(d_k)`` with d_k the size of query's
        last dimension. Positions where ``mask == 0`` are set to -inf before
        the softmax. `dropout`, when given, is a callable applied to the
        probabilities.
        """
        # Size of the last dimension of the query
        d_k = query.size(-1)

        # Compute the attention scores using the dot product between query and key
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        # Apply the mask to the attention scores (if provided)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Apply softmax to obtain the attention probabilities
        p_attn = torch.softmax(scores, dim = -1)

        # Apply dropout to the attention probabilities (if specified)
        if dropout is not None:
            p_attn = dropout(p_attn)

        # Weighted sum of the values, plus the probabilities themselves
        return torch.matmul(p_attn, value), p_attn

    def forward(self, query, key, value, mask = None):
        """Run multi-head attention.

        Returns:
            tuple: ``(attn_output, attn_scores, query, key, value)`` in this
            order. `attn_output` is the result after the output projection,
            `attn_scores` are the attention probabilities, and the last three
            are the projected per-head tensors reshaped to
            (nbatches, h, -1, d_k).
        """
        # Get the number of batches
        nbatches = query.size(0)

        # Project query, key and value with the first three linear layers, then
        # split the last dimension into (h, d_k) and move the head axis forward
        query, key, value = [
            projection(tensor).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for projection, tensor in zip(self.linears, (query, key, value))
        ]

        # Perform scaled dot-product attention on the projected query, key, and value
        attn_output, attn_scores = self.scaled_dot_product_attention(query, key, value, mask = mask, dropout = self.dropout)

        # Merge the heads back: (nbatches, h, -1, d_k) -> (nbatches, -1, h * d_k)
        attn_output = attn_output.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)

        # Apply the final linear projection to the concatenated output
        attn_output = self.linears[-1](attn_output)

        return attn_output, attn_scores, query, key, value

In [None]:
our_mha = MultiHeadedAttention(h, d_model, dropout).to(device)

# Self-attention: the same embedding tensor is passed as query, key and value.
# The unpacking order must match what forward() returns:
# (attn_output, attn_scores, query, key, value). The previous order bound the
# attention scores to `query` and the projected value to `our_attn_scores`.
our_attn_output, our_attn_scores, query, key, value = our_mha(
    text_embeddings_tensors, text_embeddings_tensors,
    text_embeddings_tensors)

print(our_attn_output.shape)
print(our_attn_output[0])

print(our_attn_scores.shape)
print(our_attn_scores[0])

print(query[0])
print(key[0])
print(value[0])

# Comparison

In [None]:
def cosine_similarity(tensor1, tensor2, eps=1e-8):
    """Cosine similarity between `tensor1` and `tensor2` along the last dimension.

    Args:
        tensor1, tensor2: Tensors with matching (or broadcastable) shapes.
        eps (float): Lower bound applied to each norm before dividing, so an
            all-zero vector yields a similarity of 0 instead of NaN.

    Returns:
        torch.Tensor: Similarities with the last dimension reduced away.
    """
    norm1 = tensor1.norm(dim=-1, keepdim=True).clamp_min(eps)
    norm2 = tensor2.norm(dim=-1, keepdim=True).clamp_min(eps)
    return ((tensor1 / norm1) * (tensor2 / norm2)).sum(dim=-1)

# Compare the two attention outputs three ways: per-position cosine similarity,
# mean squared error, and the correlation of the flattened tensors.
similarity = cosine_similarity(attn_output, our_attn_output)

squared_error = (attn_output - our_attn_output).pow(2)
mse = squared_error.mean()

flattened_pair = torch.stack([attn_output.view(-1), our_attn_output.view(-1)])
correlation = torch.corrcoef(flattened_pair)

for label, metric in (
    ("Cosine Similarity:", similarity),
    ("Mean Squared Error:", mse),
    ("Correlation Coefficient:", correlation[0, 1]),
):
    print(label, metric)

*Comparing the attention outputs of our scaled dot-product function with those of PyTorch's function is much like comparing the attention outputs of different heads in a multi-head attention system. The differences arise from factors such as random initialization, implementation variations, and numerical precision, but the fundamental principles and functionality of the attention mechanism remain the same.*

*Random Initialization: The weights of the linear layers used for query, key, and value projections are randomly initialized in both functions. This random initialization leads to different starting points and can result in different attention patterns.*

*Implementation Differences: Although both functions aim to implement the scaled dot product attention mechanism, there might be slight differences in their implementations, such as the order of operations, the use of specific PyTorch functions, or the handling of edge cases. These differences can contribute to variations in the attention outputs.*

*Numerical Precision: The attention outputs can be sensitive to numerical precision, especially when dealing with large input sequences or high-dimensional representations. Differences in numerical precision between our function and PyTorch's function can lead to slight variations in the attention outputs.*

# Pytorch's Transformer Encoder


Look into d_ffn and set it to emb_size (300)
Pytorch Hooks

In [None]:
# The batches are laid out as (batch, seq, emb), so the layer must be built with
# batch_first=True; the default layout is (seq, batch, emb), which would make the
# layer attend across the reviews of a batch instead of across the words of a review.
pytorch_encoder_layer = nn.TransformerEncoderLayer(
    d_model, h, d_ffn, dropout, batch_first=True).to(device)

pytorch_encoded_batches = []      # List to store encoded batches
pytorch_y_batches = []            # List to store corresponding y batches
pytorch_attn_weight_batches = []  # List to store the self-attention weights per batch

with torch.no_grad():
    for i, (x_batch, y_batch) in enumerate(data_loader, start=1):
        print(f'Size of batch: {x_batch.shape}')
        print(i)
        print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")

        # Move the batch to the device
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        output_features = pytorch_encoder_layer(x_batch)

        # The encoder layer does not return its attention weights. Query its
        # self-attention submodule directly with the same input to obtain them.
        _, attn_weights_batch = pytorch_encoder_layer.self_attn(
            x_batch, x_batch, x_batch, need_weights=True)

        # Append the encoded batch to the list
        pytorch_encoded_batches.append(output_features)
        pytorch_y_batches.append(y_batch)
        pytorch_attn_weight_batches.append(attn_weights_batch)

        print(f'Current Size of Reviews: {len(pytorch_encoded_batches)} tensors')
        print(f'Current Size of Ratings: {len(pytorch_y_batches)} tensors')

# Concatenate all the encoded batches into a single tensor
pytorch_encoded_batch = torch.cat(pytorch_encoded_batches, dim = 0)

# Concatenate all the corresponding y batches into a single tensor
pytorch_y_batch = torch.cat(pytorch_y_batches, dim = 0)

# Concatenate the collected attention weights. The previous line referenced an
# undefined `total_encoded_batch` and an `attention_weights` attribute that
# tensors do not have.
pytorch_attention_weights = torch.cat(pytorch_attn_weight_batches, dim = 0)
print(pytorch_attention_weights.shape)
print(pytorch_attention_weights)

GPT 4 Turbo

You cannot directly return the weights from `nn.TransformerEncoderLayer` without modifying the source code. The `nn.TransformerEncoderLayer` in PyTorch does not provide an out-of-the-box option to return the attention weights along with the output. The attention scores are computed internally within the multi-head attention mechanism, and only the output of the attention layer is returned to the user.

However, there are a few alternative approaches you can take to inspect the attention weights:

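1. **Use Hooks**: You can register a forward hook on the multi-head attention submodule within the `nn.TransformerEncoderLayer`. This lets you access that submodule's inputs and outputs without modifying the source code. Be aware that recent PyTorch releases call `self_attn` with `need_weights=False` inside the encoder layer, so the second element of the hooked output is `None` unless the attention submodule is invoked with `need_weights=True`. Here's how you could implement this: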

```python
import torch

def get_attention_hook(attention_scores_list):
    def hook(module, input, output):
        # output is a tuple in the form (output, attention_weights)
        attention_scores_list.append(output[1].detach())
    return hook

# Example Usage:
model = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)

# Initialize a list to hold the attention scores
attention_scores_list = []

# Register the hook to the multi-head attention submodule
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Forward pass with input
input_tensor = torch.rand(10, 32, 512)  # (sequence_length, batch_size, d_model)
output = model(input_tensor)

# Now attention_scores_list contains the attention weights
print(attention_scores_list)

# Don't forget to remove the hook when you're done to prevent memory leaks
hook.remove()
```

2. **Modify the Source Code**: If you're okay with modifying the source code, you can subclass `nn.TransformerEncoderLayer` and override the `forward` method to include the output of the attention scores.

3. **Use a Custom Transformer Implementation**: There are custom implementations of the Transformer model that allow you to return the attention weights directly. For example, the Hugging Face Transformers library provides access to attention weights in models like BERT or GPT-2 when you pass `output_attentions=True` to the model.

If you simply want to inspect the weights and not necessarily return them within the same function, hooks are a great solution. However, if you require the weights to be returned for further processing, you may need to opt for the second or third approach.

I understand. I already have a custom transformer implementation and was looking to compare the weights with those of nn.TransformerEncoderLayer. Can you expand more on the hooks approach?

Certainly! Using hooks in PyTorch is a powerful way to access intermediate data in your models, such as weights and outputs from specific layers during the forward pass, without altering the original source code of those layers. When you want to compare the attention weights of your custom transformer implementation with those of the `nn.TransformerEncoderLayer`, using hooks can be particularly useful.

### Detailed Explanation of Hooks

Hooks in PyTorch can be of two types: **forward hooks** and **backward hooks**. Forward hooks are used to grab information during the forward pass, which is what you'll need for accessing attention weights.

Here’s an expanded explanation on using forward hooks with `nn.TransformerEncoderLayer`:

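1. **Defining the Hook Function**: The hook function you define will be called every time the specified layer has completed its forward pass. For the attention mechanism in `TransformerEncoderLayer`, the hook function needs to capture the second element of the output tuple, which contains the attention weights. Note that recent PyTorch releases call `self_attn` with `need_weights=False` inside `nn.TransformerEncoderLayer`, so this second element is `None` unless the call is intercepted to request the weights (for example, by wrapping `self_attn.forward` so that it always passes `need_weights=True`).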

2. **Registering the Hook**: You attach the hook to the specific submodule (in this case, the multi-head attention submodule of the encoder layer, `self_attn`). The hook will then store or process the attention weights during each forward pass.

3. **Storing the Outputs**: In the example provided, the hook function appends the attention weights to a list. This list can be used later for comparison or analysis.

4. **Managing the Hook**: It’s important to remove the hook after you're done with it to prevent memory leaks, especially if you're evaluating or training for multiple iterations.

### Expanded Example with Explanations

Here’s an expanded version of the earlier example with more detailed comments:

```python
import torch
import torch.nn as nn

def get_attention_hook(attention_scores_list):
    """Hook function to capture attention weights."""
    def hook(module, input, output):
        # Attention weights are stored in output[1], as the output is a tuple (output, attention_weights)
        attention_scores_list.append(output[1].detach())  # Use detach() to avoid saving gradients
    return hook

# Initialize Transformer Encoder Layer
model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

# List to store attention weights
attention_scores_list = []

# Register hook to the multi-head attention submodule
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Create a random tensor to simulate input data (sequence_length, batch_size, d_model)
input_tensor = torch.rand(10, 32, 512)

# Perform a forward pass (this will trigger the hook and capture the attention weights)
output = model(input_tensor)

# Output the captured attention weights
print(attention_scores_list)

# Cleanup: remove the hook to prevent memory leaks
hook.remove()

```

### Using the Captured Data

Once you have the attention weights stored in `attention_scores_list`, you can process them as needed. For instance, you can:

- **Visualize the Attention Patterns**: Use plotting libraries like Matplotlib to visualize how the attention is distributed across input tokens.
- **Compare Weights**: Perform statistical comparisons or compute similarity measures between the attention weights from your custom transformer and the `nn.TransformerEncoderLayer`.
- **Debug and Analyze Model Behavior**: Understanding attention patterns can help debug model behavior or improve model interpretability, especially on specific input examples.

This approach provides a non-intrusive method to extract and analyze the internals of PyTorch models, which can be invaluable for advanced machine learning research and development.

Can I also use hooks to retrieve the attention weights from Pytorch's F.scaled_dot_product_attention before they are multiplied to the value tensor?

In [None]:
def get_attention_hook(attention_scores_list):
    """
    Build a forward hook that records the attention weights of an attention module.

    Args:
        attention_scores_list (list): List that receives one detached weight tensor
                                      per forward pass of the hooked module.

    Returns:
        Callable: A hook with the (module, input, output) signature expected by
                  `register_forward_hook`.

    Raises:
        RuntimeError: Raised by the hook when the module returned no weights, i.e.
                      it was called with need_weights=False.
    """
    def hook(module, input, output):
        # output is a tuple in the form (output, attention_weights)
        attention_weights = output[1]
        if attention_weights is None:
            raise RuntimeError(
                "The attention module returned no weights (need_weights=False); "
                "call request_attention_weights() on it before the forward pass."
            )
        # Use detach() to avoid keeping the autograd graph alive
        attention_scores_list.append(attention_weights.detach())
    return hook


def request_attention_weights(attention_module):
    """
    Make an attention module compute its attention weights on every call.

    Recent PyTorch releases call `self_attn` with need_weights=False inside
    nn.TransformerEncoderLayer, so a plain forward hook only ever sees `None`.
    This wraps the module's forward so that need_weights is always True.

    Args:
        attention_module (nn.Module): Module whose forward accepts `need_weights`.

    Returns:
        Callable: Zero-argument function that restores the original forward.
    """
    original_forward = attention_module.forward

    def forward_with_weights(*args, **kwargs):
        kwargs["need_weights"] = True
        return original_forward(*args, **kwargs)

    attention_module.forward = forward_with_weights

    def restore():
        attention_module.forward = original_forward

    return restore


# Example usage
model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

# Disable dropout so each row of the captured weights is a proper distribution
model.eval()

# List to store attention weights
attention_scores_list = []

# Ask the self-attention submodule for its weights, then hook it to capture them
restore_forward = request_attention_weights(model.self_attn)
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Create a random tensor to simulate input data (sequence_length, batch_size, d_model)
input_tensor = torch.rand(10, 32, 512)

try:
    # Perform a forward pass (this triggers the hook and captures the attention weights)
    output = model(input_tensor)
finally:
    # Cleanup: remove the hook and the forward wrapper even if the pass fails
    hook.remove()
    restore_forward()

# Summarize the captured attention weights instead of dumping the full tensors
print([tuple(weights.shape) for weights in attention_scores_list])

GPT 4

You cannot directly return the weights from `nn.TransformerEncoderLayer` without modifying the source code. The `nn.TransformerEncoderLayer` in PyTorch does not provide an out-of-the-box option to return the attention weights along with the output. The attention scores are computed internally within the multi-head attention mechanism, and only the output of the attention layer is returned to the user.

However, there are a few alternative approaches you can take to inspect the attention weights:

1. **Use Hooks**: You can register a forward hook on the multi-head attention submodule within the `nn.TransformerEncoderLayer`. This will let you access the intermediate values, including the attention weights, without modifying the source code. Here's how you could implement this:

```python
import torch

def get_attention_hook(attention_scores_list):
    def hook(module, input, output):
        # output is a tuple in the form (output, attention_weights)
        attention_scores_list.append(output[1].detach())
    return hook

# Example Usage:
model = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)

# Initialize a list to hold the attention scores
attention_scores_list = []

# Register the hook to the multi-head attention submodule
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Forward pass with input
input_tensor = torch.rand(10, 32, 512)  # (sequence_length, batch_size, d_model)
output = model(input_tensor)

# Now attention_scores_list contains the attention weights
print(attention_scores_list)

# Don't forget to remove the hook when you're done to prevent memory leaks
hook.remove()
```

2. **Modify the Source Code**: If you're okay with modifying the source code, you can subclass `nn.TransformerEncoderLayer` and override the `forward` method to include the output of the attention scores.

3. **Use a Custom Transformer Implementation**: There are custom implementations of the Transformer model that allow you to return the attention weights directly. For example, the Hugging Face Transformers library provides access to attention weights in models like BERT or GPT-2 when you pass `output_attentions=True` to the model.

If you simply want to inspect the weights and not necessarily return them within the same function, hooks are a great solution. However, if you require the weights to be returned for further processing, you may need to opt for the second or third approach.

I understand. I already have a custom transformer implementation and was looking to compare the weights with those of nn.TransformerEncoderLayer. Can you expand more on the hooks approach?

Certainly! Using hooks in PyTorch is a powerful way to access intermediate data in your models, such as weights and outputs from specific layers during the forward pass, without altering the original source code of those layers. When you want to compare the attention weights of your custom transformer implementation with those of the `nn.TransformerEncoderLayer`, using hooks can be particularly useful.

### Detailed Explanation of Hooks

Hooks in PyTorch can be of two types: **forward hooks** and **backward hooks**. Forward hooks are used to grab information during the forward pass, which is what you'll need for accessing attention weights.

Here’s an expanded explanation on using forward hooks with `nn.TransformerEncoderLayer`:

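1. **Defining the Hook Function**: The hook function you define will be called every time the specified layer has completed its forward pass. For the attention mechanism in `TransformerEncoderLayer`, the hook function needs to capture the second element of the output tuple, which contains the attention weights. Note that recent PyTorch releases call `self_attn` with `need_weights=False` inside `nn.TransformerEncoderLayer`, so this second element is `None` unless the call is intercepted to request the weights (for example, by wrapping `self_attn.forward` so that it always passes `need_weights=True`).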

2. **Registering the Hook**: You attach the hook to the specific submodule (in this case, the multi-head attention submodule of the encoder layer, `self_attn`). The hook will then store or process the attention weights during each forward pass.

3. **Storing the Outputs**: In the example provided, the hook function appends the attention weights to a list. This list can be used later for comparison or analysis.

4. **Managing the Hook**: It’s important to remove the hook after you're done with it to prevent memory leaks, especially if you're evaluating or training for multiple iterations.

### Expanded Example with Explanations

Here’s an expanded version of the earlier example with more detailed comments:

```python
import torch
import torch.nn as nn

def get_attention_hook(attention_scores_list):
    """Hook function to capture attention weights."""
    def hook(module, input, output):
        # Attention weights are stored in output[1], as the output is a tuple (output, attention_weights)
        attention_scores_list.append(output[1].detach())  # Use detach() to avoid saving gradients
    return hook

# Initialize Transformer Encoder Layer
model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

# List to store attention weights
attention_scores_list = []

# Register hook to the multi-head attention submodule
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Create a random tensor to simulate input data (sequence_length, batch_size, d_model)
input_tensor = torch.rand(10, 32, 512)

# Perform a forward pass (this will trigger the hook and capture the attention weights)
output = model(input_tensor)

# Output the captured attention weights
print(attention_scores_list)

# Cleanup: remove the hook to prevent memory leaks
hook.remove()

```

### Using the Captured Data

Once you have the attention weights stored in `attention_scores_list`, you can process them as needed. For instance, you can:

- **Visualize the Attention Patterns**: Use plotting libraries like Matplotlib to visualize how the attention is distributed across input tokens.
- **Compare Weights**: Perform statistical comparisons or compute similarity measures between the attention weights from your custom transformer and the `nn.TransformerEncoderLayer`.
- **Debug and Analyze Model Behavior**: Understanding attention patterns can help debug model behavior or improve model interpretability, especially on specific input examples.

This approach provides a non-intrusive method to extract and analyze the internals of PyTorch models, which can be invaluable for advanced machine learning research and development.

Can I also use hooks to retrieve the attention weights from Pytorch's F.scaled_dot_product_attention before they are multiplied to the value tensor?

In [None]:
def get_attention_hook(attention_scores_list):
    """
    Build a forward hook that records the attention weights of an attention module.

    Args:
        attention_scores_list (list): List that receives one detached weight tensor
                                      per forward pass of the hooked module.

    Returns:
        Callable: A hook with the (module, input, output) signature expected by
                  `register_forward_hook`.

    Raises:
        RuntimeError: Raised by the hook when the module returned no weights, i.e.
                      it was called with need_weights=False.
    """
    def hook(module, input, output):
        # output is a tuple in the form (output, attention_weights)
        attention_weights = output[1]
        if attention_weights is None:
            raise RuntimeError(
                "The attention module returned no weights (need_weights=False); "
                "call request_attention_weights() on it before the forward pass."
            )
        # Use detach() to avoid keeping the autograd graph alive
        attention_scores_list.append(attention_weights.detach())
    return hook


def request_attention_weights(attention_module):
    """
    Make an attention module compute its attention weights on every call.

    Recent PyTorch releases call `self_attn` with need_weights=False inside
    nn.TransformerEncoderLayer, so a plain forward hook only ever sees `None`.
    This wraps the module's forward so that need_weights is always True.

    Args:
        attention_module (nn.Module): Module whose forward accepts `need_weights`.

    Returns:
        Callable: Zero-argument function that restores the original forward.
    """
    original_forward = attention_module.forward

    def forward_with_weights(*args, **kwargs):
        kwargs["need_weights"] = True
        return original_forward(*args, **kwargs)

    attention_module.forward = forward_with_weights

    def restore():
        attention_module.forward = original_forward

    return restore


# Example usage
model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

# Disable dropout so each row of the captured weights is a proper distribution
model.eval()

# List to store attention weights
attention_scores_list = []

# Ask the self-attention submodule for its weights, then hook it to capture them
restore_forward = request_attention_weights(model.self_attn)
hook = model.self_attn.register_forward_hook(get_attention_hook(attention_scores_list))

# Create a random tensor to simulate input data (sequence_length, batch_size, d_model)
input_tensor = torch.rand(10, 32, 512)

try:
    # Perform a forward pass (this triggers the hook and captures the attention weights)
    output = model(input_tensor)
finally:
    # Cleanup: remove the hook and the forward wrapper even if the pass fails
    hook.remove()
    restore_forward()

# Summarize the captured attention weights instead of dumping the full tensors
print([tuple(weights.shape) for weights in attention_scores_list])

# Our Transformer Encoder

In [None]:
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal positional encodings to a sequence-first input, then applies dropout.
    """

    def __init__(self, d_model: int, dropout: float, seq_len: int):
        """
        Initialize the PositionalEncoding module.

        Args:
            d_model (int): The dimensionality of the model (embedding size).
                           Both even and odd values are supported.
            dropout (float): The dropout rate to be applied.
            seq_len (int): The maximum sequence length.
        """
        super().__init__()
        self.dropout = nn.Dropout(p = dropout)

        # Create a tensor of shape (seq_len, 1) representing the positions
        position = torch.arange(seq_len).unsqueeze(1)

        # Compute the frequency term of the sinusoidal positional encoding:
        # 1 / 10000^(2i/d_model), where 2i is the (even) embedding dimension index.
        # It holds ceil(d_model / 2) entries, one per sine column.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Create a tensor of shape (seq_len, 1, d_model) to store the positional encodings
        pe = torch.zeros(seq_len, 1, d_model)

        # Even dimensions (0, 2, 4, ...) receive the sine of position * frequency
        pe[:, 0, 0::2] = torch.sin(position * div_term)

        # Odd dimensions (1, 3, 5, ...) receive the cosine of position * frequency.
        # There are only floor(d_model / 2) odd columns, so drop the surplus frequency
        # when d_model is odd; for even d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Register the positional encoding tensor as a buffer in the module
        # This ensures that the positional encoding tensor is saved and loaded with the module
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Perform the forward pass of the PositionalEncoding module.

        Args:
            x (Tensor): The input tensor of shape (seq_len, batch_size, embedding_dim).
                        Its first dimension must not exceed the `seq_len` given at
                        construction, since only that many encodings exist.

        Returns:
            Tensor: The input tensor with positional encodings added, after dropout.
        """
        # self.pe[:x.size(0)] selects one encoding per position along dim 0 of x;
        # its singleton middle dimension broadcasts over dim 1 of x
        x = x + self.pe[:x.size(0)]

        # Apply dropout to the tensor with positional encodings
        return self.dropout(x)

In [None]:
class MultiHeadedAttention(nn.Module):
    """
    Multi-head scaled dot-product attention over batch-first inputs.

    The module holds four d_model x d_model linear layers: the first three project
    the query, key and value, the last one projects the concatenated heads.
    """

    def __init__(self, h, d_model, dropout=dropout):
        """
        Initialize the attention module.

        Args:
            h (int): Number of attention heads.
            d_model (int): Model (embedding) dimension; must be divisible by `h`.
            dropout (float): Dropout probability applied to the attention probabilities.
                             Defaults to the notebook-level `dropout` value.

        Raises:
            ValueError: If `d_model` is not divisible by `h`.
        """
        super().__init__()

        # Fail early with a clear message instead of a reshape error inside forward()
        if d_model % h != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by the number of heads ({h})")

        # Number of attention heads
        self.h = h

        # Dimensionality of each attention head
        self.d_k = d_model // h

        # Linear layers for query, key, value, and output projections
        self.linears = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)])

        # Dropout layer
        self.dropout = nn.Dropout(p = dropout)

    @staticmethod
    def scaled_dot_product_attention(query, key, value, mask = None, dropout = None):
        """
        Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value (torch.Tensor): Tensors whose last dimension is d_k.
            mask (torch.Tensor, optional): Positions where `mask == 0` are excluded
                                           from attention.
            dropout (callable, optional): Applied to the attention probabilities.

        Returns:
            tuple: (attended values, attention probabilities after optional dropout).
        """
        # Compute the dimensionality of each attention head
        d_k = query.size(-1)

        # Compute the attention scores using the dot product between query and key
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        # Apply the mask to the attention scores (if provided)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Apply softmax to obtain the attention probabilities
        p_attn = torch.softmax(scores, dim = -1)

        # Apply dropout to the attention probabilities (if specified)
        if dropout is not None:
            p_attn = dropout(p_attn)

        # Compute the weighted sum of values using the attention probabilities
        # Return the attended values and attention probabilities
        return torch.matmul(p_attn, value), p_attn

    def forward(self, query, key, value, mask = None):
        """
        Run multi-head attention.

        Args:
            query, key, value (torch.Tensor): Inputs whose first dimension is the batch
                                              and whose last dimension is d_model.
            mask (torch.Tensor, optional): Passed to the attention scores; positions
                                           where `mask == 0` are ignored.

        Returns:
            tuple: (output, attn) where `output` has shape (batch, seq, h * d_k) and
                   `attn` holds the per-head attention probabilities of shape
                   (batch, h, query_len, key_len).
        """
        # Get the number of batches
        nbatches = query.size(0)

        # Project and reshape the query, key, and value for multi-head attention.
        # zip() stops after three pairs, so only the first three linear layers are used here.
        query, key, value = [
            l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linears, (query, key, value))
        ]

        # scaled_dot_product_attention returns a (values, probabilities) tuple;
        # unpack it so the reshape below operates on the values tensor
        x, attn = self.scaled_dot_product_attention(query, key, value, mask = mask, dropout = self.dropout)

        # Reshape the attended output and concatenate the attention heads
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)

        # Apply a final linear projection to the concatenated output
        x = self.linears[-1](x)

        # Return the attention output together with the attention probabilities,
        # which is the pair EncoderLayer unpacks
        return x, attn

In [None]:
class LayerNorm(nn.Module):
    """
    Hand-written layer normalization with a learnable per-feature gain and bias.

    Each vector along the last dimension is centred on its mean and divided by
    its standard deviation plus a small epsilon, then scaled and shifted.

    Reference: "Layer Normalization", Ba, Kiros and Hinton,
    https://arxiv.org/abs/1607.06450
    """

    def __init__(self, features, eps = eps):
        """
        Set up the learnable gain/bias and the stabilising constant.

        Args:
            features (int): Size of the last dimension of the tensors to normalize.
            eps (float): Constant added to the standard deviation before dividing.
                         Defaults to the notebook-level `eps` value.
        """
        super().__init__()

        # Constant that keeps the division well-defined when the std is zero
        self.eps = eps

        # Gain starts at one and bias at zero, so the initial transform is a plain normalization
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """
        Normalize `x` over its last dimension.

        Args:
            x (torch.Tensor): Input tensor whose last dimension has `features` entries.

        Returns:
            torch.Tensor: Tensor of the same shape as `x`.
        """
        # Per-vector statistics over the feature dimension (kept for broadcasting)
        centre = x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True)

        # Standardize, then apply the learnable scale and shift
        return self.a_2 * ((x - centre) / (spread + self.eps)) + self.b_2

In [None]:
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward network applied to the last dimension of its input:
    Linear(d_model -> d_ffn), ReLU, dropout, Linear(d_ffn -> d_model).
    """

    def __init__(self, d_model: int, d_ffn: int, dropout: float):
        """
        Build the expand/contract projections and the dropout layer.

        Args:
            d_model (int): Size of the input and output feature dimension.
            d_ffn (int): Size of the hidden feature dimension.
            dropout (float): Dropout probability applied after the activation.
        """
        super().__init__()

        # Expansion: d_model -> d_ffn
        self.w_1 = nn.Linear(d_model, d_ffn)

        # Contraction: d_ffn -> d_model
        self.w_2 = nn.Linear(d_ffn, d_model)

        # Regularization between the two projections
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Apply the feed-forward network to `x`.

        Args:
            x (torch.Tensor): Input tensor whose last dimension is d_model,
                              e.g. (batch_size, seq_length, d_model).

        Returns:
            torch.Tensor: Tensor of the same shape as `x`.
        """
        # Expand, activate, regularize, then project back to d_model
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))

In [None]:
class EncoderLayer(nn.Module):
    """
    One encoder block: self-attention followed by a position-wise feed-forward
    network, each wrapped in dropout, a residual connection and a LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ffn, dropout):
        """
        Build the sublayers of the encoder block.

        Args:
            d_model (int): Feature dimension of the layer's input and output.
            num_heads (int): Number of attention heads.
            d_ffn (int): Hidden dimension of the feed-forward network.
            dropout (float): Dropout probability shared by all sublayers.
        """
        super().__init__()
        self.self_attn = MultiHeadedAttention(num_heads, d_model, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ffn, dropout)

        # Two normalization modules: index 0 follows attention, index 1 follows the feed-forward net
        self.sublayer = nn.ModuleList([LayerNorm(d_model) for _ in range(2)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        Run the encoder block on `x`.

        Args:
            x (torch.Tensor): Input of shape (batch_size, sequence_length, d_model).
            mask (torch.Tensor): Attention mask handed to the self-attention module,
                                 shape (batch_size, 1, sequence_length, sequence_length).

        Returns:
            torch.Tensor: Output of the encoder block.
        """
        # Self-attention; the attention probabilities are not needed here
        attended, _ = self.self_attn(x, x, x, mask)

        # Residual connection around attention (dropout on the branch), then normalize
        x = self.sublayer[0](x + self.dropout(attended))

        # Residual connection around the feed-forward net (dropout on the branch), then normalize
        transformed = self.feed_forward(x)
        return self.sublayer[1](x + self.dropout(transformed))

In [None]:
class StackedEncoder(nn.Module):
    """
    A sequence of identical encoder layers applied one after another,
    followed by a final normalization.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ffn, dropout):
        """
        Build the stack of encoder layers.

        Args:
            num_layers (int): How many encoder layers to chain.
            d_model (int): Feature width going into and out of every layer.
            num_heads (int): Attention head count used by every layer.
            d_ffn (int): Feed-forward hidden width used by every layer.
            dropout (float): Dropout probability used by every layer.
        """
        super().__init__()
        encoder_stack = nn.ModuleList()
        for _ in range(num_layers):
            encoder_stack.append(EncoderLayer(d_model, num_heads, d_ffn, dropout))
        self.layers = encoder_stack
        self.norm = LayerNorm(d_model)

    def forward(self, x, mask):
        """
        Feed the input through every encoder layer in order.

        Args:
            x (torch.Tensor): Input of shape (batch_size, sequence_length, d_model).
            mask: Attention mask handed unchanged to every layer
                (documented shape: (batch_size, 1, sequence_length, sequence_length)).

        Returns:
            torch.Tensor: The normalized output of the last encoder layer.
        """
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        # Final normalization on top of the last layer's output
        return self.norm(hidden)

In [None]:
# Build the encoder stack from hyper-parameters defined outside this cell
stacked_encoder = StackedEncoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=h,
    d_ffn=d_ffn,
    dropout=dropout,
)

# Place the encoder's parameters on the selected device
stacked_encoder = stacked_encoder.to(device)

# Show the module hierarchy of the encoder
print('Below is the complete encoder: \n')
print(stacked_encoder)

In [None]:
# Shell escape: run the NVIDIA command-line tool to report GPU status (requires an NVIDIA driver on the host)
!nvidia-smi

In [None]:
# Encode every batch from `data_loader` with `stacked_encoder` and collect the
# results (plus their labels) into two tensors: `total_encoded_batch` and
# `total_y_batch`.
total_encoded_batches = []  # List to store encoded batches
total_y_batches = []        # List to store corresponding y batches

i = 0  # Running batch counter (stays 0 if the loader yields nothing)

# torch.no_grad() only disables gradient tracking; dropout layers stay active
# while the module is in training mode. Switch to eval mode for the encoding
# pass so the stored representations are deterministic, and restore the
# previous mode afterwards so later cells see the module unchanged.
was_training = stacked_encoder.training
stacked_encoder.eval()

try:
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(data_loader, start=1):
            print(f'Size of batch: {x_batch.shape}')
            print(i)
            print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")

            # Move the batch to the device
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            # Encode the batch using the stacked_encoder (no attention mask)
            encoded_batch = stacked_encoder(x_batch, mask = None)

            # Keep the encoded batch and its labels.
            # NOTE: both stay on `device`, so memory use grows with every batch.
            total_encoded_batches.append(encoded_batch)
            total_y_batches.append(y_batch)

            print(f'Current Size of Reviews: {len(total_encoded_batches)} tensors')
            print(f'Current Size of Ratings: {len(total_y_batches)} tensors')
finally:
    stacked_encoder.train(was_training)

# Concatenate all the encoded batches into a single tensor
total_encoded_batch = torch.cat(total_encoded_batches, dim = 0)

# Concatenate all the corresponding y batches into a single tensor
total_y_batch = torch.cat(total_y_batches, dim = 0)

In [None]:
# Shapes of the concatenated encoder outputs and of their labels
print(total_encoded_batch.shape, total_y_batch.shape, '\n', sep='\n')

# Storage footprint in bytes (number of elements x bytes per element)
print(
    f'total_encoded_batch in bytes: {total_encoded_batch.numel() * total_encoded_batch.element_size()}',
    f'total_y_batch in bytes: {total_y_batch.numel() * total_y_batch.element_size()}',
    '\n',
    sep='\n',
)

# Device on which each tensor currently lives
print(
    f"The Encoded batch is on: {total_encoded_batch.device}",
    f"The Y batch is on: {total_y_batch.device}",
    '\n',
    sep='\n',
)

Possible training approaches include masked-language modeling, next-sentence prediction, permutation language modeling, and contrastive learning.

1. Visualization: You can visualize the input and output tensors to see if there are any noticeable patterns or differences. Techniques like heatmaps, scatter plots, or dimensionality reduction methods (e.g., t-SNE, PCA) can help you visualize high-dimensional data in a more interpretable way.

2. Comparing input and output: You can directly compare the input and output tensors element-wise to see how the values have changed. This can give you an idea of the magnitude and direction of the transformations applied by the encoder.

3. Analyzing attention weights: Because the transformer encoder uses attention mechanisms, you can examine the attention weights to understand which parts of the input the model is focusing on. Higher attention weights indicate that the model is paying more attention to those specific positions or features.

4. Probing the learned representations: You can train a separate model (e.g., a classifier or regressor) on top of the encoded representations to see if they capture meaningful information for a downstream task. If the model performs well, it suggests that the encoder has learned useful representations.

5. Ablation studies: You can systematically remove or modify certain components of the transformer encoder (e.g., attention layers, normalization) and observe how the output changes. This can help you understand the role and impact of each component on the encoding process.

6. Analyzing the distribution of values: You can compute statistics like mean, variance, and range of the input and output tensors to see how the distribution of values has changed. This can provide insights into the overall effect of the encoder on the data.

7. Gradient-based methods: If you have access to the gradients of the encoder, you can use techniques like saliency maps or gradient-based attribution methods to identify which input features have the most influence on the output.

Keep in mind that interpreting the behavior of deep learning models can be challenging, especially for complex architectures like transformers. It often requires a combination of different analysis techniques and domain knowledge to gain meaningful insights.

Remember to normalize or scale the input and output tensors appropriately before visualization or comparison, as the raw floating-point values may have different scales and ranges.