<a href="https://colab.research.google.com/github/git-ekeh/custom_dsw_transformer/blob/main/DSW_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

class WindowedAttention(Layer):
    '''
    Multi-head attention in which each query position may only attend to the
    key positions allowed by `create_window_mask` (a local, optionally dilated
    window plus optional global positions).

    Args:
        num_heads: number of attention heads; must divide `d_model`.
        d_model: width of the query/key/value projections and of the output.
        window_size: window width forwarded to `create_window_mask`.
        dilation: stride between attended offsets inside the window.
        global_attention_indices: optional iterable of positions forwarded to
            `create_window_mask`.
        dropout_rate: dropout probability applied to the attention weights.
            Defaults to 0.0, i.e. no dropout.

    Raises:
        ValueError: if `d_model` is not divisible by `num_heads`.
    '''
    def __init__(self, num_heads, d_model, window_size, dilation=1, global_attention_indices=None, dropout_rate=0.0):
        super(WindowedAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                'd_model (%r) must be divisible by num_heads (%r)' % (d_model, num_heads))

        self.num_heads = num_heads
        self.d_model = d_model
        self.window_size = window_size
        self.dilation = dilation
        self.global_attention_indices = global_attention_indices

        self.depth = d_model // num_heads  # per-head feature width
        self.wq = tf.keras.layers.Dense(d_model)  # query projection
        self.wk = tf.keras.layers.Dense(d_model)  # key projection
        self.wv = tf.keras.layers.Dense(d_model)  # value projection
        self.dense = tf.keras.layers.Dense(d_model)  # output projection
        # Must be a layer *instance*; storing the bare Dropout class made the
        # attribute unusable as a layer.
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def split_heads(self, x, batch_size):
        '''
        Reshape `(batch, seq_len, d_model)` into
        `(batch, num_heads, seq_len, depth)` so each head is attended separately.
        '''
        seq_len = tf.shape(x)[1]
        x = tf.reshape(x, (batch_size, seq_len, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None, training=None):
        '''
        Compute windowed multi-head attention.

        Args:
            q, k, v: query/key/value tensors, reshaped here as
                `(batch, seq_len, d_model)`. The window mask is built from the
                query length only, so `k`/`v` are expected to have the same
                sequence length as `q` (self-attention).
            mask: accepted for interface compatibility; it is not used. The
                attention pattern comes solely from `create_window_mask`.
            training: forwarded to the attention-weight dropout.

        Returns:
            Tensor of shape `(batch, seq_len, d_model)`.
        '''
        batch_size = tf.shape(q)[0]
        seq_len = tf.shape(q)[1]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # `allowed` is True where attention IS permitted.
        allowed = create_window_mask(seq_len, self.window_size, self.dilation, self.global_attention_indices)
        # Turn it into a penalty that is 1 where attention is NOT permitted.
        # Shape (1, 1, seq_len, seq_len) broadcasts over batch and heads.
        blocked = 1.0 - tf.cast(allowed, dtype=tf.float32)
        blocked = blocked[tf.newaxis, tf.newaxis, :, :]

        scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.depth, tf.float32))
        # A large negative logit drives the softmax weight of blocked positions to ~0.
        scaled_attention_logits += (blocked * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        attention_weights = self.dropout(attention_weights, training=training)

        output = tf.matmul(attention_weights, v)  # (batch, heads, seq_len, depth)
        output = tf.transpose(output, perm=[0, 2, 1, 3])  # (batch, seq_len, heads, depth)
        output = tf.reshape(output, (batch_size, seq_len, self.d_model))

        return self.dense(output)


def create_window_mask(seq_len, window_size, dilation=1, global_attention_indices=None):
    '''
    Build the boolean attention pattern for windowed / dilated attention.

    Entry `[i, j]` of the result is True when position `i` may attend to
    position `j`, i.e. when

      * `|i - j|` is a multiple of `dilation` and at most
        `(window_size // 2) * dilation` (so each position sees itself plus up
        to `window_size // 2` dilated neighbours on either side), or
      * `i` or `j` is listed in `global_attention_indices` (a global position
        attends to every position and every position attends to it).

    Args:
        seq_len: sequence length; a Python int or a scalar integer tensor.
        window_size: window width; only `window_size // 2` is used.
        dilation: positive integer stride between attended offsets.
        global_attention_indices: optional iterable of integer positions.

    Returns:
        Boolean tensor of shape `(seq_len, seq_len)`.

    Raises:
        ValueError: if `dilation` is smaller than 1.
    '''
    if dilation < 1:
        raise ValueError('dilation must be a positive integer, got %r' % (dilation,))

    half_window = window_size // 2

    positions = tf.range(seq_len)
    # offset[i, j] = |i - j|
    offset = tf.abs(positions[:, tf.newaxis] - positions[tf.newaxis, :])
    within_span = tf.less_equal(offset, half_window * dilation)
    on_stride = tf.equal(offset % dilation, 0)
    mask = tf.logical_and(within_span, on_stride)

    if global_attention_indices is not None and len(global_attention_indices) > 0:
        global_ids = tf.cast(
            tf.convert_to_tensor(list(global_attention_indices)), positions.dtype)
        # is_global[p] is True when position p is one of the global indices.
        is_global = tf.reduce_any(
            tf.equal(positions[:, tf.newaxis], global_ids[tf.newaxis, :]), axis=-1)
        # Open the full row (global token attends everywhere) and the full
        # column (every token attends to the global token).
        mask = tf.logical_or(mask, is_global[:, tf.newaxis])
        mask = tf.logical_or(mask, is_global[tf.newaxis, :])

    return mask


class BaseAttention(tf.keras.layers.Layer):
    '''
    Common pieces shared by attention sub-layers: a multi-head attention
    layer (`mha`), a layer normalisation (`layernorm`) and a residual adder
    (`add`). Subclasses supply `call`.
    '''
    def __init__(self, **kwargs):
        super().__init__()
        # All keyword arguments configure the wrapped MultiHeadAttention;
        # none are passed to the base Layer constructor.
        mha_options = dict(kwargs)
        self.mha = tf.keras.layers.MultiHeadAttention(**mha_options)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

'''
The cross-attention layer:

---> At the center of the Transformer is the cross-attention layer. This layer connects the
     encoder and decoder, and it is the most straightforward use of attention in the model.

---> To implement it, pass the target sequence x as the query and the context sequence as the
     key/value when calling the mha layer.
'''

class CrossAttention(BaseAttention):
    '''
    Attention from a target sequence onto a context sequence, followed by a
    residual connection and layer normalisation.
    '''
    def call(self, x, context):
        '''
        Attend from `x` (query) to `context` (key and value) and return the
        normalised sum of `x` and the attention output.
        '''
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Keep the most recent attention scores on the layer for later plotting.
        self.last_attn_scores = scores

        residual = self.add([x, attended])
        return self.layernorm(residual)



#mask = create_window_mask(seq_len=100, window_size=10, dilation=2, global_attention_indices=[0,50])
#attention_layer = WindowedAttention(num_heads=8, d_model=512, window_size=10, dilation=2, global_attention_indices=[0,50])


In [None]:
class FeedForward(tf.keras.layers.Layer):
    '''
    Feed-forward sub-layer: Dense(dff, relu) -> Dense(d_model) -> Dropout,
    wrapped in a residual connection and layer normalisation.
    '''
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        expand = tf.keras.layers.Dense(dff, activation='relu')
        project = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([expand, project, drop])

        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        '''Return layer_norm(x + seq(x)).'''
        transformed = self.seq(x)
        summed = self.add([x, transformed])
        return self.layer_norm(summed)


In [None]:
!pip3 install spacy

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy pipeline once at module level; the TextRank4Keyword methods
# below read this global `nlp` (for its vocab and to parse text).
nlp = spacy.load('en_core_web_sm')

'''
- Needs to take in the source documents and vectorize them
- Then choose the top keywords in the document
- Use those as guidance signals for training
- for each source document, run it through the TextRank for keyword extraction
'''

class TextRank4Keyword():
    '''
    Extract keywords from text with a TextRank-style ranking.

    Call `analyze(text)` first; it stores a `{word: weight}` mapping in
    `self.node_weight`. `get_keywords(n)` then returns the `n` best words.
    '''

    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # maximum number of iteration steps
        self.node_weight = None # {keyword: weight}, filled in by analyze()

    def set_stopwords(self, stopwords):
        '''
        Flag spaCy's default stop words plus `stopwords` as stop words.

        Note: this mutates the vocabulary of the module-level `nlp` pipeline.
        '''
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        '''
        Return one list of words per sentence of `doc`, keeping only tokens
        whose POS tag is in `candidate_pos` and that are not stop words.
        Words are lower-cased when `lower` is truthy.
        '''
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        '''Map every distinct word to an index, in order of first appearance.'''
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                # len(vocab) is the next free index.
                vocab.setdefault(word, len(vocab))
        return vocab

    def get_token_pairs(self, window_size, sentences):
        '''
        Return the distinct ordered pairs `(word, later_word)` where
        `later_word` occurs fewer than `window_size` positions after `word`
        in the same sentence. Pairs are listed in order of first occurrence.
        '''
        token_pairs = []
        seen = set()  # O(1) membership test instead of scanning the list
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        '''Return `a + a.T` with the diagonal counted only once.'''
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        '''
        Build the symmetric co-occurrence matrix for `token_pairs` and
        normalise each column to sum to 1. Columns that sum to 0 stay 0.
        '''
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        g = self.symmetrize(g)

        norm = np.sum(g, axis=0)
        # `out` must be supplied: with `where=` alone the cells of zero-sum
        # columns are left uninitialised.
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        '''
        Return at most `number` keywords, highest weight first.

        Returns an empty list when `analyze` has not produced any weights.
        '''
        if not self.node_weight:
            return []
        ranked = sorted(self.node_weight.items(), key=lambda item: item[1], reverse=True)
        return [str(word) for word, _ in ranked[:max(number, 0)]]

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        '''
        Rank the words of `text` and store the result in `self.node_weight`.

        Args:
            text: raw text, parsed with the module-level `nlp` pipeline.
            candidate_pos: POS tags of the tokens to keep as candidates.
            window_size: co-occurrence window used by `get_token_pairs`.
            lower: lower-case the candidate words when truthy.
            stopwords: extra stop words, added to spaCy's defaults.
        '''
        self.set_stopwords(stopwords)

        doc = nlp(text)

        # list of list of words, one inner list per sentence
        sentences = self.sentence_segment(doc, candidate_pos, lower)
        vocab = self.get_vocab(sentences)
        token_pairs = self.get_token_pairs(window_size, sentences)
        g = self.get_matrix(vocab, token_pairs)

        # Every word starts with weight 1.
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or `steps` is reached.
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        self.node_weight = {word: pr[index] for word, index in vocab.items()}


In [None]:
def positional_encoding(length, depth):
    '''
    Build a sinusoidal positional-encoding table.

    The first half of the last axis holds the sines and the second half the
    cosines of `position / 10000**(i / (depth/2))`. The result is returned as
    a float32 tensor with `length` rows.
    '''
    half_depth = depth / 2

    position_column = np.arange(length)[:, np.newaxis]                # (length, 1)
    scaled_index_row = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, depth/2)

    rates = 1 / (10000**scaled_index_row)   # (1, depth/2)
    angles = position_column * rates        # (length, depth/2)

    sines = np.sin(angles)
    cosines = np.cos(angles)
    table = np.concatenate([sines, cosines], axis=-1)

    return tf.cast(table, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    '''
    Token embedding (with id 0 treated as padding via `mask_zero=True`)
    scaled by sqrt(d_model) and summed with a precomputed positional
    encoding table of 2048 positions.
    '''
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        '''Delegate mask computation to the wrapped embedding layer.'''
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        '''Embed token ids `x` and add the positional encoding.'''
        seq_len = tf.shape(x)[1]
        # The sqrt(d_model) factor sets the relative scale of the embedding
        # and the positional encoding.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = self.embedding(x) * scale
        positions = self.pos_encoding[tf.newaxis, :seq_len, :]
        return embedded + positions


In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    '''
    One encoder block: windowed self-attention followed by a feed-forward
    sub-layer.

    Args (keyword-only):
        d_model: model width.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward sub-layer.
        window_size: attention window width.
        dilation: attention window dilation.
        global_attention_indices: optional positions given global attention;
            forwarded to the attention layer.
        dropout_rate: dropout rate of the feed-forward sub-layer.
    '''
    def __init__(self, *, d_model, num_heads, dff, window_size, dilation=1, global_attention_indices=None, dropout_rate=0.1):
        super().__init__()

        # Forward global_attention_indices; it used to be accepted here and
        # then silently dropped.
        self.windowed_attention = WindowedAttention(
            num_heads=num_heads,
            d_model=d_model,
            window_size=window_size,
            dilation=dilation,
            global_attention_indices=global_attention_indices)

        # Forward dropout_rate so the constructor argument actually takes effect.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, mask=None):
        '''Apply self-attention (q = k = v = x) and then the feed-forward block.'''
        x = self.windowed_attention(x, k=x, v=x, mask=mask)
        x = self.ffn(x)
        return x

class Encoder(tf.keras.layers.Layer):
    '''
    Encoder stack: positional embedding, dropout, then `num_layers`
    EncoderLayer blocks applied in sequence.
    '''

    def __init__(self, *, num_layers, d_model, num_heads, dff, window_size, dilation=1, global_attention_indices=None, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)

        # Every block in the stack is built with the same configuration.
        layer_config = dict(
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            window_size=window_size,
            dilation=dilation,
            global_attention_indices=global_attention_indices,
            dropout_rate=dropout_rate)
        self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, mask=None):
        '''
        Encode token ids `x` of shape (batch, seq_len) into a tensor of shape
        (batch, seq_len, d_model).
        '''
        hidden = self.pos_embedding(x)
        hidden = self.dropout(hidden)

        for layer in self.enc_layers:
            hidden = layer(hidden, mask=mask)

        return hidden


In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    '''
    One decoder block: windowed self-attention, two cross-attention
    sub-layers (first over `context0`, then over `context1`) and a
    feed-forward sub-layer.

    Args (keyword-only):
        d_model: model width.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward sub-layer.
        window_size: self-attention window width.
        dilation: self-attention window dilation.
        dropout_rate: dropout rate used by both cross-attention layers and
            the feed-forward sub-layer.
    '''
    def __init__(self, *, d_model, num_heads, dff, window_size, dilation, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.windowed_self_attention = WindowedAttention(
            num_heads=num_heads,
            d_model=d_model,
            window_size=window_size,
            dilation=dilation)

        self.cross_attention0 = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.cross_attention1 = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate; previously the feed-forward block always used
        # its own default regardless of this argument.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, context0, context1, mask=None, global_attention_indices=None):
        '''
        Args:
            x: decoder hidden states.
            context0: encoder output attended to first (the guidance signal).
            context1: encoder output attended to second (the source document).
            mask: passed through to the self-attention layer.
            global_attention_indices: accepted for interface compatibility;
                not used by this layer.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        '''
        # NOTE(review): no causal mask is applied to the self-attention here;
        # confirm that attending to later target positions is intended.
        x = self.windowed_self_attention(x, x, x, mask)

        guided = self.cross_attention0(x=x, context=context0)
        grounded = self.cross_attention1(x=guided, context=context1)

        return self.ffn(grounded)


class Decoder(tf.keras.layers.Layer):
    '''
    Decoder stack: positional embedding, dropout, then `num_layers`
    DecoderLayer blocks applied in sequence.

    `global_attention_indices` is accepted by the constructor but not used
    when building the layers.
    '''
    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, window_size, dilation, global_attention_indices=None, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        # Every block in the stack is built with the same configuration.
        layer_config = dict(
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            window_size=window_size,
            dilation=dilation,
            dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_config) for _ in range(num_layers)]

    def call(self, x, context0, context1, mask=None, global_attention_indices=None):
        '''
        Decode token ids `x` of shape (batch, target_seq_len) against the two
        encoder outputs; returns (batch_size, target_seq_len, d_model).
        '''
        hidden = self.pos_embedding(x)
        hidden = self.dropout(hidden)

        for layer in self.dec_layers:
            hidden = layer(
                hidden, context0, context1,
                mask=mask,
                global_attention_indices=global_attention_indices)

        return hidden


In [None]:
class Transformer(tf.keras.Model):
    '''
    Guided encoder-decoder model with two encoders (source document and
    guidance signal) feeding one decoder.

    Args (keyword-only):
        num_layers, d_model, num_heads, dff: size of every encoder/decoder stack.
        input_vocab_size: vocabulary size of both encoders.
        target_vocab_size: vocabulary size of the decoder and output layer.
        window_size, dilation: windowed-attention settings, shared by the
            encoders and the decoder.
        dropout_rate: dropout rate used throughout.
    '''
    def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, window_size, dilation, dropout_rate=0.1):
        super().__init__()
        # `dilation` is forwarded to the encoders as well; previously only the
        # decoder received it and the encoders always used their default.
        encoder_config = dict(
            num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
            vocab_size=input_vocab_size, window_size=window_size,
            dilation=dilation, dropout_rate=dropout_rate)
        self.encoder0 = Encoder(**encoder_config)  # encodes inputs['input_ids']
        self.encoder1 = Encoder(**encoder_config)  # encodes inputs['guidance_inputs']

        self.decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, window_size=window_size, dilation=dilation, dropout_rate=dropout_rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        '''
        Args:
            inputs: mapping with the keys 'input_ids', 'guidance_inputs' and
                'decoder_input_ids'.

        Returns:
            Logits of shape (batch_size, target_len, target_vocab_size).
        '''
        document = inputs['input_ids']
        guidance = inputs['guidance_inputs']
        x = inputs['decoder_input_ids']

        document_context = self.encoder0(document)
        guidance_context = self.encoder1(guidance)

        # The decoder layers attend to their first context as the guidance
        # signal and to their second as the source document, so the guidance
        # encoding must be passed first.
        x = self.decoder(x, guidance_context, document_context, global_attention_indices=None) # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(x) # (batch_size, target_len, target_vocab_size)

        try:
            # Drop the keras mask, so it does not scale the losses/metrics
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            # No mask was attached; nothing to remove.
            pass

        return logits


In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    '''
    Replacement for `locale.getpreferredencoding` that always reports UTF-8.
    `do_setlocale` is accepted for signature compatibility and ignored.
    '''
    forced_encoding = "UTF-8"
    return forced_encoding


# Install the replacement so later calls to locale.getpreferredencoding()
# return "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip3 install datasets
!pip3 install transformers
!pip3 install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224

In [None]:
import pickle
import tensorflow as tf
import datasets
import transformers
from transformers import DataCollatorWithPadding
from transformers import BertTokenizerFast, AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import load_dataset, Dataset
import pandas as pd
import collections
import itertools
import os

# Open the training file in read-binary mode.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
with open('train_dict.pkl', 'rb') as f:
    # Load the pickled training object from the file
    unpickled_obj_train = pickle.load(f)

with open('val_dict.pkl','rb') as f:
    # Load the pickled validation set from the pickled file
    unpickled_obj_val = pickle.load(f)

'''
My pickling job wasn't perfect but WE MOVE
All 204045 examples were preprocessed into
one file train_dict.pkl

Ideal
-----
To accomodate I will split the examples
into train: 70, val: 20, test: 10 split

The data type is a dictionary

keys are strings of integers

Current
-------
Going to train using train_dict
for both test and val
'''

# Convert each dictionary to a dataframe; they are transposed below
# (presumably so that each example becomes a row -- verify against the pickles).
df_train = pd.DataFrame.from_dict(unpickled_obj_train)

df_val = pd.DataFrame.from_dict(unpickled_obj_val)
# Transpose the dataframes
transpose_df_train = df_train.T
transpose_df_val = df_val.T
# Convert the column values to lists to avoid error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
transpose_df_train['id'] = transpose_df_train['id'].values.tolist()
transpose_df_val['title'] = transpose_df_val['title'].values.tolist()
# Convert the dataframes to Datasets, dropping the pandas index column and the
# one column ('id' for train, 'title' for validation) that is not used later.
dataset_train = Dataset.from_pandas(transpose_df_train)
dataset_train = dataset_train.remove_columns("__index_level_0__")
dataset_train = dataset_train.remove_columns("id")

dataset_val = Dataset.from_pandas(transpose_df_val)
dataset_val = dataset_val.remove_columns("__index_level_0__")
dataset_val = dataset_val.remove_columns("title")

# Tokenization: reuse BERT's [CLS] / [SEP] tokens as the BOS / EOS tokens.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token


# batch_size is the chunk size used by the Dataset.map() calls below; the
# to_tf_dataset() calls set their own batch_size.
batch_size = 32 # this is for full training
encoder_max_length = 512  # max token length for documents and guidance
decoder_max_length = 512  # max token length for summaries
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

def process_data_to_model_inputs(batch):
    '''
    Tokenize one batch of examples and attach the model input columns.

    Reads the "document", "summary" and "guidance" columns of `batch`, adds
    the token-id / attention-mask columns for each, and returns `batch`.
    '''
    shared_options = dict(padding="max_length", truncation=True)

    document_enc = tokenizer(batch["document"], max_length=encoder_max_length, **shared_options)
    summary_enc = tokenizer(batch["summary"], max_length=decoder_max_length, **shared_options)

    # Each example's guidance strings are joined into one string with [SEP].
    joined_guidance = []
    for guidance_items in batch["guidance"]:
        joined_guidance.append("[SEP]".join(guidance_items))
    guidance_enc = tokenizer(joined_guidance, max_length=encoder_max_length, **shared_options)

    batch["input_ids"] = document_enc.input_ids
    batch["attention_mask"] = document_enc.attention_mask
    batch["guidance_inputs"] = guidance_enc.input_ids
    batch["guidance_attention_mask"] = guidance_enc.attention_mask
    batch["decoder_input_ids"] = summary_enc.input_ids
    batch["decoder_attention_mask"] = summary_enc.attention_mask

    # Labels are a per-row copy of the summary token ids; PAD tokens are kept
    # as-is (they are not replaced by -100).
    # NOTE(review): labels are not shifted relative to decoder_input_ids --
    # confirm the consuming model performs the shift.
    batch["labels"] = [list(row) for row in summary_enc.input_ids]

    return batch

# Tokenize the training set in chunks of `batch_size` examples.
dataset_train = dataset_train.map(process_data_to_model_inputs,
                                  batched=True,
                                  batch_size = batch_size)

def process_data_to_model_inputs_billsum(batch):
    '''
    Tokenize one batch of examples whose source text lives in the "text"
    column and attach the model input columns.

    Reads the "text", "summary" and "guidance" columns of `batch`, adds the
    token-id / attention-mask columns for each, and returns `batch`.
    '''
    shared_options = dict(padding="max_length", truncation=True)

    text_enc = tokenizer(batch["text"], max_length=encoder_max_length, **shared_options)
    summary_enc = tokenizer(batch["summary"], max_length=decoder_max_length, **shared_options)

    # Each example's guidance strings are joined into one string with [SEP].
    joined_guidance = []
    for guidance_items in batch["guidance"]:
        joined_guidance.append("[SEP]".join(guidance_items))
    guidance_enc = tokenizer(joined_guidance, max_length=encoder_max_length, **shared_options)

    batch["input_ids"] = text_enc.input_ids
    batch["attention_mask"] = text_enc.attention_mask
    batch["guidance_inputs"] = guidance_enc.input_ids
    batch["guidance_attention_mask"] = guidance_enc.attention_mask
    batch["decoder_input_ids"] = summary_enc.input_ids
    batch["decoder_attention_mask"] = summary_enc.attention_mask

    # Labels are a per-row copy of the summary token ids; PAD tokens are kept
    # as-is (they are not replaced by -100).
    # NOTE(review): labels are not shifted relative to decoder_input_ids --
    # confirm the consuming model performs the shift.
    batch["labels"] = [list(row) for row in summary_enc.input_ids]

    return batch


# Tokenize the validation set in chunks of `batch_size` examples.
dataset_val = dataset_val.map(process_data_to_model_inputs_billsum,
                                  batched=True,
                                  batch_size = batch_size)

# Convert the tokenized training data into a TensorFlow dataset. "labels" is
# the label column; batches here hold 2 examples (not `batch_size`).
dataset_train = dataset_train.to_tf_dataset(
    columns=["input_ids","attention_mask","guidance_inputs","guidance_attention_mask","decoder_input_ids", "decoder_attention_mask", "labels"],
    label_cols=["labels"],
    batch_size = 2,
    collate_fn=data_collator,
    shuffle=True
)
'''
# Saving this tensorflow datasets
saved_dataset_train_dir = 'saved_train_tensors'
if not os.path.exists(saved_dataset_train_dir):
    os.makedirs(saved_dataset_train_dir)
tf.data.experimental.save(
                            dataset_train,
                            saved_dataset_train_dir,
                            compression='GZIP'
)

'''

# Same conversion for the validation data.
# NOTE(review): shuffle=True is also set for validation -- confirm this is intended.
dataset_val = dataset_val.to_tf_dataset(
    columns=["input_ids","attention_mask","guidance_inputs","guidance_attention_mask","decoder_input_ids", "decoder_attention_mask", "labels"],
    label_cols=["labels"],
    batch_size = 2,
    collate_fn=data_collator,
    shuffle=True
)






Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
from rouge_score import rouge_scorer

'''
# Saving this tensorflow datasets

saved_dataset_validation_dir = 'saved_validation_tensors'
if not os.path.exists(saved_dataset_validation_dir):
    os.makedirs(saved_dataset_validation_dir)
tf.data.experimental.save(
                            dataset_val,
                            saved_dataset_validation_dir,
                            compression='GZIP'
)

'''


# Define Hyperparameters
num_layers = 4
d_model = 128
num_heads = 8
dff = 512
# The encoders and the decoder share the tokenizer's vocabulary.
input_vocab_size = tokenizer.vocab_size
target_vocab_size = tokenizer.vocab_size
window_size = 130 # changed from 128
dilation = 1
dropout_rate = 0.1
epochs = 10
'''
# Creating Rouge Score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Define a custom function that computes the ROUGE score between y_true and y_pred
def compute_rouge(y_true, y_pred):
    # Convert the y_true and y_pred tensors to Python lists of strings
    true_texts = tokenizer.batch_decode(y_true.np(), skip_special_tokens=True)
    pred_texts = tokenizer.batch_decode(y_pred.np(), skip_special_tokens=True)

    # Compute the ROUGE scores using the RougeScorer object
    rouge_scores = scorer.score(true_texts, pred_texts)

    # Return a dictionary of ROUGE scores
    return {'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure}
'''
# Define the metrics to use during training and validation
metrics = ['accuracy']

# Constructing the model

model = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
                    input_vocab_size=input_vocab_size, target_vocab_size = target_vocab_size,
                    window_size = window_size, dilation=dilation,dropout_rate=dropout_rate)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# The model outputs raw logits, hence from_logits=True.
# NOTE(review): reduction='none' yields unreduced loss values -- confirm that
# this is what compile()/fit() should receive.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
model.compile(optimizer=optimizer, loss=loss_function, metrics=metrics) # can also use any keras loss function

# Train for `epochs` epochs, evaluating on dataset_val during training.
history = model.fit(dataset_train, epochs=epochs, validation_data=dataset_val)



Epoch 1/10

KeyboardInterrupt: ignored

In [None]:
# Metric history recorded by model.fit(); consumed by plot_training_history below.
history_dict = history.history
'''
This should provide a dictionary with keys like:
 - 'loss'
 - 'accuracy'
 - 'val_loss'
 - 'val_accuracy'
'''
import matplotlib.pyplot as plt


def plot_training_history(history_dict):
    '''
    Plot training/validation loss and accuracy side by side.

    Args:
        history_dict: mapping with the keys 'loss', 'val_loss', 'accuracy'
            and 'val_accuracy', each a sequence with one value per epoch.
    '''
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    accuracy = history_dict['accuracy']
    val_accuracy = history_dict['val_accuracy']
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(12, 6))

    # Left panel: loss. The labels and title now describe what is plotted.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: accuracy, which was previously read but never drawn.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracy, 'bo', label='Training accuracy')
    plt.plot(epochs, val_accuracy, 'b', label='Validation accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


plot_training_history(history_dict)