<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/Text_Summarization_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import tensorflow.keras.layers as l
from tensorflow.keras.models import Model, Sequential
import pandas as pd
import numpy as np
import os

In [3]:
from google.colab import files
files.upload() # Browse for the kaggle.json file that you downloaded

# Make directory named kaggle, copy kaggle.json file there, and change the permissions of the file.
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! kaggle datasets download -d pariza/bbc-news-summary --path '/usr/local' --unzip

Saving kaggle.json to kaggle.json
Downloading bbc-news-summary.zip to /usr/local
  0% 0.00/8.91M [00:00<?, ?B/s]
100% 8.91M/8.91M [00:00<00:00, 82.0MB/s]


In [4]:
def generate_articles(path):
    articles = []
    for sub in os.listdir(path):
        article = ''
        try: 
            f = open(os.path.join(path, sub))
            for line in f:
                article += line.strip()
            articles.append(article)
        except:
            article = 'error'
            articles.append(article)
    return articles

In [5]:
texts = []
texts.extend(generate_articles('/usr/local/BBC News Summary/News Articles/business'))
texts.extend(generate_articles('/usr/local/BBC News Summary/News Articles/entertainment'))
texts.extend(generate_articles('/usr/local/BBC News Summary/News Articles/politics'))
texts.extend(generate_articles('/usr/local/BBC News Summary/News Articles/sport'))
texts.extend(generate_articles('/usr/local/BBC News Summary/News Articles/tech'))

summaries = []
summaries.extend(generate_articles('/usr/local/BBC News Summary/Summaries/business'))
summaries.extend(generate_articles('/usr/local/BBC News Summary/Summaries/entertainment'))
summaries.extend(generate_articles('/usr/local/BBC News Summary/Summaries/politics'))
summaries.extend(generate_articles('/usr/local/BBC News Summary/Summaries/sport'))
summaries.extend(generate_articles('/usr/local/BBC News Summary/Summaries/tech'))

In [6]:
def tokenize(text, vocab_size = 10000, lower = True):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size,
                                                      oov_token='<unk>',
                                                      lower=lower,
                                                      filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(text)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    word2idx = tokenizer.word_index
    idx2word = {v:k for k,v in word2idx.items()}
    return word2idx, idx2word, tokenizer

In [7]:
texts = [f'<start> {t} <end>' for t in texts]
summaries = [f'<start> {t} <end>' for t in summaries]
word2idx_text, idx2word_text, text_tokenizer = tokenize(texts)
word2idx_summary, idx2word_summary, summary_tokenizer = tokenize(summaries)

In [8]:
sequence_lengths = np.array([len(s.split()) for s in texts])
print([(p, np.percentile(sequence_lengths, p))
  for p in [75, 80, 90, 95, 99, 100]
])

[(75, 467.0), (80, 500.0), (90, 610.6000000000001), (95, 725.0), (99, 995.2799999999993), (100, 4379.0)]


In [9]:
sequence_lengths = np.array([len(s.split()) for s in summaries])
print([(p, np.percentile(sequence_lengths, p))
  for p in [75, 80, 90, 95, 99, 100]
])

[(75, 204.0), (80, 221.0), (90, 270.0), (95, 315.0), (99, 434.75999999999976), (100, 2075.0)]


In [10]:
print(texts[0])
print(summaries[0])

<start> France Telecom gets Orange boostStrong growth in subscriptions to mobile phone network Orange has helped boost profits at owner France Telecom.Orange added more than five million new customers in 2004, leading to a 10% increase in its revenues. Increased take-up of broadband telecoms services also boosted France Telecom's profits, which showed a 5.5% rise to 18.3bn euros ($23.4bn; £12.5bn). France Telecom is to spend 578m euros on buying out minority shareholders in data services provider Equant.France Telecom, one of the world's largest telecoms and internet service providers, saw its full-year sales rise 2.2% to 47.2bn euros in 2004.Orange enjoyed strong growth outside France and the United Kingdom - its core markets - swelling its subscriber base to 5.4 million. France Telecom's broadband customers also increased, rising to 5.1 million across Europe by the end of the year. The firm said it had met its main strategic objectives of growing its individual businesses and further

In [11]:
max_seqlen_text = 700
text_as_ints = text_tokenizer.texts_to_sequences(texts)
text_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    text_as_ints, maxlen=max_seqlen_text, padding='post')
max_seqlen_summary = 701
summaries_as_ints = summary_tokenizer.texts_to_sequences(summaries)
summaries_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    summaries_as_ints, maxlen = max_seqlen_summary, padding='post')
dataset = tf.data.Dataset.from_tensor_slices((text_as_ints, summaries_as_ints))


In [12]:
    BATCH_SIZE = 64
    BUFFER_SIZE = 1000
    
    dataset = dataset.map(lambda x, y: {"source": x, "target": y})
    dataset = dataset.batch(BATCH_SIZE).shuffle(BUFFER_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
class TokenEmbedding(l.Layer):
  def __init__(self, maxlen, num_vocab = 10000, num_hid = 64):
    super().__init__()
    self.embedding = l.Embedding(num_vocab + 3, num_hid)
    self.pos_embedding = l.Embedding(maxlen, num_hid)
  
  def call(self, x):
    maxlen = tf.shape(x)[-1]
    x = self.embedding(x)
    positions = self.pos_embedding(tf.range(0, maxlen, delta=1))
    return x + positions

In [14]:
class TransformerEncoder(l.Layer):
  def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
    super().__init__()
    self.attn = l.MultiHeadAttention(num_heads, embed_dim)
    self.ffn = tf.keras.Sequential([
        l.Dense(feed_forward_dim, 'relu'),
        l.Dense(embed_dim)                   
    ])
    self.norm1 = l.LayerNormalization(epsilon=1e-6)
    self.norm2 = l.LayerNormalization(epsilon=1e-6)

    self.dropout1 = l.Dropout(rate=rate)
    self.dropout2 = l.Dropout(rate=rate)

  def call(self, inputs, training):
    attn = self.attn(inputs, inputs)
    attn_output = self.dropout1(attn, training=training)
    out1 = self.norm1(inputs + attn)
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output)
    return self.norm2(out1 + ffn_output)

In [15]:
class TransformerDecoder(l.Layer):
  def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
    super().__init__()
    self.norm1 = l.LayerNormalization(epsilon=1e-6)
    self.norm2 = l.LayerNormalization(epsilon=1e-6)
    self.norm3 = l.LayerNormalization(epsilon=1e-6)

    self.self_att = l.MultiHeadAttention(num_heads, embed_dim)
    self.enc_att = l.MultiHeadAttention(num_heads, embed_dim)

    self.self_dropout = l.Dropout(rate)
    self.enc_dropout = l.Dropout(rate)
    self.ffn_dropout = l.Dropout(rate)
    self.ffn = tf.keras.Sequential([
        l.Dense(feed_forward_dim, 'relu'),
        l.Dense(embed_dim)                   
    ])
  def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
      [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)
  
  def call(self, enc_out, target):
    input_shape = tf.shape(target)
    batch_size = input_shape[0]
    seq_len = input_shape[1]
    causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
    target_attn = self.self_att(target, target, attention_mask=causal_mask)
    target_norm = self.norm1(self.self_dropout(target_attn) + target)
    enc_out = self.enc_att(enc_out, target_norm)
    enc_out_norm = self.norm2(self.enc_dropout(enc_out) + target_norm)
    ffn_out = self.ffn(enc_out_norm)
    ffn_out_norm = self.norm3(self.ffn_dropout(ffn_out) + enc_out_norm)
    return ffn_out_norm

In [16]:
class Transformer(tf.keras.Model):
    def __init__(
        self,
        num_hid=128,
        num_head=2,
        num_feed_forward=256,
        source_maxlen=700,
        target_maxlen=700,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10000,
    ):
        super().__init__()
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=source_maxlen, num_hid=num_hid
        )
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = tf.keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = l.Dense(num_classes)

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f'dec_layer_{i}')(enc_out, y)
        return y

    def call(self, inputs):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]
    
    def train_step(self, batch):
        source = batch['source']
        target = batch['target']
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(dec_target, preds, sample_weight=mask)
        train_vars = self.trainable_variables
        gradients = tape.gradient(loss, train_vars)
        self.optimizer.apply_gradients(zip(gradients, train_vars))
        self.loss_metric.update_state(loss)
        return {'loss':self.loss_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}
    def generate(self, source, target_start_token_idx):
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input


In [17]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(128)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [None]:
model = Transformer()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, 
                                                      reduction='none')
model.compile(optimizer = optimizer, 
              loss = loss_object)
history = model.fit(dataset, epochs = 200)

In [23]:
for b in dataset:
    source = b['source']
    target = b['target']
    source = source[0]
    target = target[0][:-1]
    source = np.expand_dims(source, axis=0)
    target = np.expand_dims(target, axis=0)
    out = model([source, target])
    out = tf.argmax(out, axis=-1)
    print([idx2word_summary[i] for i in out.numpy()[0]])
    print([idx2word_summary[i] for i in target[0]])
    break

['apple', 'has', 'unveiled', 'a', 'new', 'low', 'cost', 'macintosh', 'computer', 'for', 'the', 'masses', 'billed', 'as', 'the', 'mac', 'mini', 'the', '499', 'macintosh', 'sold', 'for', '£339', 'in', 'the', 'uk', 'was', 'described', 'by', 'jobs', 'as', 'the', 'most', 'important', 'mac', 'made', 'by', 'apple', 'the', 'smaller', 'ipod', 'will', 'hold', 'about', '120', 'songs', 'said', 'mr', 'jobs', 'mr', 'jobs', 'also', 'unveiled', 'the', 'ipod', '<unk>', 'a', 'new', 'music', 'player', 'using', 'cheaper', 'flash', 'memory', 'rather', 'than', 'hard', 'drives', 'which', 'are', 'used', 'in', 'more', 'expensive', 'ipods', 'in', 'january', 'apple', 'sued', 'a', 'website', 'after', 'it', 'published', 'what', 'it', 'said', 'were', 'specifications', 'for', 'the', 'new', 'computer', 'ian', 'harris', 'deputy', 'editor', 'of', 'uk', 'magazine', 'mac', 'format', 'said', 'the', 'machine', 'would', 'appeal', 'to', 'pc', 'owning', 'consumers', 'who', 'had', 'purchased', 'an', 'ipod', 'the', 'new', 'comp

In [20]:
import pickle

with open('input_tokenizer.pickle', 'wb') as handle:
    pickle.dump(text_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('output_tokenizer.pickle', 'wb') as handle:
    pickle.dump(summary_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
model.save_weights('summary.h5')