In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.train import Checkpoint, CheckpointManager
from tensorflow.keras.metrics import Mean

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files under
# /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive


/content/drive/MyDrive


### Loading Data

In [None]:
# Load the first two columns of 'Sheet1' from the Inshorts workbook on Drive.
# Later cells read these columns as 'Headline' and 'Short'.
data = pd.read_excel('/content/drive/MyDrive/Inshorts Cleaned Data.xlsx', sheet_name='Sheet1', usecols=[0,1])

In [None]:
# Show the raw 'Short' text at index 100, before clean_words is applied.
data['Short'][100]

'Indian spinner Ravichandran Ashwin has broken the record for most wickets in a single Test season, taking his 79th this season during the Dharamsala Test against Australia on Saturday. Ashwin went past South African pacer Dale Steyn, who had claimed 78 wickets in 12 Tests in 2007-08. Ashwin has taken seven five-wicket hauls this season in 13 matches.'

In [None]:
import unicodedata
def clean_words(sentence):
    """Normalize one piece of text into lowercase, space-separated ASCII words.

    Steps, in order: lowercase, strip non-ASCII characters (after NFKD
    decomposition), replace literal ``\\n`` escape sequences with a space,
    drop URLs, drop digits / punctuation / the standalone word "cnn",
    collapse whitespace runs, and trim both ends.

    Args:
        sentence: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    sentence = str(sentence).lower()
    sentence = unicodedata.normalize('NFKD', sentence).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Literal "\n" escape sequences (backslash + n) must be handled BEFORE the
    # punctuation pass below: that pass deletes every backslash, after which
    # this pattern could never match and a stray "n" would be glued to the
    # neighbouring word. A space keeps the two sides as separate words.
    sentence = re.sub(r"\\n", " ", sentence)

    # Remove URLs
    sentence = re.sub(r"http[s]?://\S+", "", sentence)

    # Remove special characters, digits and the standalone token "cnn"
    sentence = re.sub(r"[\[\]\\0-9()\"$#%/@;:<>{}`+=~|.!?,-]|\bcnn\b", "", sentence)

    # Remove remaining special symbols
    sentence = re.sub(r"[&…•♦◆★☆■□▪▫▶◀▲▼]", "", sentence)

    # Collapse whitespace runs (including real newlines and tabs) to one space
    sentence = re.sub(r"\s+", " ", sentence)

    # Trim leading and trailing whitespace
    return sentence.strip()

In [None]:
# Clean both columns with clean_words: 'Short' holds the article body (model
# input) and 'Headline' holds the headline used as the target summary.
news = data['Short'].apply(clean_words)
summary = data['Headline'].apply(clean_words)

In [None]:
# Show the cleaned article/headline pair at index 100.
news[100], summary[100]

('indian spinner ravichandran ashwin has broken the record for most wickets in a single test season taking his th this season during the dharamsala test against australia on saturday ashwin went past south african pacer dale steyn who had claimed wickets in tests in ashwin has taken seven fivewicket hauls this season in matches',
 'ashwin breaks record for most wickets in a test season')

### Preprocessing

In [None]:
def add_tokens(x):
    """Wrap a cleaned headline in the decoder's start/end markers.

    The function is idempotent: text that already carries both markers is
    returned unchanged, so re-running the notebook cell does not stack
    markers on top of each other.

    Args:
        x: Cleaned headline text.

    Returns:
        ``'<START> ' + x + ' <END>'``, or ``x`` itself if already wrapped.
    """
    start_marker, end_marker = '<START> ', ' <END>'
    if x.startswith(start_marker) and x.endswith(end_marker):
        return x
    return start_marker + x + end_marker

# Add the start/end markers to every headline, then print the resulting Series.
summary = summary.apply(add_tokens)
print("Summary after adding tokens:")
print(summary)

Summary after adding tokens:
0        <START> exbank officials booked for cheating b...
1        <START> supreme court to go paperless in month...
2        <START> at least killed injured in blast in sy...
3        <START> why has reliance been barred from trad...
4        <START> was stopped from entering my own studi...
                               ...                        
55099    <START> sensex loses points to hit week low <END>
55100    <START> china to inject bn into the money mark...
55101    <START> ghulam ali set to make acting debut in...
55102    <START> is acknowledges death of jihadi john r...
55103    <START> cairn to seek mn from india in damages...
Name: Headline, Length: 55104, dtype: object


#### Tokenizing the texts into integer tokens

In [None]:
# The summary tokenizer gets a custom filter string that omits '<' and '>', so
# the <START>/<END> markers keep their angle brackets (Keras' default filter
# list would strip them). The news tokenizer keeps the default filters.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
# Token substituted for words that were not seen while fitting.
oov_token = '<unk>'

news_tokenizer = Tokenizer(oov_token=oov_token)
summary_tokenizer = Tokenizer(filters=filters, oov_token=oov_token)

# Build one vocabulary per side.
news_tokenizer.fit_on_texts(news)
summary_tokenizer.fit_on_texts(summary)

# Convert every text into a list of integer word ids.
inputs = news_tokenizer.texts_to_sequences(news)
targets = summary_tokenizer.texts_to_sequences(summary)

In [None]:
# Sanity check: encode a short sentence with the summary tokenizer.
summary_tokenizer.texts_to_sequences(["This is a test"])

[[153, 20, 9, 62]]

In [None]:
# Sanity check: decode the ids from the previous cell back into text.
summary_tokenizer.sequences_to_texts([[153, 20, 9, 62]])

['this is a test']

In [None]:
# Vocabulary sizes are the number of known words plus one.
# presumably the extra slot covers id 0, which is used for padding — confirm
encoder_vocab_size = len(news_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1

# Display both vocabulary sizes.
encoder_vocab_size, decoder_vocab_size

(81290, 30897)

#### Obtaining insights on lengths for defining maxlen

In [None]:
# Measure lengths in whitespace-separated tokens, not characters. The maxlen
# values chosen from these statistics are applied by pad_sequences to token
# sequences, so len() on the raw strings (a character count) would overstate
# the required length several times over.
news_lengths = news.str.split().str.len()
summary_lengths = summary.str.split().str.len()


In [None]:
# Summary statistics of the article lengths computed above.
news_lengths.describe()

Unnamed: 0,0
count,55104.0
mean,342.000036
std,25.969631
min,206.0
25%,325.0
50%,343.0
75%,361.0
max,408.0


In [None]:
# Summary statistics of the headline lengths computed above.
summary_lengths.describe()

Unnamed: 0,0
count,55104.0
mean,61.665251
std,6.731527
min,15.0
25%,57.0
50%,61.0
75%,67.0
max,79.0


In [None]:
# Fixed sequence lengths used when padding/truncating the tokenized inputs.
# NOTE(review): pad_sequences applies these limits to token sequences — confirm
# the values were chosen from token counts rather than character counts.
encoder_maxlen = 400
decoder_maxlen = 75

#### Padding/Truncating sequences for identical sequence lengths

In [None]:
# Bring every sequence to a fixed length: shorter ones are padded at the end
# and longer ones are cut at the end ('post' for both padding and truncating).
inputs = pad_sequences(inputs, maxlen=encoder_maxlen, padding='post', truncating='post')
targets = pad_sequences(targets, maxlen=decoder_maxlen, padding='post', truncating='post')

### Creating ds pipeline

In [None]:
# Convert the padded arrays to int32 tensors for the tf.data pipeline.
inputs = tf.cast(inputs, dtype=tf.int32)
targets = tf.cast(targets, dtype=tf.int32)

In [None]:
# BUFFER_SIZE is the shuffle buffer size and BATCH_SIZE the number of
# (article, headline) pairs per batch; both are used when `ds` is built.
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [None]:
# Pair each article with its headline, shuffle through a BUFFER_SIZE-element
# buffer, then group into batches of BATCH_SIZE.
# NOTE(review): the buffer (20000) is smaller than the 55104 rows reported
# earlier, so the shuffle is only approximate — confirm this is acceptable.
ds = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
# Report how many batches the dataset yields per epoch and the batch size.
print("Dataset created:")
print("Number of batches in the dataset:", tf.data.experimental.cardinality(ds))
print("Batch size:", BATCH_SIZE)

Dataset created:
Number of batches in the dataset: tf.Tensor(861, shape=(), dtype=int64)
Batch size: 64


### Positional Encoding: injects word-order information, because self-attention (unlike an RNN) has no inherent notion of position

In [None]:
def get_angles(position, i, d_model):
    """Return the sinusoid arguments for the given positions and dimensions.

    Each pair of embedding dimensions (2k, 2k+1) shares one frequency,
    ``1 / 10000 ** (2k / d_model)``; the result is ``position`` scaled by
    that frequency.

    Args:
        position: Position index or array of indices.
        i: Embedding-dimension index or array of indices.
        d_model: Width of the embedding.

    Returns:
        ``position * rate``, broadcast over the shapes of ``position`` and ``i``.
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    return position * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Even embedding dimensions hold the sine and odd dimensions the cosine of
    the angles produced by ``get_angles``.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of the embedding.

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]

    table = get_angles(positions, dims, d_model)
    # Overwrite in place: sine on even columns, cosine on odd columns.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

### Masking

- Padding mask: hides the padded (zero) positions of a sequence so that attention ignores them
- Look-ahead mask: in decoder self-attention, prevents future words from contributing to the prediction of the current word

In [None]:
def create_padding_mask(seq):
    """Mark padded positions (id 0) with 1.0 and every other position with 0.0.

    Two singleton axes are inserted after the first axis so the result can be
    added to attention logits.
    """
    is_pad = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return is_pad[:, tf.newaxis, tf.newaxis, ...]

def create_look_ahead_mask(size):
    """Return a ``(size, size)`` mask with 1.0 strictly above the diagonal.

    Entry (r, c) is 1.0 when c > r, i.e. when position c lies in the future of
    position r, and 0.0 otherwise.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

### Building the Model

#### Scaled Dot Product

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q·kᵀ / sqrt(d_k)) · v.

    Args:
        q, k, v: Query, key and value tensors; the last axis of ``k`` is d_k.
        mask: Optional tensor broadcastable to the logits; positions holding
            1.0 receive a -1e9 offset so softmax gives them ~0 weight.

    Returns:
        ``(values, attention_weights)``.
    """
    depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)

    if mask is not None:
        logits = logits + (mask * -1e9)

    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v), weights

#### Multi-Headed Attention

In [None]:
class MultiHeadAttention(Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features each,
    attended independently, re-joined and passed through a final projection.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Total feature width; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.

        Raises:
            ValueError: If ``d_model`` is not a multiple of ``num_heads``.
        """
        super(MultiHeadAttention, self).__init__()
        # Without this check the integer division below silently truncates and
        # the failure only shows up later as an opaque reshape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads

        self.head_dim = d_model // self.num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)

        self.linear_dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape to (batch, seq, heads, head_dim), then move heads to axis 1."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))

        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        """Run attention and return ``(output, attention_weights)``.

        ``mask`` is forwarded unchanged to ``scaled_dot_product_attention``
        and may be ``None``.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        values, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # Undo split_heads: put heads back next to head_dim, then merge them.
        values = tf.transpose(values, perm=[0, 2, 1, 3])
        concat_values = tf.reshape(values, (batch_size, -1, self.d_model))

        output = self.linear_dense(concat_values)

        return output, attention_weights

### Feed Forward Network

In [None]:
def point_wise_feed_forward_network(d_model, hidden):
    """Return a two-layer MLP: Dense(hidden, relu) followed by Dense(d_model)."""
    expand = tf.keras.layers.Dense(hidden, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([expand, project])

In [None]:
class PointwiseFeedForward(Layer):
    """Feed-forward sub-layer: Dense(hidden, relu) -> Dropout -> Dense(d_model)."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Args:
            d_model: Output feature width.
            hidden: Width of the inner ReLU layer.
            drop_prob: Dropout rate applied between the two dense layers.
        """
        super(PointwiseFeedForward, self).__init__()
        self.linear1 = Dense(hidden, activation='relu')
        self.linear2 = Dense(d_model)
        self.dropout = Dropout(rate=drop_prob)

    def call(self, x, training=None):
        """Apply the feed-forward block.

        ``training`` is passed to the dropout layer explicitly, matching how
        the other layers in this notebook handle it. Leaving it as ``None``
        lets Keras resolve the mode itself, so existing callers that do not
        pass the flag keep working.
        """
        x = self.linear1(x)
        x = self.dropout(x, training=training)
        return self.linear2(x)

#### Fundamental Unit of Transformer encoder

In [None]:
class EncoderLayer(Layer):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalization."""

    def __init__(self, d_model, num_heads, hidden, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.ffn = PointwiseFeedForward(d_model, hidden, drop_prob=rate)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout2 = Dropout(rate)

    def call(self, x, *, training=False, mask=None):
        """Transform ``x``; ``mask`` is handed to the self-attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all x).
        attn_out, _ = self.attention(x, x, x, mask)
        attn_out = self.dropout1(attn_out, training=training)
        attended = self.norm1(x + attn_out)

        # Sub-layer 2: position-wise feed-forward.
        ffn_out = self.ffn(attended)
        ffn_out = self.dropout2(ffn_out, training=training)
        return self.norm2(attended + ffn_out)



#### Fundamental Unit of Transformer decoder

In [None]:
class DecoderLayer(Layer):
    """One decoder block: masked self-attention, encoder-decoder attention and
    feed-forward, each followed by dropout, a residual add and layer norm."""

    def __init__(self, d_model, num_heads, hidden, rate=0.1):
        super(DecoderLayer, self).__init__()
        self.attention1 = MultiHeadAttention(d_model, num_heads)
        self.attention2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = PointwiseFeedForward(d_model, hidden, drop_prob=rate)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.norm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, enc_output, *, training=False, look_ahead_mask=None, padding_mask=None):
        """Return ``(output, self_attention_weights, cross_attention_weights)``."""
        # Step 1: self-attention over the decoder input, using look_ahead_mask.
        self_attn, self_weights = self.attention1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        after_self = self.norm1(self_attn + x)

        # Step 2: attend over the encoder output (keys and values), using padding_mask.
        cross_attn, cross_weights = self.attention2(
            after_self, enc_output, enc_output, padding_mask
        )
        cross_attn = self.dropout2(cross_attn, training=training)
        after_cross = self.norm2(cross_attn + after_self)

        # Step 3: position-wise feed-forward.
        ffn_out = self.dropout3(self.ffn(after_cross), training=training)
        result = self.norm3(ffn_out + after_cross)

        return result, self_weights, cross_weights


#### Encoder consisting of multiple EncoderLayer(s)

In [None]:
class Encoder(Layer):
    """Stack of ``num_layers`` EncoderLayer blocks on top of a scaled token
    embedding with added positional encoding."""

    def __init__(self,  d_model, num_layers, num_heads, hidden,
                 input_vocab_size, max_pos_encoding, rate=0.1):
        """
        Args:
            d_model: Embedding / model width.
            num_layers: Number of EncoderLayer blocks.
            num_heads: Attention heads per block.
            hidden: Inner width of each feed-forward sub-layer.
            input_vocab_size: Size of the input embedding table.
            max_pos_encoding: Number of rows in the positional-encoding table.
            rate: Dropout rate.
        """
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_pos_encoding, d_model)
        self.enc_layers = [
            EncoderLayer(d_model, num_heads, hidden, rate)
            for _ in range(num_layers)
        ]
        # Dropout on the embedding + positional sum, as the Decoder already
        # does; previously `rate` only reached the inner layers.
        self.dropout = Dropout(rate)

    def call(self, x, *, training=False, mask=None):
        """Encode token ids ``x``; ``mask`` is forwarded to every layer."""
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        # Scale embeddings by sqrt(d_model) before adding positional encodings.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training=training, mask=mask)

        return x


#### Decoder consisting of multiple DecoderLayer(s)

In [None]:
class Decoder(Layer):
    """Stack of ``num_layers`` DecoderLayer blocks on top of a scaled token
    embedding with positional encoding and dropout."""

    def __init__(self, d_model, num_layers, num_heads, hidden,
                 target_vocab_size, max_pos_encoding, rate=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_pos_encoding, d_model)
        self.dec_layers = [
            DecoderLayer(d_model, num_heads, hidden, rate)
            for _ in range(num_layers)
        ]
        self.dropout = Dropout(rate)

    def call(self, x, enc_output, *, training=False, look_ahead_mask=None, padding_mask=None):
        """Return ``(decoder_output, attention_weights)``.

        ``attention_weights`` maps ``'decoder_layer{n}_block1'`` and
        ``'decoder_layer{n}_block2'`` (n starting at 1) to the self-attention
        and encoder-decoder attention weights of layer n.
        """
        seq_len = tf.shape(x)[1]
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        hidden_state = self.embedding(x)
        hidden_state *= scale
        hidden_state += self.pos_encoding[:, :seq_len, :]
        hidden_state = self.dropout(hidden_state, training=training)

        attention_weights = {}
        for layer_number, dec_layer in enumerate(self.dec_layers, start=1):
            hidden_state, self_weights, cross_weights = dec_layer(
                hidden_state,
                enc_output,
                training=training,
                look_ahead_mask=look_ahead_mask,
                padding_mask=padding_mask,
            )
            attention_weights[f'decoder_layer{layer_number}_block1'] = self_weights
            attention_weights[f'decoder_layer{layer_number}_block2'] = cross_weights

        return hidden_state, attention_weights


#### Finally, the Transformer

In [None]:
class Transformer(Model):
    """Encoder-decoder Transformer with a final dense projection onto the
    target vocabulary."""

    def __init__(self, d_model, num_layers, num_heads, hidden,
                 input_vocab_size, target_vocab_size,
                 max_pos_input, max_pos_target, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(d_model, num_layers, num_heads, hidden,
                               input_vocab_size, max_pos_input, rate)
        self.decoder = Decoder(d_model, num_layers, num_heads, hidden,
                               target_vocab_size, max_pos_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    # The bare `*` makes every non-tensor argument keyword-only.
    def call(
        self,
        inp,
        tar,
        *,
        training=False,
        enc_padding_mask=None,
        look_ahead_mask=None,
        dec_padding_mask=None
    ):
        """Return ``(logits, attention_weights)`` for source ``inp`` and target ``tar``."""
        # The encoder and decoder are likewise called with keyword arguments.
        memory = self.encoder(inp, training=training, mask=enc_padding_mask)
        decoded, attention_weights = self.decoder(
            tar,
            memory,
            training=training,
            look_ahead_mask=look_ahead_mask,
            padding_mask=dec_padding_mask,
        )
        return self.final_layer(decoded), attention_weights


### Training

In [None]:
# hyper-params
num_layers = 4
d_model = 128
hidden = 512
num_heads = 8
EPOCHS = 40

#### Adam optimizer with custom learning rate scheduling

In [None]:
class CustomSchedule(LearningRateSchedule):
    """Warm-up then inverse-square-root decay learning-rate schedule.

    lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    """

    def __init__(self, d_model, warmup_steps=4000):
        """
        Args:
            d_model: Model width; the rate is scaled by ``d_model ** -0.5``.
            warmup_steps: Number of steps over which the rate rises linearly.
        """
        super().__init__()

        # Keep the plain Python value so get_config() can return it.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "d_model": self._d_model_config,
            "warmup_steps": self.warmup_steps,
        }

#### Defining losses and other metrics

In [None]:
# Adam optimizer driven by the warm-up schedule defined above.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
# Per-token cross-entropy on raw logits. reduction='none' keeps one loss value
# per position so loss_function can mask out padding before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Mean cross-entropy over non-padding target positions.

    Args:
        real: Integer target ids; id 0 marks padding and is excluded.
        pred: Logits passed to ``loss_object`` together with ``real``.

    Returns:
        Scalar: sum of the unmasked per-token losses divided by the number of
        unmasked tokens, or 0 when every target position is padding.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # divide_no_nan returns 0 instead of NaN when the mask is all zeros, so a
    # fully padded batch cannot poison the gradients or the running mean.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


In [None]:
# Running mean of the per-batch loss; the training loop resets it every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

In [None]:
# The positional-encoding tables are indexed by sequence position, so they are
# sized by the maximum sequence lengths. Sizing them by the vocabulary sizes
# built tables with tens of thousands of unused rows.
transformer = Transformer(
    d_model,
    num_layers,
    num_heads,
    hidden,
    encoder_vocab_size,
    decoder_vocab_size,
    max_pos_input=encoder_maxlen,
    max_pos_target=decoder_maxlen,
)

#### Masks

In [None]:
def create_masks(inputs, targets):
    """Build the three masks the Transformer call expects.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)`` where the
        first and last are the padding mask of ``inputs`` and
        ``combined_mask`` is the element-wise maximum of the padding mask of
        ``targets`` and the look-ahead mask.
    """
    # The encoder and the decoder's encoder-decoder attention both mask the
    # padded positions of the source sequence.
    source_padding = create_padding_mask(inputs)

    causal = create_look_ahead_mask(tf.shape(targets)[1])
    target_padding = create_padding_mask(targets)
    combined = tf.maximum(target_padding, causal)

    return source_padding, combined, source_padding

#### Checkpoints

In [None]:
# Directory for checkpoints; the path is relative to the current working directory.
checkpoint_path = "checkpoints2"

# Track both the model and the optimizer state.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# Keep only the five most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


# Restore the newest checkpoint if one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!')


#### Training steps

In [None]:
@tf.function
def train_step(inp, tar):
    """Run one optimization step on a batch and update ``train_loss``.

    The decoder receives the target without its last token and is trained to
    predict the target without its first token (teacher forcing).
    """
    decoder_input, decoder_target = tar[:, :-1], tar[:, 1:]

    enc_mask, causal_mask, cross_mask = create_masks(inp, decoder_input)

    with tf.GradientTape() as tape:
        # All non-tensor arguments are keyword-only on Transformer.call.
        logits, _ = transformer(
            inp,
            decoder_input,
            training=True,
            enc_padding_mask=enc_mask,
            look_ahead_mask=causal_mask,
            dec_padding_mask=cross_mask,
        )
        loss = loss_function(decoder_target, logits)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

In [None]:
# Print the running loss every this many batches.
LOG_EVERY_N_BATCHES = 429
# Save a checkpoint every this many epochs.
CHECKPOINT_EVERY_N_EPOCHS = 5

for epoch in range(EPOCHS):
    start = time.time()

    # Start each epoch with a fresh running mean.
    train_loss.reset_state()

    for (batch, (inp, tar)) in enumerate(ds):
        train_step(inp, tar)

        if batch % LOG_EVERY_N_BATCHES == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % CHECKPOINT_EVERY_N_EPOCHS == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 10.3489
Epoch 1 Batch 429 Loss 9.2819
Epoch 1 Batch 858 Loss 8.3650
Epoch 1 Loss 8.3621
Time taken for 1 epoch: 104.59404969215393 secs

Epoch 2 Batch 0 Loss 7.3125
Epoch 2 Batch 429 Loss 6.9981
Epoch 2 Batch 858 Loss 6.7640
Epoch 2 Loss 6.7628
Time taken for 1 epoch: 72.33458352088928 secs

Epoch 3 Batch 0 Loss 6.5978
Epoch 3 Batch 429 Loss 6.2346
Epoch 3 Batch 858 Loss 6.0809
Epoch 3 Loss 6.0804
Time taken for 1 epoch: 72.33625173568726 secs

Epoch 4 Batch 0 Loss 5.8739
Epoch 4 Batch 429 Loss 5.7267
Epoch 4 Batch 858 Loss 5.6128
Epoch 4 Loss 5.6118
Time taken for 1 epoch: 72.33713245391846 secs

Epoch 5 Batch 0 Loss 5.3210
Epoch 5 Batch 429 Loss 5.2753
Epoch 5 Batch 858 Loss 5.1803
Saving checkpoint for epoch 5 at checkpoints2/ckpt-1
Epoch 5 Loss 5.1797
Time taken for 1 epoch: 73.44601631164551 secs

Epoch 6 Batch 0 Loss 5.1082
Epoch 6 Batch 429 Loss 4.8235
Epoch 6 Batch 858 Loss 4.7262
Epoch 6 Loss 4.7260
Time taken for 1 epoch: 72.33873295783997 secs

Epoch 7 B

### Inference

#### Greedy decoding: predict one word at a time, append it to the decoder input, and repeat with the extended sequence until the `<end>` token appears or `decoder_maxlen` is reached

In [None]:
def evaluate(input_news):
    """Greedily decode a summary for one article.

    Args:
        input_news: Article text; it is tokenized with ``news_tokenizer`` and
            padded/truncated to ``encoder_maxlen``.

    Returns:
        ``(token_ids, attention_weights)``: the generated ids including the
        leading ``<start>`` id (the ``<end>`` id is never appended), and the
        attention weights from the last decoding step.
    """
    token_ids = news_tokenizer.texts_to_sequences([input_news])
    token_ids = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=encoder_maxlen, padding='post', truncating='post')
    encoder_input = tf.expand_dims(token_ids[0], 0)

    start_id = summary_tokenizer.word_index['<start>']
    end_id = summary_tokenizer.word_index['<end>']
    output = tf.expand_dims([start_id], 0)

    for _ in range(decoder_maxlen):
        enc_mask, causal_mask, cross_mask = create_masks(encoder_input, output)

        logits, attention_weights = transformer(
            encoder_input,
            output,
            training=False,
            enc_padding_mask=enc_mask,
            look_ahead_mask=causal_mask,
            dec_padding_mask=cross_mask,
        )

        # Only the logits at the last position predict the next token.
        next_id = tf.cast(tf.argmax(logits[:, -1:, :], axis=-1), tf.int32)

        if next_id == end_id:
            break

        output = tf.concat([output, next_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights


In [None]:
def summarize(input_news):
    """Return the generated summary text for ``input_news``.

    The first generated id (the start marker) is dropped before the ids are
    converted back to words with ``summary_tokenizer``.
    """
    token_ids, _ = evaluate(input_news=input_news)
    generated = token_ids.numpy()[1:]
    return summary_tokenizer.sequences_to_texts([generated])[0]

In [None]:
def checkinbulk(randomnumber):
    """Print the predicted summary, the article and the reference headline.

    Args:
        randomnumber: Index of the example in ``news`` / ``summary``.
    """
    # This line shows the model output, so it is labelled "Predicted"; it was
    # previously labelled "Actual summary", the same as the reference below.
    print('Predicted summary:', summarize(news[randomnumber]))

    print('News: ', news[randomnumber])
    # Strip the full '<START> ' / ' <END>' markers, including their spaces,
    # so the reference no longer prints with a leading blank.
    reference = summary[randomnumber].removeprefix('<START> ').removesuffix(' <END>')
    print('Actual summary: ', reference)

In [None]:
import random
# randrange excludes its upper bound, so the index is always valid.
# random.randint(0, 55104) is inclusive on both ends and could return 55104,
# one past the last row.
random_number = random.randrange(len(news))

checkinbulk(random_number)

Actual summary: hp announces price hike across india
News:  american technology company hp has announced an increase in the list prices of its products by across categories in the indian market hp said the hike is in line with its move to adjust to currency movement and commodity prices globally the company leads the indian pc market with an overall market share of
Actual summary:   hp announces price hike across products in india
