In [None]:
!pip install transformers
import tensorflow as tf
import numpy as np
import re
import string
import pandas as pd
import tensorflow_datasets as tfds
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 10.6MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 18.2MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

## T5 Tokenizer and Config

In [None]:
# T5 tokenizer (leverages SentencePiece and Unicode normalization),
# loaded from the pretrained 't5-small' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# End-of-sequence token
end_token = tokenizer.eos_token
# Start-of-sequence token: the pad token is reused as the decoder start
# token (it is prepended to every target summary further down).
start_token = tokenizer.pad_token

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




## Load Data

In [None]:
# Read the news spreadsheet from the mounted Google Drive and discard the
# metadata columns, keeping only the text columns used for training.
news = pd.read_excel("/content/drive/My Drive/news.xlsx")
news = news.drop(columns=['Source ', 'Time ', 'Publish Date'])
print(news.head(100))

                                             Headline                                              Short
0   4 ex-bank officials booked for cheating bank o...  The CBI on Saturday booked four former officia...
1      Supreme Court to go paperless in 6 months: CJI  Chief Justice JS Khehar has said the Supreme C...
2   At least 3 killed, 30 injured in blast in Sylh...  At least three people were killed, including a...
3   Why has Reliance been barred from trading in f...  Mukesh Ambani-led Reliance Industries (RIL) wa...
4   Was stopped from entering my own studio at Tim...  TV news anchor Arnab Goswami has said he was t...
..                                                ...                                                ...
95  Houseboat owners should not pollute Dal lake: ...  Terming the Dal Lake as a &#34;treasure&#34; f...
96  Will lions in UP zoos have to live on palak pa...  Speaking about the issue of meat shortage in U...
97  Delhi MCD elections a chance to uproot AAP gov...  

## Creating Training Examples

In [None]:
# Clean Text
def preprocessText(text):
    """Normalize a raw string before it is tokenized.

    Three clean-up steps are applied in order:
      1. every parenthesized span ``( ... )`` is removed,
      2. every double-quote character is removed,
      3. each run of whitespace is collapsed to a single space and the
         result is stripped at both ends.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    without_parens = re.sub(r'\([^)]*\)', '', text)
    without_quotes = without_parens.replace('"', '')
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so re-joining normalizes spacing.
    return " ".join(without_quotes.split())

In [None]:
# Longest tokenized article / headline seen so far; after the loops below
# these are the lengths every sequence is padded to.
max_len_doc = -1
max_len_sum = -1

# Keep all training inputs in a dictionary of per-example lists.
train_data = {
    'input_ids': [],
    'attention_mask':[],
    'decoder_inputs_ids':[],
    "decoder_attention_mask":[]
}

# Encoder side: clean and tokenize every article body.
for i in news['Short']:
    trainText = preprocessText(i)
    ids = tokenizer.encode_plus(trainText)
    max_len_doc = max(max_len_doc, len(ids['input_ids']))
    train_data['input_ids'].append(ids['input_ids'])
    train_data['attention_mask'].append(ids['attention_mask'])

# Decoder side: clean and tokenize every headline, with the start token
# (<pad>) placed in front of the summary.
for i in news['Headline']:
    labelsText = start_token + " " + preprocessText(i)
    decoder_ids = tokenizer.encode_plus(labelsText)
    max_len_sum = max(max_len_sum, len(decoder_ids['input_ids']))
    train_data['decoder_inputs_ids'].append(decoder_ids['input_ids'])
    train_data['decoder_attention_mask'].append(decoder_ids['attention_mask'])

# Pad (or truncate) every sequence at the end to the longest length seen.
# The variable-length lists are handed to pad_sequences directly: wrapping
# ragged lists in np.array() first would build an object array, which newer
# NumPy versions reject, and pad_sequences already returns a 2-D array.
for key, max_len in (('input_ids', max_len_doc),
                     ('attention_mask', max_len_doc),
                     ('decoder_inputs_ids', max_len_sum),
                     ('decoder_attention_mask', max_len_sum)):
    train_data[key] = tf.keras.preprocessing.sequence.pad_sequences(
        train_data[key], maxlen=max_len, padding='post', truncating='post')

In [None]:
# Report the dataset size and the sequence lengths used for padding.
print("Number of training examples: ", len(train_data['input_ids']))
print("Max length of tokens of main text:", max_len_doc)
print("Max length of token of sumamry: ", max_len_sum)


## Create Keras Model

In [None]:
def createModel():
    """Wrap a pretrained 't5-small' model in a Keras functional model.

    The encoder inputs have length ``max_len_doc``; the decoder inputs have
    length ``max_len_sum - 1``. Both values are read from module scope.

    Returns:
        A ``tf.keras.Model`` taking
        ``[input_ids, attention_mask, decoder_inputs_ids, decoder_attention_mask]``
        and returning the first element of the T5 output (the logits).
    """
    # T5 model - Hugging Face
    t5_model = TFT5ForConditionalGeneration.from_pretrained('t5-small')

    # Apply the checkpoint's summarization-specific settings, if it has any.
    extra_params = t5_model.config.task_specific_params
    if extra_params is not None:
        t5_model.config.update(extra_params.get("summarization", {}))

    encoder_shape = (max_len_doc,)
    decoder_shape = (max_len_sum-1,)

    # Symbolic inputs (created in the same order as before so that the
    # auto-generated layer names stay the same).
    input_ids = tf.keras.Input(shape=encoder_shape, dtype=tf.int32)
    decoder_inputs_ids = tf.keras.Input(shape=decoder_shape, dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=encoder_shape, dtype=tf.int32)
    decoder_attention_mask = tf.keras.Input(shape=decoder_shape, dtype=tf.int32)

    # Run T5 and keep only the first output.
    t5_outputs = t5_model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_inputs_ids,
        decoder_attention_mask=decoder_attention_mask,
    )
    logits = t5_outputs[0]

    model_inputs = [input_ids, attention_mask,
                    decoder_inputs_ids, decoder_attention_mask]
    return tf.keras.Model(inputs=model_inputs, outputs=logits)


## HyperParameters

In [None]:
# Number of examples per training batch (passed to model.fit).
BATCH_SIZE = 64
# Number of passes over the training data (passed to model.fit).
EPOCHS  = 4
# Step size for the Adam optimizer.
LEARNING_RATE = 3e-5

## Use TPU

In [None]:
# Toggle: build the model under a TPU distribution strategy (True) or
# build it directly on the default device (False).
use_tpu = True
if use_tpu:
    # Create distribution strategy
    # No address is passed, so the resolver falls back to its defaults
    # (presumably the Colab-provided TPU — confirm for other environments).
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # Connect to the cluster and initialize the TPU system before the
    # strategy is created; the order of these calls matters.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    # The model is built inside the strategy scope so that its variables
    # are created under the TPU strategy.
    with strategy.scope():
        model = createModel()
else:
    model = createModel()

# Print the layer/parameter overview of the resulting Keras model.
model.summary()





INFO:tensorflow:Initializing the TPU system: grpc://10.25.112.194:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.25.112.194:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
All model checkpoint weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model: "tf_t5for_conditional_generation_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  16449536  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  18881280  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  25176064  
Total params: 60,506,880
Trainable params: 60,506,880
Non-trainable params: 0
_________________________________________________________________
None
Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 218)]        0                                            
___________________________

## Train Model

In [None]:
# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
adam_opt = tf.optimizers.Adam (learning_rate= LEARNING_RATE)
# The same loss object is also passed as a metric, so the training log
# reports it twice (as `loss` and as `sparse_categorical_crossentropy`).
model.compile(adam_opt, loss= loss, metrics=[loss])
# Teacher forcing: the decoder is fed each target sequence without its last
# position ([:, :-1]) and is trained to predict the same sequence shifted
# left by one ([:, 1:]).
# NOTE(review): the targets are zero-padded by pad_sequences and no mask or
# sample weights are passed to fit, so padded positions appear to contribute
# to the loss — confirm whether that is intended.
model.fit(x=[train_data['input_ids'],
             train_data['attention_mask'],
             train_data['decoder_inputs_ids'][:,:-1],
             train_data['decoder_attention_mask'][:,:-1]],
          y= train_data['decoder_inputs_ids'][:,1:],
          batch_size= BATCH_SIZE, 
          epochs=EPOCHS,
          verbose=2)
          

Epoch 1/4




861/861 - 82s - loss: 1.0779 - sparse_categorical_crossentropy: 1.0779
Epoch 2/4
861/861 - 67s - loss: 0.6167 - sparse_categorical_crossentropy: 0.6167
Epoch 3/4
861/861 - 67s - loss: 0.5675 - sparse_categorical_crossentropy: 0.5675
Epoch 4/4
861/861 - 67s - loss: 0.5373 - sparse_categorical_crossentropy: 0.5373


<tensorflow.python.keras.callbacks.History at 0x7f8ddb3e2e80>


## Test Model using Greedy Algorithm

In [None]:
def getSummary(text):
    """Summarize `text` with the trained model using greedy decoding.

    The summary starts as the start token (<pad>) and is extended in a loop.
    On each iteration the current summary is re-tokenized, fed to the decoder,
    and the arg-max tokens of the first `counter` positions are decoded into
    the new summary.

    The loop stops when `counter` reaches `max_len_sum`, or when the summary
    string has the same length as on the previous iteration (i.e. decoding
    stopped producing additional text).

    Reads `tokenizer`, `model`, `start_token`, `max_len_doc` and
    `max_len_sum` from module scope.

    Args:
        text: the document to summarize.

    Returns:
        The generated summary string with every '<pad>' marker removed.
    """
    # The model was built and trained with decoder inputs of length
    # max_len_sum - 1, so inference must pad to that same length. Padding to
    # max_len_sum feeds the model a shape it was not constructed for.
    decoder_len = max_len_sum - 1

    # Start summary with <pad> token
    summary = start_token
    # Preprocess text
    text = preprocessText(text)
    # Convert text to token ids and pad them to the encoder length
    ids = tokenizer.encode_plus(text)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(
        [ids['input_ids']], maxlen=max_len_doc, padding='post', truncating='post')
    attention_mask = tf.keras.preprocessing.sequence.pad_sequences(
        [ids['attention_mask']], maxlen=max_len_doc, padding='post', truncating='post')

    counter = 1
    prev_summary = ""
    while (counter < max_len_sum and len(prev_summary) != len(summary)):
        # Re-tokenize the summary generated so far
        decoder_ids = tokenizer.encode_plus(summary)
        # Drop the final token of the encoding (presumably the EOS the
        # tokenizer appends — confirm for the installed transformers
        # version), mirroring the [:, :-1] slicing used during training,
        # then pad to the decoder length.
        decoder_inputs_ids = tf.keras.preprocessing.sequence.pad_sequences(
            [decoder_ids['input_ids'][:-1]], maxlen=decoder_len,
            padding='post', truncating='post')
        decoder_attention_mask = tf.keras.preprocessing.sequence.pad_sequences(
            [decoder_ids['attention_mask'][:-1]], maxlen=decoder_len,
            padding='post', truncating='post')
        # Decoder of T5 predicts the next token at every position
        pred = model.predict([input_ids, attention_mask,
                              decoder_inputs_ids, decoder_attention_mask])
        # Greedy choice: keep the arg-max token for the first `counter` positions
        new_summary = tokenizer.decode(np.argmax(pred, axis=-1)[0, :counter])
        prev_summary = summary
        # Get new summary and prepare it for the next prediction
        summary = start_token + " " + new_summary
        counter += 1
    # Remove <pad> token
    return re.sub(r'<pad>', "", summary)



In [None]:
# Try the trained model on a sample paragraph; the cell displays the
# generated summary.
getSummary("With your permission we and our partners may use precise geolocation\
  data and identification through device scanning. You may click to consent to our\
  and our partners’ processing as described above. Alternatively you may access more\
   detailed information and change your preferences before consenting or to refuse consenting.")

' We may use precise geolocation data through device scanning'