# **Necessary Imports**

In [None]:
!pip install simpletransformers



In [None]:
from google.colab import drive

# Mount Google Drive so the dataset CSVs under MyDrive can be read below.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# **Import dataset**

In [None]:
# Read both news-summary CSVs (latin-1 encoded) and stack them row-wise into
# one frame with a fresh 0..n-1 index.
data_primary = pd.read_csv('/content/drive/MyDrive/news_summary/news_summary.csv', encoding='latin-1')
data_more = pd.read_csv('/content/drive/MyDrive/news_summary/news_summary_more.csv', encoding='latin-1')
data = pd.concat([data_primary, data_more], axis=0, ignore_index=True)
data.head(5)


Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
# Keep only the headline/article pair, rename the columns to `target_text`
# (headline) and `input_text` (article body), and drop rows missing either.
data = (
    data.loc[:, ['headlines', 'text']]
    .rename(columns={'headlines': 'target_text', 'text': 'input_text'})
    .dropna()
)

In [None]:
# Hold out a reproducible 20% random sample for evaluation; every row whose
# index label was not sampled stays in the training set.
eval_df = data.sample(frac=0.2, random_state=101)
in_eval = data.index.isin(eval_df.index)
train_df = data.loc[~in_eval]

In [None]:
# Display the (rows, columns) shape of the train and eval splits.
tuple(split.shape for split in (train_df, eval_df))

((82332, 2), (20583, 2))

In [None]:
# Work on a small subset (first 2000 training rows, first 200 eval rows) to
# keep fine-tuning time manageable.
# `.iloc` makes the positional slicing explicit, and `.copy()` yields
# independent frames so that adding columns to them later does not trigger
# pandas' SettingWithCopyWarning.
train_df = train_df.iloc[:2000].copy()
eval_df = eval_df.iloc[:200].copy()

# **Fine-tuning and Evaluating the T5 Model (Hugging Face, via Simple Transformers)**

In [None]:
import logging

import pandas as pd
import torch
from simpletransformers.t5 import T5Model

# Show Simple Transformers progress at INFO level while keeping the
# `transformers` logger down to warnings only.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Simple Transformers' T5 data format needs a `prefix` column naming the task
# alongside `input_text` / `target_text`. `.assign` returns new frames, so
# this works (and can be re-run) without writing into a slice of another frame.
train_df = train_df.assign(prefix="summarize")
eval_df = eval_df.assign(prefix="summarize")

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 128,
    "train_batch_size": 8,
    "num_train_epochs": 1,
}

# Create the T5 model: model type "t5", pretrained checkpoint "t5-base".
# Use the GPU when the runtime has one and fall back to CPU otherwise,
# instead of always forcing CPU.
model = T5Model(
    "t5",
    "t5-base",
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

# Fine-tune on the headline-generation ("summarize") task.
model.train_model(train_df)

# Evaluate on the held-out rows and display the returned metrics.
results = model.eval_model(eval_df)
results


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/2000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

INFO:simpletransformers.t5.t5_utils: Saving features into cached file cache_dir/t5-base_cached_1282000
INFO:simpletransformers.t5.t5_model: Training started


Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.t5.t5_model: Training of t5-base model complete. Saved to outputs/.
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/200 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

INFO:simpletransformers.t5.t5_utils: Saving features into cached file cache_dir/t5-base_cached_128200


Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

INFO:simpletransformers.t5.t5_model:{'eval_loss': 1.6327518510818482}


# **Predictions**

In [None]:
# Pick one held-out row by position and compare its real headline with the
# headline the fine-tuned model generates for the same article.
random_num = 25
sample_row = eval_df.iloc[random_num]
actual_title = sample_row['target_text']
# The task prefix is prepended by hand because predict() takes raw strings.
actual_abstract = ["summarize: "+sample_row['input_text']]
predicted_title = model.predict(actual_abstract)

print(
    f'Actual Title: {actual_title}',
    f'Predicted Title: {predicted_title}',
    f'Actual Abstract: {actual_abstract}',
    sep='\n',
)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Actual Title: 3 detained for beating cab driver to death in Mumbai
Predicted Title: ['Mumbai cops beat up taxi driver to death after taxi accident']
Actual Abstract: ["summarize: The Mumbai Police has detained three people for allegedly beating up a taxi driver to death on Monday after his taxi accidentally brushed past the accused's scooter and the driver almost fell off. The accused abused the victim, grabbed him by his collar and attacked him. The driver was declared brought dead after he was rushed to the hospital."]


In [None]:
# The model was fine-tuned with the "summarize" task prefix, and predict()
# takes raw strings, so the same "summarize: " prefix must be prepended here.
# Without it the input does not match the format seen during training.
article_text = "M1 Macbooks aren’t that new anymore. Somehow, installing Python’s deep learning libraries still isn’t a straightforward process. At least with TensorFlow. PyTorch is different. Today you’ll learn how to install and run PyTorch natively on your M1 machine. It doesn’t make a difference which M1 machine you have (Air, Pro, Mini, or iMac). Let’s get started."
test_sentence = ["summarize: " + article_text]
model.predict(test_sentence)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['No matter what M1 machine you have: TensorFlow']

In [None]:
# The model was fine-tuned with the "summarize" task prefix, and predict()
# takes raw strings, so the same "summarize: " prefix must be prepended here.
# Without it the input does not match the format seen during training.
article_text = "more rains are expected in Chennai even as the city is reeling under the inundation and flooding caused by incessant rains on November 6. According to the India Meteorological Department, a low pressure area has formed over the southeast Bay of Bengal and the neighbourhood under the influence of cyclonic circulation. The IMD on Tuesday, November 9, gave a red warning for Tiruvannamalai, Villupuram, Chennai, Chengalpattu, Kanchipuram and Tiruvalur. An orange alert was given for Nilgiris, Coimbatore, Tiruppur, Dindigul, Salem, Kallakuruchi, Tirupattur, Vellore and Ranipet. Associated cyclonic circulation extends up to 4.5 km above mean sea level. It is very likely to concentrate into a Depression over the Southwest & adjoining Southeast Bay of Bengal during next 36 hours,” the IMD said. This, it added, was likely to reach the coast of north Tamil Nadu by the early hours of November 11. "
test_sentence = ["summarize: " + article_text]
model.predict(test_sentence)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['No rains expected in Chennai in November due to incessant rains']

In [None]:
# The model was fine-tuned with the "summarize" task prefix, and predict()
# takes raw strings, so the same "summarize: " prefix must be prepended here.
# Without it the input does not match the format seen during training.
article_text = "Elon Musk has of course been getting richer by the hour. His net worth was quoted this past week at a figure north of $300 billion. Not bad for someone who had to eke out an existence less than two years ago on a net worth estimated at a mere $20 billion."
test_sentence = ["summarize: " + article_text]
model.predict(test_sentence)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['Elon Musk getting richer by the hour: Report']