# Text Summarization: Tfidf,TextRank,Sumy & Transformers

# Method 1: TF-IDF

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import textwrap
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read dataset

def load_dataset(dataset_path):
    df = pd.read_csv(dataset_path)
    # Drop id column
    df.drop(columns=['id'],inplace=True)
    # Drop null values (if any)
    df.dropna(inplace=True)
    print("Number of records:",len(df))
    return df

In [5]:
# Load train data
df_train = load_dataset(r'/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
df_train.head()  

Number of records: 287113


Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [13]:
# Define function to wrap text

def wrap(x):
    return textwrap.fill(x,replace_whitespace=False,fix_sentence_endings=True)


print(wrap(df_train['article'][1]))

(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-
Dade Police Department, working in the division that investigates
allegations of wrongdoing by cops.  Outside the office, authorities
allege that the 45-year-old longtime officer worked with a drug
trafficking organization to help plan a murder plot and get guns.  A
criminal complaint unsealed in U.S. District Court in New Jersey
Tuesday accuses Mata, also known as "The Milk Man," of using his role
as a police officer to help the drug trafficking organization in
exchange for money and gifts, including a Rolex watch.  In one
instance, the complaint alleges, Mata arranged to pay two assassins to
kill rival drug dealers.  The killers would pose as cops, pulling over
their targets before shooting them, according to the complaint.
"Ultimately, the (organization) decided not to move forward with the
murder plot, but Mata still received a payment for setting up the
meetings," federal prosecutors said in a statement.  The co

In [23]:
# Function to summarize text using Tfidf

stop_words = stopwords.words("english")
tfidf = TfidfVectorizer(stop_words=stop_words,norm='l1')

def get_sentence_score(tfidf_row): # Function to get score for each sentence
    x = tfidf_row[tfidf_row != 0]
    return x.mean()

def summarize(text):
    # Extract sentences
    sents = sent_tokenize(text)
    # perform tfidf
    X = tfidf.fit_transform(sents)
    # compute scores for each sentence
    scores = np.zeros(len(sents))
    for i in range(len(sents)):
        score = get_sentence_score(X[i,:])
        scores[i] = score
        
    # sort the scores
    sort_idx = np.argsort(-scores)
    
    # print summary
    print("Summary:")
    for i in sort_idx[:5]:
        print(wrap("%2f: %s"%(scores[i],sents[i])))

In [26]:
# Summary for a row in train data

summarize(df_train['article'][1])

Summary:
0.200000: CNN's Suzanne Presto contributed to this report.
0.166667: He is scheduled to appear in federal court in Florida on
Wednesday.
0.166667: If convicted, Mata could face life in prison.
0.142857: Since March 2010, he had been working in the internal
affairs division.
0.142857: Authorities arrested Mata on Tuesday in Miami Gardens,
Florida.


In [6]:
# Load validation data
df_val = load_dataset(r'/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv')
df_val.head()

Number of records: 13368


Unnamed: 0,article,highlights
0,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


In [30]:
# Summary for 1st row

summarize(df_val['article'][1])

Summary:
0.250000: 'She has so much talent she is wasted in teaching.'
0.200000: She takes them home where she then presses them between the
pages of books.
0.142857: But locals who have had the luck of being able to see Wang's
art have been gobsmacked.
0.142857: She said: 'Some people are into capturing beauty through
photography, but for me, a digitalised image just isn't the same.
0.142857: 'I need to collect around 2000 leaves because this will
include losses'.


In [7]:
# Load test data
df_test = load_dataset(r'/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv')
df_test.head()

Number of records: 11490


Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [33]:
summarize(df_test['article'][0])

Summary:
0.166667: While most airlines stick to a pitch of 31 inches or above,
some fall below this.
0.142857: Ever noticed how plane seats appear to be getting smaller
and smaller?
0.125000: 'It is time that the DOT and FAA take a stand for humane
treatment of passengers.'
0.111111: They say that the shrinking space on aeroplanes is not only
uncomfortable - it's putting our health and safety in danger.
0.111111: The distance between two seats from one point on a seat to
the same point on the seat behind it is known as the pitch.


# Method 2: TextRank

In [39]:
# Create function to summarize

def summarize_textrank(text,factor=0.15):
    # extract sentences
    sents = sent_tokenize(text)
    # perform tfidf
    tfidf = TfidfVectorizer(stop_words=stop_words,norm='l1')
    X = tfidf.fit_transform(sents)
    
    # compute similarity matrix
    S = cosine_similarity(X)
    # Normalize
    S  = S/S.sum(axis=1,keepdims=True)
    
    # Uniform transition matrix
    U = np.ones_like(S)/len(S)
    
    # smmothed similarity matrix
    S = (1-factor) * S + factor * U
    
    # find the limiting/stationary distribution
    eigenvals,eigenvecs = np.linalg.eig(S.T)
    
    # compute scores
    scores = eigenvecs[:,0]/eigenvecs[:,0].sum()
    
    # sort the scores
    sort_idx = np.argsort(-scores)
    
    # print summary
    print("Summary:\n")
    for i in sort_idx[:5]:
        print(wrap("%.2f: %s"%(scores[i],sents[i])))

In [40]:
# Summary for row in train data

summarize_textrank(df_train['article'][1])

Summary:

0.07: Mata has worked for the Miami-Dade Police Department since 1992,
including directing investigations in Miami Gardens and working as a
lieutenant in the K-9 unit at Miami International Airport, according
to the complaint.
0.07: (CNN) -- Ralph Mata was an internal affairs lieutenant for the
Miami-Dade Police Department, working in the division that
investigates allegations of wrongdoing by cops.
0.07: A criminal complaint unsealed in U.S. District Court in New
Jersey Tuesday accuses Mata, also known as "The Milk Man," of using
his role as a police officer to help the drug trafficking organization
in exchange for money and gifts, including a Rolex watch.
0.06: The complaint also alleges that Mata used his police badge to
purchase weapons for drug traffickers.
0.06: Mata, according to the complaint, then used contacts at the
airport to transport the weapons in his carry-on luggage on trips from
Miami to the Dominican Republic.


# Method 3: Sumy

In [41]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting chardet (from breadability>=0.1.20->sumy)
  Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: br

In [42]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [45]:
# Create function for summary using TextRankSummarizer

def summarize_sumy_txtrnk(text):
    summarizer = TextRankSummarizer()
    parser = PlaintextParser.from_string(text,Tokenizer("english"))
    summary = summarizer(parser.document,sentences_count=5)
    for s in summary:
        print(wrap(str(s)))

In [46]:
# Summary for train dataset row 3

summarize_sumy_txtrnk(df_train['article'][2])

‘Mr Eccleston-Todd took the decision to pick up his mobile phone
whilst driving and, either reading or replying to this text message,
was so distracted that he failed to negotiate a left-hand bend,
crossing the central white line into the path of Miss Titley’s
oncoming car.
‘Miss Titley’s death in these circumstances reiterates the danger of
using a hand-held mobile phone whilst driving.’ Police were unable to
take breath or blood tests from Eccleston-Todd immediately, but in
tests several hours after the accident he was only marginally under
the drink-drive limit.
The judge agreed with police that he would have been over the limit at
the time his red Citroen hit Miss Titley’s blue Daihatsu Cuore on a
road near Yarmouth, Isle of Wight, on October 11, 2013.
'We weren't able to take breath or blood tests from him immediately
and although blood taken several hours after the collision showed he
was marginally under the limit, we maintain he would have been over
the limit at the time of the

In [47]:
# Create function for summary using LsaSummarizer

def summarize_sumy_lsa(text):
    summarizer = LsaSummarizer()
    parser = PlaintextParser.from_string(text,Tokenizer("english"))
    summary = summarizer(parser.document,sentences_count=5)
    for s in summary:
        print(wrap(str(s)))

In [48]:
# Summary for train dataset row 3

summarize_sumy_lsa(df_train['article'][2])

Craig Eccleston-Todd, 27 (left) was using his mobile phone when he
crashed head-on into the car being driven by Rachel Titley, 28
(right).
He was found guilty of causing death by dangerous driving at
Portsmouth Crown Court yesterday.
PC Mark Furse, from Hampshire constabulary’s serious collision
investigation unit, said: 'Our thoughts are with Rachel's family at
this time.
This case highlights just how tragic the consequences of committing
these offences can be.'
‘Mr Eccleston-Todd will now spend six years behind bars, but Rachel’s
family have lost her for ever.


# Method 4: Transformers (pipeline)

In [9]:
from transformers import pipeline

In [10]:
# Define pipeline

summarizer = pipeline(model='facebook/bart-large-cnn')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
# Summary on train data

def summary_trf(num):
    print('Original Article\n')
    print(wrap(df_train['article'][num]))
    print('\nSummary')
    result = summarizer(df_train['article'][num])
    return result[0]['summary_text']

In [15]:
summary_trf(1)

Original Article

(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-
Dade Police Department, working in the division that investigates
allegations of wrongdoing by cops.  Outside the office, authorities
allege that the 45-year-old longtime officer worked with a drug
trafficking organization to help plan a murder plot and get guns.  A
criminal complaint unsealed in U.S. District Court in New Jersey
Tuesday accuses Mata, also known as "The Milk Man," of using his role
as a police officer to help the drug trafficking organization in
exchange for money and gifts, including a Rolex watch.  In one
instance, the complaint alleges, Mata arranged to pay two assassins to
kill rival drug dealers.  The killers would pose as cops, pulling over
their targets before shooting them, according to the complaint.
"Ultimately, the (organization) decided not to move forward with the
murder plot, but Mata still received a payment for setting up the
meetings," federal prosecutors said in a 

'Ralph Mata, 45, was an internal affairs lieutenant for the Miami-Dade Police Department. Authorities allege he worked with a drug trafficking organization to help plan a murder plot. The complaint also alleges that Mata used his police badge to purchase weapons for drug traffickers. Mata faces charges of aiding and abetting a conspiracy to distribute cocaine.'

# Method 5: Fine Tuning Transformers

In [8]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=44477e1f8f531c9ff6109aba4e704871f1a6c719d2381ad9e5e60f8fb5ed610e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.0 rouge_score-0.1.2


In [10]:
from transformers import pipeline
from transformers import AutoTokenizer
from datasets import load_dataset
import torch
import evaluate

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

**Load Dataset**

In [12]:
# Selecting a sample of datasets and converting it to transformers format

def create_dataset(dataframe,n,name):
    sample = dataframe.sample(n,ignore_index=True)
    # save file
    sample.to_csv(str(name)+'.csv',index=None)
    # Convert dataset format
    raw_data = load_dataset("csv",data_files=str(name)+'.csv')
    return raw_data

In [15]:
# train
raw_train = create_dataset(df_train,5000,'train')
raw_train

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-84d2ad24856fe97a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-84d2ad24856fe97a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 5000
    })
})

In [16]:
# validation
raw_val = create_dataset(df_val,3000,'val')
raw_val

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-017459f79cb2988c/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-017459f79cb2988c/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 3000
    })
})

**Pre-process**

In [17]:
# Define Tokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [18]:
# Function for tokenizing

prefix = "summarize: "

def preprocess_function(batch):
    inputs = [prefix + doc for doc in batch["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=batch["highlights"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
# Apply tokenizing function on train and validation datasets

tokenized_train = raw_train.map(preprocess_function,batched=True)
tokenized_val = raw_val.map(preprocess_function,batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [20]:
tokenized_train

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [21]:
tokenized_val

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [22]:
# Dynamic Padding

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

**Evaluate**

In [23]:
# Metric

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [24]:
# Function to compute metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

**Training**

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
# Training Arguments

training_args = Seq2SeqTrainingArguments(
output_dir='cnn_summarization',
evaluation_strategy='epoch',
learning_rate=2e-5,
per_device_train_batch_size=10,
per_device_eval_batch_size=10,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=4,
predict_with_generate=True,
fp16=True,
push_to_hub=True
)

In [28]:
# Trainer

trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=tokenized_train['train'],
eval_dataset=tokenized_val['train'],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics
)

/kaggle/working/cnn_summarization is already a clone of https://huggingface.co/deepaktripathy1/cnn_summarization. Make sure you pull the latest changes with `repo.git_pull()`.


In [29]:
# Train

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.760123,0.2428,0.1161,0.2005,0.2004,19.0
2,1.997900,1.73779,0.2428,0.1169,0.2013,0.2012,19.0
3,1.997900,1.732603,0.2431,0.1181,0.2018,0.2019,19.0
4,1.909900,1.731246,0.2432,0.118,0.202,0.202,19.0




TrainOutput(global_step=1000, training_loss=1.9538932495117187, metrics={'train_runtime': 1859.7284, 'train_samples_per_second': 10.754, 'train_steps_per_second': 0.538, 'total_flos': 5413672058880000.0, 'train_loss': 1.9538932495117187, 'epoch': 4.0})

**Understanding Rouge Metrics**

ROUGE1: Shared words
Number of words that appear in both model output, and expected output
Example: 0.5 means half of the words appear in both model output and expected output

ROUGE-2: Shared word-pairs
Number of word-pairs that appear in both model output, and expected output (as pairs)
Example: 0.5 means half of the adjacent word pairs appear in both model output and expected output.
This is a stricter metric than Rouge1, which is slightly more sensitive to the order

ROUGE-L: Longest shared word-sequence
The number of words appear in the exact same order. in both model output, and **expected
Example: 0.5 means half of the entire output of expected output is the same half of the entire expected text.
This metric is very sensitive to the generated order of words.

Since I deliberately kept the sample size to be extremely small for memory issues, the rouge score is just around 0.24 for Rouge1 and is also low for other rouge metrics. The score will also depend on the human written summary or the "highlights" in our case.

In [30]:
# Save model
import joblib
joblib.dump(model,"model.pkl")

['model.pkl']

**Predictions on sample from Test set**

In [59]:
# Preparing dataset of a sample of 50 records from test set

raw_test = create_dataset(df_test,50,'test')
raw_test

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e2b80bbd9a0c92e5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e2b80bbd9a0c92e5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 50
    })
})

In [60]:
# Preprocess

# Apply tokenizing function

tokenized_test = raw_test.map(preprocess_function,batched=True)
tokenized_test

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})

In [64]:
# Create pipeline object

savedmodel = pipeline('summarization',
                      model='/kaggle/working/cnn_summarization/checkpoint-1000',
                      device=0)

In [66]:
# Predictions

outputs = savedmodel(tokenized_test['train']['article'],max_length=100)

In [69]:
print(raw_test['train']['article'][0])
outputs[0]

If you've always dreamed of seeing penguins in Antarctica or tracking the Siberian tiger in Russia - but gorilla trekking in Rwanda and spotting grizzlies in Alaska also feature on your to do list, this could be the safari for you. But before you don your khakis, you'll need a whopping £176,000 going spare to take part. The grand adventure promises to deliver up and close and personal moments with some of the world's most elusive and enchanting wildlife. Scroll down for video . The adventure promises to deliver up and close and personal moments with some of the world's most elusive and enchanting wildlife . To mark its 10th anniversary, Natural World Safaris has launched a new series of ‘Ultimate Safari’ adventures. Divided into four areas: Primates, Bears, Big Cats and Marine Life, the safari can be taken separately, but also as an epic world tour over 203 days, taking in 21 destinations. On each ‘Ultimate Safari’, guests will be guided across the globe on a carefully planned itinerar

{'summary_text': 'The Ultimate Bear Safari tracks polar bears, grizzly bears and spirit bears in Arctic Canada, the Arctic Circle, Alaska and British Columbia . Guests will be guided across the globe on a carefully planned itinerary, taking them through Africa, Asia and the Indian Subcontinent to the Polar Regions and Latin America for rare wildlife encounters . Each safari can be split into separate legs, or completed in one journey .'}

In [70]:
print(raw_test['train']['article'][1])
outputs[1]

(CNN)Sometimes the best ideas come from the bathroom. But Gaioz Nigalidze's ideas from the loo were a little too good. The Georgian chess grandmaster has been banned from the Dubai Open Chess Tournament after officials discovered he was darting to the toilet to consult his smartphone, which was logged onto a chess analysis app, the Dubai Chess and Culture Club said. Nigalidze's opponent, Tigran Petrosian of Armenia, grew suspicious when Nigalidze kept bolting to the restroom. "The Armenian noticed the Georgian was oddly frequenting the toilet after each move during a crucial part of the game," the Dubai Chess and Culture Club said. When officials first checked Nigalidze, they didn't find any device on him, the club said. But after looking into the bathroom stall he visited, they found the smartphone hidden in toilet paper. At first, Nigalidze claimed the smartphone wasn't his, the Dubai chess organization said. But the phone was logged on to a social media network under his account. "T

{'summary_text': "Gaioz Nigalidze has been banned from the Dubai Open Chess Tournament . The Georgian chess grandmaster's smartphone was logged on to an app . It's not clear how many times he went to the toilet during those matches ."}