In [1]:
# !pip install transformers
# !pip install rouge
# !pip install sentencepiece
# !pip install nomkl
# !pip install datasets
# !pip install pytorch

In [2]:
import os
import time
import numpy as np
import pandas as pd
import collections

import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, TFT5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset
from rouge import Rouge
import nltk.data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/futureperfect6/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Some functions for convenience later

In [3]:
def format_story(filename):
    """Given the CNN data file, reformats to separate the story from the highlights.
    Highlights are returned as a single string"""
    
    file = open(filename,'r')
    text = file.read()
    
    # split the story and highlights
    split_text = text.split('\n\n@highlight\n\n')
    story = split_text[0]
    highlights = split_text[1:]
    
    # return both
    return story, highlights#'. '.join(highlights)+'.'

def cos_sims(out_sent, ref_sents):
    "gets cosine similarities for an output sentence with respect to the highlight sentences. Returns the sum of values."
    
    vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
    
    # get sentence level vectors with tf-idf
    tfidf = vect.fit_transform([out_sent] + ref_sents)
    
    # get similarity matrix
    similarity_mat = tfidf * tfidf.T
    
    # only values comparing "out_sent" with each sent in "ref_sents"
    return similarity_mat.toarray()[:1,1:][0]

## Load a T5 Tokenizer and Model

In [4]:
# t5 model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# t5 tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-base')

def tokenize(batch):
    """Applies tokenizer to a whole dataset at once. Input is a dataset with raw text data, 
    and output is a dataset with tokenized data"""
    
    tokenized_input = tokenizer(batch['source'], padding='max_length', truncation=True)
    tokenized_label = tokenizer(batch['target'], padding='max_length', truncation=True)
    tokenized_input['labels'] = tokenized_label['input_ids']
    return tokenized_input

In [5]:
# load metadata
cnn_meta = pd.read_csv('cnn_meta.csv')
print(len(cnn_meta.index),'rows')
cnn_meta.head()

92579 rows


Unnamed: 0.1,Unnamed: 0,story,train,validation,test,duplicate,source,highlights,broken
0,0,0001d1afc246a7964130f43ae940af6bc6c57f01.story,1,0,0,0,0,4,0
1,1,0002095e55fcbd3a2f366d9bf92a95433dc305ef.story,1,0,0,0,0,4,0
2,2,00027e965c8264c35cc1bc55556db388da82b07f.story,1,0,0,0,0,3,0
3,3,0002c17436637c4fe1837c935c04de47adb18e9a.story,1,0,0,0,0,4,0
4,4,0003ad6ef0c37534f80b55b4235108024b407f0b.story,1,0,0,0,0,3,0


## Data Formatting

Right now, we have a text file for each story. T5 requires a single matrix (a dataset object is perfect) with source/target columns.

In [6]:
source_text_train = []
target_text_train = []

# get list of training files
train_files = cnn_meta[cnn_meta['train']==1].reset_index()['story']

start = time.time()

# for i in range(len(train_files)):
for i in range(100):
    
    # get formatted input and target
    story, highlights = format_story('ernesto/cnn_stories_tokenized/'+ train_files[i+10])
    
    # format data as story/single highlight pairs
    for j in range(len(highlights)):
        source_text_train.append('\n\n@highlight\n\n'.join([story]+highlights[:j]+highlights[j+1:]))
        target_text_train.append(highlights[j])
    
    if (i+1)%100 == 0:
        print(i+1, "passed", end = ', ')

# print the time this took in minutes
print('\n\ntime:', (time.time()-start)/60,'minutes')

train_df = pd.DataFrame(list(zip(source_text_train, target_text_train)),columns =['source', 'target'])
train_dataset = Dataset.from_pandas(train_df)
print(train_dataset)
train_df.head()

100 passed, 

time: 0.2814207275708516 minutes
Dataset({
    features: ['source', 'target'],
    num_rows: 357
})


Unnamed: 0,source,target
0,-LRB- CNN -RRB- -- If you travel by plane and ...,Hawaiian Airlines again lands at No. 1 in on-t...
1,-LRB- CNN -RRB- -- If you travel by plane and ...,The Airline Quality Rankings Report looks at t...
2,-LRB- CNN -RRB- -- If you travel by plane and ...,ExpressJet and American Airlines had the worst...
3,-LRB- CNN -RRB- -- If you travel by plane and ...,Virgin America had the best baggage handling ;...
4,-LRB- CNN -RRB- For the second time during his...,The 15 new cardinals will be installed on Febr...


In [21]:
print('='*60+'\nEXAMPLE INPUT TEXT\n'+'='*60)
print(source_text_train[0])
print('='*60+'\nEXAMPLE TARGET TEXT\n'+'='*60)
print(target_text_train[0])

EXAMPLE INPUT TEXT
-LRB- CNN -RRB- -- If you travel by plane and arriving on time makes a difference , try to book on Hawaiian Airlines . In 2012 , passengers got where they needed to go without delay on the carrier more than nine times out of 10 , according to a study released on Monday .

In fact , Hawaiian got even better from 2011 , when it had a 92.8 % on-time performance . Last year , it improved to 93.4 % .

The Airline Quality Rankings Report looks at the 14 largest U.S. airlines and is based on an analysis of U.S. Department of Transportation figures . It 's co-authored by Brent Bowen , the head of the Department of Aviation Technology at Purdue University , and Dean Headley of Wichita State .

In addition to on-time performance , the joint project looks at three other categories : rate of consumer complaints , mishandled bags and denied boarding performance .

At a time when U.S. airlines are a whipping post for passenger complaints about crowded flights , tight seats , costl

In [7]:
source_text_val = []
target_text_val = []

# get list of validation files
val_files = cnn_meta[cnn_meta['validation']==1].reset_index()['story']

start = time.time()

# for i in range(len(val_files)):
for i in range(20):
    
    # get formatted input and target
    story, highlights = format_story('ernesto/cnn_stories_tokenized/'+ val_files[i])
    
    for j in range(len(highlights)):
        source_text_val.append('\n\n@highlight\n\n'.join([story]+highlights[:j]+highlights[j+1:]))
        target_text_val.append(highlights[j])
    
#     source_text.append(tokenizer('summarize: ' + story, return_tensors='tf').input_ids)
#     target_text.append(tokenizer(highlights, return_tensors='tf').input_ids)
    
    if (i+1)%100 == 0:
        print(i+1, "passed", end = ', ')

# print the time this took in minutes
print('\n\ntime:', (time.time()-start)/60,'minutes')

val_df = pd.DataFrame(list(zip(source_text_val, target_text_val)),columns =['source', 'target'])
val_dataset = Dataset.from_pandas(val_df)
print(val_dataset)



time: 0.0857517917950948 minutes
Dataset({
    features: ['source', 'target'],
    num_rows: 50
})


# Let's get some baseline loss values

In [8]:
# check some stories for pre-training loss
for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('ernesto/cnn_stories_tokenized/'+val_files[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + '\n\n@highlight\n\n'.join([story]+highlights[1:]),
                          return_tensors='pt').input_ids
    labels = tokenizer(highlights[0], return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors


tensor(1.6362, grad_fn=<NllLossBackward>)
tensor(3.1221, grad_fn=<NllLossBackward>)
tensor(4.4120, grad_fn=<NllLossBackward>)
tensor(4.4682, grad_fn=<NllLossBackward>)
tensor(4.3808, grad_fn=<NllLossBackward>)


## Tokenize Data

In [9]:
train_tokenized = train_dataset.map(tokenize, batched=True, batch_size=512)
val_tokenized = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [10]:
val_tokenized

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'source', 'target'],
    num_rows: 50
})

In [11]:
start = time.time()

output_dir = 'input_highlights_model'

# training arguments to feed to Trainer object
training_args = TrainingArguments(
    output_dir = output_dir, # trained model will be saved here
    num_train_epochs = 2,
    per_device_train_batch_size = 8, # number of examples per batch
    per_device_eval_batch_size = 8, # number of examples per batch
    eval_accumulation_steps = 1,
    prediction_loss_only = True,
    learning_rate = 0.001,
    evaluation_strategy = 'steps',
    save_steps = 10,
    save_total_limit = 1,
    remove_unused_columns = True,
    run_name = 'run_name',
    logging_steps = 500, # print loss after this many steps
    eval_steps = 500, # calculate loss after this many steps
    logging_first_step = False,
    load_best_model_at_end = True,
    metric_for_best_model = "loss", 
    greater_is_better = False
)

# create Trainer to feed the train/dev data
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_tokenized,
    eval_dataset = val_tokenized
)

# train the model and save it to our directory
trainer.train()
trainer.save_model(output_dir + '/model')

# print the time this took in minutes
print('\n\ntime:', (time.time()-start)/60, 'minutes')

Step,Training Loss,Validation Loss




time: 10.137892361481985 minutes


# How did Training affect the loss?

In [12]:
### Check the same stories as before and pray the loss has decreased ###

# load our model
baseline_model = T5ForConditionalGeneration.from_pretrained('baseline_model/model')

for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('ernesto/cnn_stories_tokenized/'+val_files[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + '\n\n@highlight\n\n'.join([story]+highlights[1:]),
                                                                    return_tensors='pt').input_ids
    labels = tokenizer(highlights[0], return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = baseline_model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

tensor(1.0389, grad_fn=<NllLossBackward>)
tensor(2.0359, grad_fn=<NllLossBackward>)
tensor(4.0702, grad_fn=<NllLossBackward>)
tensor(2.7790, grad_fn=<NllLossBackward>)
tensor(3.1778, grad_fn=<NllLossBackward>)


## Evaluate

In [13]:
# for scoring outputs
rouge = Rouge()

# get list of training files
test_files = cnn_meta[cnn_meta['test']==1].reset_index()['story']

mean_rouge = []
max_rouge = []

# for i in range(len(test_files)):
for i in range(5):
    # format the text to input/target format
    story, highlights = format_story('ernesto/cnn_stories_tokenized/'+test_files[i])

    # encode the input
    encoded = tokenizer.encode('summarize: ' + story.replace('\n',' '), return_tensors='pt')

    # generate the output
    output = baseline_model.generate(encoded, num_beams=4, no_repeat_ngram_size=2,
                             min_length=30, max_length=300, early_stopping=True)
    summary = tokenizer.decode(output[0])
    print(summary)
    print('')
    
    # get ROUGE scores between thoutput and highlights
    scores = [rouge.get_scores(summary,highlight)[0]['rouge-1']['f'] for highlight in highlights]

    mean_rouge.append(np.mean(scores))
    max_rouge.append(max(scores))

print('ROUGE F1 (mean):',np.mean(mean_rouge))
print('ROUGE F1 (best):',np.mean(max_rouge))

<pad> <unk> The Dukes of Hazzard '' ran until 1985 and spawned television movies, an animated series and video games</s>

<pad> It doesn't matter what anyone says, he is presumed to be innocent. A request for comment from an attorney was not returned</s>

<pad> <unk> No challenge poses more of a public threat than climate change, '' he says.<pad><pad> English lawmakers are suing the EPA for failing to reframe the issue</s>

<pad> A clip of the video features two men holding hands, a gay couple says's getting married this summer to someone I really care about</s>

<pad> Rubio is a fierce opponent of <unk> Obamacare '' and wants to repeal the law. He has warned Cuba is taking advantage of the U.S.</s>

ROUGE F1 (mean): 0.17037628522831902
ROUGE F1 (best): 0.23547359592824982
