In [1]:
# !pip install transformers
# !pip install rouge
# !pip install sentencepiece
# !pip install nomkl
# !pip install datasets
# !pip install pytorch

In [2]:
import os
import time
import numpy as np
import pandas as pd
import collections

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFT5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset
from rouge import Rouge
import nltk.data
nltk.download('punkt')

#### Some functions for convenience later

In [3]:
def format_story(filename):
    """Given the CNN data file, reformats to separate the story from the highlights.
    Highlights are returned as a single string"""
    
    file = open(filename,'r')
    text = file.read()
    
    # split the story and highlights
    split_text = text.split('\n\n@highlight\n\n')
    story = split_text[0]
    highlights = split_text[1:]
    
    # return both, rejoining highlights as a single string
    return story, '. '.join(highlights)+'.'

def cos_sims(out_sent, ref_sents):
    "gets cosine similarities for an output sentence with respect to the highlight sentences. Returns the sum of values."
    
    vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
    
    # get sentence level vectors with tf-idf
    tfidf = vect.fit_transform([out_sent] + ref_sents)
    
    # get similarity matrix
    similarity_mat = tfidf * tfidf.T
    
    # only values comparing "out_sent" with each sent in "ref_sents"
    return similarity_mat.toarray()[:1,1:][0]

## Load a T5 Tokenizer and Model

In [4]:
# t5 model objects
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




In [5]:
# get list of story files in our dataset
textFiles = os.listdir('cnn_stories_tokenized')
textFiles[:5]

['0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 '0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 '00027e965c8264c35cc1bc55556db388da82b07f.story',
 '0002c17436637c4fe1837c935c04de47adb18e9a.story',
 '0003ad6ef0c37534f80b55b4235108024b407f0b.story']

# Let's get some baseline loss values

In [6]:
#text_files = ['example.story', 'example1.story', 'example2.story']

for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + story, return_tensors='pt').input_ids
    labels = tokenizer(highlights, return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

Token indices sequence length is longer than the specified maximum sequence length for this model (2349 > 512). Running this sequence through the model will result in indexing errors


tensor(2.7795, grad_fn=<NllLossBackward>)
tensor(2.8156, grad_fn=<NllLossBackward>)
tensor(2.7631, grad_fn=<NllLossBackward>)
tensor(2.8409, grad_fn=<NllLossBackward>)
tensor(3.4472, grad_fn=<NllLossBackward>)


# Split Train/Dev/Test like Abisee

In [33]:
source_text = []
target_text = []

start = time.time()

for i in range(1000):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    source_text.append(story)
    target_text.append(highlights)
    
#     source_text.append(tokenizer('summarize: ' + story, return_tensors='tf').input_ids)
#     target_text.append(tokenizer(highlights, return_tensors='tf').input_ids)
    
    if (i+1)%50 == 0:
        print(i+1, "passed", end = ', ')
        
print('\n\ntime:', (time.time()-start)/60)

5 passed, 10 passed, 15 passed, 20 passed, 25 passed, 30 passed, 35 passed, 40 passed, 45 passed, 50 passed, 55 passed, 60 passed, 65 passed, 70 passed, 75 passed, 80 passed, 85 passed, 90 passed, 95 passed, 100 passed, 105 passed, 110 passed, 115 passed, 120 passed, 125 passed, 130 passed, 135 passed, 140 passed, 145 passed, 150 passed, 155 passed, 160 passed, 165 passed, 170 passed, 175 passed, 180 passed, 185 passed, 190 passed, 195 passed, 200 passed, 205 passed, 210 passed, 215 passed, 220 passed, 225 passed, 230 passed, 235 passed, 240 passed, 245 passed, 250 passed, 255 passed, 260 passed, 265 passed, 270 passed, 275 passed, 280 passed, 285 passed, 290 passed, 295 passed, 300 passed, 305 passed, 310 passed, 315 passed, 320 passed, 325 passed, 330 passed, 335 passed, 340 passed, 345 passed, 350 passed, 355 passed, 360 passed, 365 passed, 370 passed, 375 passed, 380 passed, 385 passed, 390 passed, 395 passed, 400 passed, 405 passed, 410 passed, 415 passed, 420 passed, 425 passed, 

In [34]:
train_1k = pd.DataFrame(list(zip(source_text, target_text)),columns =['source', 'target'])
train_1k = Dataset.from_pandas(train_1k)
# train_1k.iloc[0,1]

dataset = train_1k.train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']
val_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 100
})

In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-base')

def tokenize(batch):
    tokenized_input = tokenizer(batch['source'], padding='max_length', truncation=True)
    tokenized_label = tokenizer(batch['target'], padding='max_length', truncation=True)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [10]:
val_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'source', 'target'],
    num_rows: 10
})

In [36]:
start = time.time()

output_dir = 'model'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=10, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_name', # Wandb run name
    logging_steps=10, # How often to log loss to wandb
    eval_steps=10, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.save_model(output_dir + '/model')

print('\n\ntime:', (time.time()-start)/60)

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
10,0.3609,0.282724,20.0554,4.986
20,0.3402,0.281036,21.7259,4.603
30,0.3249,0.277,21.8457,4.578
40,0.3018,0.274286,21.9858,4.548
50,0.2934,0.27075,22.5894,4.427
60,0.3197,0.271339,22.6061,4.424
70,0.3212,0.26685,22.5708,4.43
80,0.2867,0.266194,22.5887,4.427
90,0.3097,0.269828,22.572,4.43
100,0.2923,0.266014,22.6264,4.42




time: 38.63646587928136


# How did Training affect the loss?

In [37]:
# SECOND LOOK

our_model = T5ForConditionalGeneration.from_pretrained('model/model')

for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + story, return_tensors='pt').input_ids
    labels = tokenizer(highlights, return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = our_model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

Token indices sequence length is longer than the specified maximum sequence length for this model (2349 > 512). Running this sequence through the model will result in indexing errors


tensor(1.0787, grad_fn=<NllLossBackward>)
tensor(0.9237, grad_fn=<NllLossBackward>)
tensor(0.7326, grad_fn=<NllLossBackward>)
tensor(0.7725, grad_fn=<NllLossBackward>)
tensor(2.1319, grad_fn=<NllLossBackward>)


In [13]:
# with open('story_list.txt', 'w') as f:
#     for item in textFiles:
#         f.write("%s\n" % item)

In [38]:
# format the text to input/target format
story, highlights = format_story('cnn_stories_tokenized/'+textFiles[1000])

# encode the summary
encoded = tokenizer.encode('summarize: ' + story.replace('\n',' '), return_tensors='pt')

# decode
output = our_model.generate(encoded, num_beams=4, no_repeat_ngram_size=2,
                         min_length=30, max_length=300, early_stopping=True)
summary = tokenizer.decode(output[0])


# How does the ouput compare to the reference summary (e.g. highlights)?

In [39]:
# PRINT HIGHLIGHTS AND SUMMARY SENTENCES

splitter = nltk.data.load('tokenizers/punkt/english.pickle')

highlights_split = splitter.tokenize(highlights)

print('~'*100)
print('known highlights:')
print('~'*100)
for i in range(len(highlights_split)-1):
    print(highlights_split[i])
    print('')

print('')
    
summ_sentences = splitter.tokenize(summary)   

print('~'*100)
print('summary sentences:')
print('~'*100)
for i in range(len(summ_sentences)):
    print(summ_sentences[i])
    print('')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
known highlights:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`` The problem still is there .

The problem is in Washington , D.C. , '' says Murrieta mayor.

Immigrant rights advocate denounces `` anti-immigrant hate language ''.

140 undocumented Central American immigrants arrive in California from Texas.


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
summary sentences:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<pad> NEW : <unk> This is a failure to enforce federal law, '' says the national border patrol chief.

The busloads of immigrants were taken to U.S. processing centers in San Diego and El Centro.

Thousands of migrants have been detained in the United States since last month.</s>



# Use Cosine Similarity to select sentence with most novel information (i.e. the sentence least similar to the highlights). We will leave one highlight out of this process to compare with the output via ROUGE at the end.

In [40]:
# for each summary sent, get the sum of the cosine similarity scores of the sent with all highlights
# lowest score = most novel info


scores = []
for sent in summ_sentences:
#     score = sum(cos_sims(sent,highlights.split('. ')))
    score = sum(cos_sims(sent,highlights_split[:-1]))
    scores.append(score)

# output the sentence with the least similarity to highlights
print('Novel Info:')
print(summ_sentences[scores.index(min(scores))])
print('')

print('Left out highlight:')
print(highlights_split[-1])

Novel Info:
Thousands of migrants have been detained in the United States since last month.</s>

Left out highlight:
Protesters block them from being processed at the Murrieta Border Patrol station.


# Okay! We have an output sentence. Let's compare it via ROUGE to the highlight that we reserved for comparing to this output at the end.

In [41]:
rouge = Rouge()
print('Output vs Story')
print('='*15)
print('ROUGE score:',rouge.get_scores(summ_sentences[scores.index(min(scores))],highlights_split[-1])[0]['rouge-1']['f'])

Output vs Story
ROUGE score: 0.07692307195266304


# There is room for improvement.