In [6]:
# !pip install transformers
# !pip install rouge
# !pip install sentencepiece
# !pip install nomkl
# !pip install datasets
# !pip install pytorch

In [7]:
import os
import time
import numpy as np
import pandas as pd
import collections

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFT5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset
from rouge import Rouge
import nltk.data


#### Some functions for convenience later

In [8]:
def format_story(filename):
    """Given the CNN data file, reformats to separate the story from the highlights.
    Highlights are returned as a single string"""
    
    file = open(filename,'r')
    text = file.read()
    
    # split the story and highlights
    split_text = text.split('\n\n@highlight\n\n')
    source = '\n\n@highlight\n\n'.join(split_text[:-1])
    target = split_text[-1]
    
    # return both, rejoining highlights as a single string
    return source, target

def cos_sims(out_sent, ref_sents):
    "gets cosine similarities for an output sentence with respect to the highlight sentences. Returns the sum of values."
    
    vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
    
    # get sentence level vectors with tf-idf
    tfidf = vect.fit_transform([out_sent] + ref_sents)
    
    # get similarity matrix
    similarity_mat = tfidf * tfidf.T
    
    # only values comparing "out_sent" with each sent in "ref_sents"
    return similarity_mat.toarray()[:1,1:][0]

## Load a T5 Tokenizer and Model

In [9]:
# t5 model objects
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [10]:
# get list of story files in our dataset
textFiles = os.listdir('cnn_stories_tokenized')
textFiles[:5]

['0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 '0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 '00027e965c8264c35cc1bc55556db388da82b07f.story',
 '0002c17436637c4fe1837c935c04de47adb18e9a.story',
 '0003ad6ef0c37534f80b55b4235108024b407f0b.story']

In [11]:
story, highlight = format_story('cnn_stories_tokenized/'+textFiles[0])
print(story)
print('~'*50)
print(highlight)

It 's official : U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria .

Obama sent a letter to the heads of the House and Senate on Saturday night , hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons .

The proposed legislation from Obama asks Congress to approve the use of military force `` to deter , disrupt , prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction . ''

It 's a step that is set to turn an international crisis into a fierce domestic political battle .

There are key questions looming over the debate : What did U.N. weapons inspectors find in Syria ? What happens if Congress votes no ? And how will the Syrian government react ?

In a televised address from the White House Rose Garden earlier Saturday , the president said he would take his case to Congress , not because he has to 

# Let's get some baseline loss values

In [12]:
#text_files = ['example.story', 'example1.story', 'example2.story']

for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + story, return_tensors='pt').input_ids
    labels = tokenizer(highlights, return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

Token indices sequence length is longer than the specified maximum sequence length for this model (2412 > 512). Running this sequence through the model will result in indexing errors


tensor(4.1225, grad_fn=<NllLossBackward>)
tensor(3.1754, grad_fn=<NllLossBackward>)
tensor(3.9362, grad_fn=<NllLossBackward>)
tensor(2.8215, grad_fn=<NllLossBackward>)
tensor(3.4645, grad_fn=<NllLossBackward>)


# Train on 1000 stories (for dev purposes only. We will use a full training set for the final report)

In [13]:
source_text = []
target_text = []

start = time.time()

for i in range(1000):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    source_text.append(story)
    target_text.append(highlights)
    
#     source_text.append(tokenizer('summarize: ' + story, return_tensors='tf').input_ids)
#     target_text.append(tokenizer(highlights, return_tensors='tf').input_ids)
    
    if (i+1)%50 == 0:
        print(i+1, "passed", end = ', ')
        
print('\n\ntime:', (time.time()-start)/60)

5 passed, 10 passed, 15 passed, 20 passed, 25 passed, 30 passed, 35 passed, 40 passed, 45 passed, 50 passed, 55 passed, 60 passed, 65 passed, 70 passed, 75 passed, 80 passed, 85 passed, 90 passed, 95 passed, 100 passed, 105 passed, 110 passed, 115 passed, 120 passed, 125 passed, 130 passed, 135 passed, 140 passed, 145 passed, 150 passed, 155 passed, 160 passed, 165 passed, 170 passed, 175 passed, 180 passed, 185 passed, 190 passed, 195 passed, 200 passed, 205 passed, 210 passed, 215 passed, 220 passed, 225 passed, 230 passed, 235 passed, 240 passed, 245 passed, 250 passed, 255 passed, 260 passed, 265 passed, 270 passed, 275 passed, 280 passed, 285 passed, 290 passed, 295 passed, 300 passed, 305 passed, 310 passed, 315 passed, 320 passed, 325 passed, 330 passed, 335 passed, 340 passed, 345 passed, 350 passed, 355 passed, 360 passed, 365 passed, 370 passed, 375 passed, 380 passed, 385 passed, 390 passed, 395 passed, 400 passed, 405 passed, 410 passed, 415 passed, 420 passed, 425 passed, 

In [14]:
train_1k = pd.DataFrame(list(zip(source_text, target_text)),columns =['source', 'target'])
train_1k = Dataset.from_pandas(train_1k)
# train_1k.iloc[0,1]

dataset = train_1k.train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']
val_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 100
})

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-base')

def tokenize(batch):
    tokenized_input = tokenizer(batch['source'], padding='max_length', truncation=True)
    tokenized_label = tokenizer(batch['target'], padding='max_length', truncation=True)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [16]:
val_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'source', 'target'],
    num_rows: 100
})

In [17]:
output_dir = 'model-Copy1'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=10, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_name', # Wandb run name
    logging_steps=10, # How often to log loss to wandb
    eval_steps=10, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.save_model(output_dir + '/model')

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
10,3.464,0.155527,20.765,4.816
20,0.1526,0.12274,20.7541,4.818
30,0.1334,0.108892,20.8455,4.797
40,0.1173,0.101462,20.7799,4.812
50,0.1172,0.097633,20.8826,4.789
60,0.1069,0.096775,20.8434,4.798
70,0.1073,0.096146,20.8565,4.795
80,0.1049,0.094663,21.555,4.639
90,0.1161,0.093881,21.6057,4.628
100,0.1113,0.093527,21.552,4.64


# How did Training affect the loss?

In [23]:
# SECOND LOOK

our_model = T5ForConditionalGeneration.from_pretrained('model/model')

for i in range(5):
    
    # get formatted input and target
    story, highlights = format_story('cnn_stories_tokenized/'+textFiles[i])
    
    #train the model
    input_ids = tokenizer('summarize: ' + story, return_tensors='pt').input_ids
    labels = tokenizer(highlights, return_tensors='pt').input_ids
    
    # compute loss (this returns an array of things)
    loss = our_model(input_ids=input_ids, labels=labels).loss
    
    # print loss (sum of array values above)
    print(loss)

tensor(2.1583, grad_fn=<NllLossBackward>)
tensor(2.1307, grad_fn=<NllLossBackward>)
tensor(2.0575, grad_fn=<NllLossBackward>)
tensor(1.7447, grad_fn=<NllLossBackward>)
tensor(2.2165, grad_fn=<NllLossBackward>)


In [19]:
# with open('story_list.txt', 'w') as f:
#     for item in textFiles:
#         f.write("%s\n" % item)

In [29]:
# format the text to input/target format
story, highlights = format_story('cnn_stories_tokenized/'+textFiles[6001])

# encode the summary
encoded = tokenizer.encode('summarize: ' + story.replace('\n',' '), return_tensors='pt')

# decode
output = our_model.generate(encoded, num_beams=4, no_repeat_ngram_size=2,
                         min_length=10, max_length=400, early_stopping=True)
summary = tokenizer.decode(output[0])

print(summary)
print('='*70)
print(highlights)

<pad> Juventus have sacked coach Ciro Ferrara after a string of poor results. The Bianconeri have lost five of their last six games in Serie A and have slumped to sixth in the standings. Zaccheroni will guide this afternoon's training session before being officially unveiled.</s>
Liverpool manager Rafael Benitez has also been linked to the Turin giants


In [21]:
print(story)

-LRB- CNN -RRB- -- Juventus have sacked coach Ciro Ferrara after a string of poor results and have installed Alberto Zaccheroni in the hot seat until the end of the season .

Ferrara 's position has been threatened after a dismal run which has seen them slip out of contention in the Serie A title race as well as being eliminated from the Champions League at the group stages .

Thursday 's 2-1 Coppa Italia defeat to Italian champions Inter Milan proved the final straw and the club released a statement to confirm his departure and Zaccheroni 's arrival .

`` Zaccheroni will take over the team immediately and will guide this afternoon 's training session in Vinovo before being officially unveiled to the media , '' it read .

Ferrara joined Juve late last season and steered the Turin powerhouses to second place in Serie A behind Inter .

But his first full campaign in charge proved challenging after a promising start to the season .

The Bianconeri have lost five of their last six games in

# Okay! We have an output sentence. Let's compare it via ROUGE to the highlight that we reserved for comparing to this output at the end.

In [31]:
rouge = Rouge()
print('Output vs Story')
print('='*15)
print('ROUGE score:',rouge.get_scores(summary,highlights)[0]['rouge-1']['f'])

Output vs Story
ROUGE score: 0.0689655139595721


# There is room for improvement.