In [1]:
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration, pipeline
import tensorflow as tf
from tensorflow import *
from rouge import Rouge
import collections
import numpy as np

## Learning how to ROUGE

In [2]:
# testing out this rouge package
# googled to find, so we should do some research about a potential better package

s1 = """I am trying as best I can, using as many words as I can, to make this sentence as long as possible,
        though it has little meaning."""
s2 = 'I am trying as best I can, using as many words as I can, to make this sentence as long as possible.'
s3 = 'I am trying as best I can to make this sentence as long as possible.'
s4 = 'I am trying to make this sentence as long as possible.'

# ROUGE object
rouge = Rouge()

# get some scores
print(rouge.get_scores(s1, s2))
print('~'*70)
print(rouge.get_scores(s1, s4))
print('~'*70)
rouge.get_scores(s1, s2)[0]['rouge-1']['f']

[{'rouge-1': {'f': 0.8571428521949189,
              'p': 0.7777777777777778,
              'r': 0.9545454545454546},
  'rouge-2': {'f': 0.8510638248438207,
              'p': 0.7692307692307693,
              'r': 0.9523809523809523},
  'rouge-l': {'f': 0.7999999951020409, 'p': 0.7, 'r': 0.9333333333333333}}]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{'rouge-1': {'f': 0.5263157853601108,
              'p': 0.37037037037037035,
              'r': 0.9090909090909091},
  'rouge-2': {'f': 0.44444444043209885, 'p': 0.3076923076923077, 'r': 0.8},
  'rouge-l': {'f': 0.5999999955555556, 'p': 0.45, 'r': 0.9}}]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


0.8571428521949189

## T5, let's go!

In [3]:
# t5 model objects
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = TFT5ForConditionalGeneration.from_pretrained('t5-base')

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [4]:
# read the example story
f = open('example.story','r')
text = f.read()

# separate the higlights from the main story
split_text = text.split('\n\n@highlight\n\n')
story = split_text[0]
story

'ORME, Tennessee (CNN) -- The drought in the Southeastern United States means more than just brown lawns to the folks in Orme, Tennessee. Water flows from their taps for just three hours each evening.\n\nA 1961 firetruck loads up with water from a hydrant in Alabama to haul back to Orme, Tennessee.\n\nThe mountain spring that supplies water to the town usually dries up at the end of summer, but just for a few days. This year it dried up early, on August 1, and hasn\'t revived, leaving the town\'s 145 residents high and dry and relying on water trucked in from the next state.\n\nEvery day at 6 p.m., Orme Mayor Tony Reames turns a big valve to release water from the town\'s tank. When he turns the crank again at 9 p.m., taps in the town run dry.\n\n"When they cut it back on we jump for joy," Orme resident Debbie Cash said. "And then you only have it for three hours."\n\nThree hours to do all the laundry, bathing, dishwashing and animal watering that has to be done.  Watch how Cash copes 

## Summarize the article with our t5 model

In [5]:
# encode the summary
encoded = tokenizer.encode('summarize: ' + story.replace('\n',' '), return_tensors='tf')

# decode
output = model.generate(encoded, num_beams=4, no_repeat_ngram_size=2,
                         min_length=30, max_length=300, early_stopping=True)
summary = tokenizer.decode(output[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (814 > 512). Running this sequence through the model will result in indexing errors


In [6]:
# PRINT HIGHLIGHTS AND SUMMARY SENTENCES

highlights = split_text[1:]

print('~'*100)
print('highlights:')
print('~'*100)
for i in range(len(highlights)):
    print(highlights[i])

print('')
    
summ_sentences = summary.split('. ')   

print('~'*100)
print('summary sentences:')
print('~'*100)
for i in range(len(summ_sentences)):
    print(summ_sentences[i])

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
highlights:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Orme, Tennessee, has running water from 6 p.m. to 9 p.m.
Town's spring ran dry in midst of Southeast drought
Donated water is trucked in from Alabama
Completion of pipeline will solve problem for good

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
summary sentences:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<pad> orme, tennessee, residents rely on water trucked in from the next state
town's mountain spring usually dries up at the end of summer, but just for a few days
this year it dried up early, on august 1, and hasn't revived
4.5 million people in and around Atlanta, Georgia, are nervously watching water levels go down.


# get sentence with most novel information using ROUGE

In [7]:
# for each summary sent, get the sum of the ROUGE scores of the sent with all highlights
# not very sophisticated, i'm sure we'll come up with something better
# but it's a start

scores = []
for sent in summ_sentences:
    score = sum([rouge.get_scores(sent, highlight)[0]['rouge-1']['f'] for highlight in highlights])
    scores.append(score)

# output the sentence with the least similarity to highlights
print('Novel Info:')
summ_sentences[scores.index(min(scores))]

Novel Info:


"this year it dried up early, on august 1, and hasn't revived"

# What about Cosine Similarity?

We can use TF-IDF to vectorize the sentences, and use cosime similarity to find the sentence least like the highlights.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# make up some similar sentences
s0 = """I am trying as best I can, using as many words as I can, to make this sentence as long as possible,
        though it has little meaning."""
s1 = 'I am trying as best I can, using as many words as I can, to make this sentence as long as possible.'
s2 = 'I am trying as best I can to make this sentence as long as possible.'
s3 = 'I am trying to make this sentence as long as possible.'

# add in a totally different sentence
s4 = 'Liver tastes terrible.'

def cos_sims(out_sent, ref_sents):
    vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
    tfidf = vect.fit_transform([out_sent] + ref_sents)  

    similarity_mat = tfidf * tfidf.T

    return similarity_mat.toarray()[:1,1:][0]

similarities = cos_sims(s0,[s1,s2,s3,s4])

print('similarity of s0 to:') 
for i in range(4):
    print('s'+str(i+1),'=',similarities[i])
    print(str(1))

similarity of s0 to:
stf.Tensor(1, shape=(), dtype=int32) = 0.7907476749448838
1
stf.Tensor(2, shape=(), dtype=int32) = 0.6175531063761837
1
stf.Tensor(3, shape=(), dtype=int32) = 0.5452877544361474
1
stf.Tensor(4, shape=(), dtype=int32) = 0.0
1


# get sentence with most novel information using Cosine Sim

In [9]:
# for each summary sent, get the sum of the cosine similarity scores of the sent with all highlights
# lowest score = most novel info

scores = []
for sent in summ_sentences:
    score = sum(cos_sims(sent,highlights))
    scores.append(score)

# output the sentence with the least similarity to highlights
print('Novel Info:')
summ_sentences[scores.index(min(scores))]

Novel Info:


"this year it dried up early, on august 1, and hasn't revived"

## WOOOOHOOO, this method results in the same sentence!

#### BUT I didn't leave out a highlight to compare the sentence with. Let's run this again, but leave out one highlight. We'll use ROUGE to compare this left-out highlight to the target sentence.

In [10]:
#use cosine similarity to 

scores = []
for sent in summ_sentences:
    score = sum(cos_sims(sent,highlights[:-1]))
    scores.append(score)

# output the sentence with the least similarity to highlights
print('Novel Info:')
novel = summ_sentences[scores.index(min(scores))]
print(novel)

print('\nLeft Out Highlight:')
print(highlights[-1])

print('\nHow similar to left out highlight?')
print('ROUGE score:',rouge.get_scores(novel,highlights[-1])[0]['rouge-1']['f'])

Novel Info:
this year it dried up early, on august 1, and hasn't revived

Left Out Highlight:
Completion of pipeline will solve problem for good

How similar to left out highlight?
ROUGE score: 0.0


# OUCH.... ROUGE score was zero.
### But we know this was a decent output right? It gave some novel info (the 8/1 date). So we need to think of a new way of evaluating. What about not excluding any highlights, and getting ROUGE for the output with the entire article?

In [14]:
print('Output vs Story')
print('='*15)
print('ROUGE score:',rouge.get_scores(novel,story)[0]['rouge-1']['f'])

Output vs Story
ROUGE score: 0.03355704658528895
