In [1]:
import os
import numpy as np
import pandas as pd
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
import torch

In [2]:
from transformers import LEDTokenizer, LongformerTokenizer, LEDForConditionalGeneration
import torch
from transformers import TrainingArguments, Trainer

### Collect Data

In [3]:
def read_text(path):
    """Read every story file directly under ``path`` into a DataFrame.

    Each file is read as UTF-8, newlines are replaced by spaces and the
    markers ``(CNN)`` and ``--`` are removed. Files whose cleaned text is
    shorter than 1000 characters are skipped. The cleaned text is split on
    ``@highlight``: the part before the first marker is the article body and
    the following parts are the highlights.

    Parameters
    ----------
    path : str
        Directory containing the story files. Sub-directories are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per kept file with the columns
        ``text`` (stripped article body),
        ``highlight`` (the first four highlights, each followed by ``'.'``,
        concatenated and stripped) and
        ``highlight_1`` .. ``highlight_4`` (the individual highlights, unstripped;
        ``""`` when the file has fewer highlights).
    """
    results = {'text':[], 'highlight': [], 'highlight_1':[], 'highlight_2':[], 'highlight_3':[], 'highlight_4':[]}
    for file in tqdm.tqdm(os.listdir(path)):
        file_name = os.path.join(path, file)
        # Test the joined path: a bare entry name is resolved against the
        # current working directory, which let sub-directories of `path`
        # through to open() and made it fail.
        if os.path.isdir(file_name):
            continue
        with open(file_name, encoding="utf-8") as f:
            text = f.read().replace('\n', " ").replace("(CNN)", "").replace("--", "")
        if len(text) < 1000:
            continue
        text_highlights = text.split("@highlight")
        results['text'].append(text_highlights[0].strip())
        all_highlight = ""
        # Only the first four highlights are kept; missing ones become "".
        for i in range(1, 5):
            key = 'highlight_' + str(i)
            if i < len(text_highlights):
                results[key].append(text_highlights[i])
                all_highlight += text_highlights[i] + '.'
            else:
                results[key].append("")
        results['highlight'].append(all_highlight.strip())
    return pd.DataFrame(results)

In [5]:
# Build the evaluation DataFrame from the story files under test_data/.
test_data = read_text('test_data/')

100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1239.84it/s]


In [6]:
# Display the loaded frame: article text, combined highlight and highlight_1..4.
test_data

Unnamed: 0,text,highlight,highlight_1,highlight_2,highlight_3,highlight_4
0,It's official: U.S. President Barack Obama wan...,Syrian official: Obama climbed to the top of t...,Syrian official: Obama climbed to the top of...,Obama sends a letter to the heads of the Hou...,Obama to seek congressional approval on mili...,"Aim is to determine whether CW were used, no..."
1,This week the Supreme Court heard two historic...,Ken Klukowski: Cases heard by Supreme Court co...,Ken Klukowski: Cases heard by Supreme Court ...,He says there are questions of whether cases...,"If court issues sweeping ruling, it could de...",Klukowski: Gay marriage is such a new phenom...
2,"Zango Town, Liberia At the gravesite in a no...",Liberia is one of the countries worst-hit by t...,Liberia is one of the countries worst-hit by...,Entire towns and villages have been placed i...,Health workers must ensure those who die of ...,"""Running away from Ebola is not a solution ..."
3,The big winners of this Formula One season cou...,The first race of the 2014 Formula One season ...,The first race of the 2014 Formula One seaso...,"Turbo engines are back in the sport, with ea...",Former F1 winner Jody Scheckter expects F1 t...,"For the first time in the sport's history, d..."
4,If that car parked in Harvard Yard is a rockin...,Harvard bans all romantic relationships betwee...,Harvard bans all romantic relationships betw...,Policy comes on heels of investigation into ...,,
...,...,...,...,...,...,...
1977,WASHINGTON The Obama administration is givin...,Departure of General Motors' CEO part of gover...,Departure of General Motors' CEO part of gov...,"GM official: White House signaled that ""new ...",Officials: GM to get 60 days of financing; C...,"GM, Chrysler were told to prove viability to..."
1978,NFL star Adrian Peterson pleaded no contest Tu...,Adrian Peterson says he loves his son and regr...,Adrian Peterson says he loves his son and re...,DA says the NFL star received no special tre...,"Peterson is on probation for 2 years, will m...","He is still on the Vikings roster, but has b..."
1979,(EW.com) Moms and Dads: Get your kids to take...,"Movie critics have crowned ""The Lego Movie"" as...","Movie critics have crowned ""The Lego Movie"" ...",Reviews are pegging it as a cross between Pi...,,
1980,The man who made Formula One's bravest comebac...,"Niki Lauda says Ferrari has made a ""very good""...","Niki Lauda says Ferrari has made a ""very goo...",The three-time world champion says it will a...,He warns managing Raikkonen and Alonso in 20...,Lauda says bringing Lewis Hamilton to Merced...


### Load Model

In [7]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when tensors are moved to `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
# Local directory holding the LED checkpoint loaded below.
model_dir = "./LED_model/"

# Load the LED sequence-to-sequence model weights from the local checkpoint.
generate_model = LEDForConditionalGeneration.from_pretrained(model_dir) 
# The tokenizer is loaded from the Longformer hub checkpoint, not from model_dir.
# NOTE(review): presumably this is the vocabulary the LED checkpoint was trained with — confirm.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [10]:
# Move the model to the selected device; the cell output is the module's repr.
generate_model.to(device)

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(2048, 768)
      (layers): ModuleList(
        (0): LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
          (s

### Generate Prediction & Test

In [10]:
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
from bert_score import score

# Import test function 
import test_baseline

In [16]:
# Define Generation Function

def generate_summary(summary_model, text, device):
    """Generate a summary string for ``text`` using ``summary_model``.

    The text is encoded with the notebook-level ``tokenizer`` (truncated to a
    maximum length of 1536), the encoded inputs are moved to ``device`` and
    passed to ``summary_model.generate`` with a generated length between 64
    and 128. The first generated sequence is decoded and the ``<s>`` / ``</s>``
    markers are removed from the resulting string.

    Parameters
    ----------
    summary_model : object exposing ``generate(input_ids=..., attention_mask=..., ...)``
    text : str
        Article text to summarise.
    device : str or torch.device
        Device the encoded inputs are moved to.

    Returns
    -------
    str
        The decoded summary without ``<s>`` / ``</s>`` markers.
    """
    encoded = tokenizer(text, padding=True, truncation=True,
                        return_tensors="pt", max_length=1536)
    generated = summary_model.generate(
        input_ids=encoded['input_ids'].to(device),
        attention_mask=encoded['attention_mask'].to(device),
        min_length=64,
        max_length=128,
    )
    # Only the first (and, for a single input text, only) sequence is decoded.
    summary = tokenizer.decode(generated[0])
    for marker in ("<s>", "</s>"):
        summary = summary.replace(marker, '')
    return summary


In [12]:
# Get Scores
#
# Generate a summary for every test article and accumulate the scores returned
# by test_baseline.rouge_1 / test_baseline.rouge_n (defined outside this notebook).
# NOTE: r_1 and r_2 are running SUMS over all articles, not averages; divide by
# len(predict_list) to obtain a per-article mean.

# Word-level tokenizer used for both predictions and references.
test_tokenizer = RegexpTokenizer(r'\w+')

r_1 = 0
r_2 = 0
n = 2  # n-gram order passed to test_baseline.rouge_n
predict_list = []  # generated summaries, kept for the BERTScore cell
ref_list = []      # reference highlights, aligned with predict_list
# iterrows() is a generator without a length, so pass `total` explicitly to
# give tqdm a percentage and an ETA for this long-running loop.
for index, row in tqdm.tqdm(test_data.iterrows(), total=len(test_data)):
    predict = generate_summary(generate_model, row['text'], device)
    predict_tokens = test_tokenizer.tokenize(predict)
    reference_tokens = test_tokenizer.tokenize(row['highlight'])
    predict_list.append(predict)
    ref_list.append(row['highlight'])
    r_1 += test_baseline.rouge_1(predict_tokens, reference_tokens)
    r_2 += test_baseline.rouge_n(predict_tokens, reference_tokens, n)

1982it [39:43,  1.20s/it]


In [13]:
# Sum (not mean) of the per-article rouge_1 scores accumulated in the scoring loop.
r_1

2077.1114204404357

In [14]:
# Sum (not mean) of the per-article rouge_n scores (n = 2) accumulated in the scoring loop.
r_2

1249.3672524816159

In [25]:
# BERTScore of the generated summaries against the reference highlights;
# the three returned values are unpacked as P, R and F1.
P, R, F1  = score(predict_list, ref_list, lang = "en", verbose = True)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/31 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/31 [00:00<?, ?it/s]

done in 28.99 seconds, 68.37 sentences/sec


In [27]:
# Sum of P over all scored pairs (a total, not a mean).
torch.sum(P)

tensor(1639.0594)

In [28]:
# Sum of R over all scored pairs (a total, not a mean).
torch.sum(R)

tensor(1636.1899)

In [29]:
# Sum of F1 over all scored pairs (a total, not a mean).
torch.sum(F1)

tensor(1637.5520)

### Generate several examples

In [23]:
 # Generate a summary for the article at index label 772 of test_data.
 generate_summary(generate_model, test_data['text'][772], device)

'The U.S. Supreme Court rules on a lower court ruling on same-sex marriage .  The ruling was a first legal legal challenge in the state\'s history of the American community .  The Supreme Court ruled in 2008 that it was a "great day" for the ruling. .  The ruling was a "pantantantant decision" to end the court\'s decision.'

In [24]:
 # Show the source article at index label 772 for comparison with its generated summary.
 test_data['text'][772]

"Five-time world champions Brazil needed to come from two goals down to beat underdogs the United States 3-2 in the final of the Confederations Cup in South Africa on Sunday night.  Brazil celebrated a third Confederations Cup triumph following victories in 2005 and 1997.  Captain Lucio headed an 84th-minute winner to end the hopes of the plucky Americans, who scored twice in the first half-hour in Johannesburg.  Clint Dempsey, who netted in the shock 2-0 win against European champions Spain in the semi-finals, put the U.S. ahead in the 10th minute.  A massive upset seemed to be on the cards when captain Landon Donovan doubled the lead in the 27th minute, but Brazil reduced the deficit just 41 seconds after the half-time break through Luis Fabiano.  The striker then leveled with his fifth goal of the tournament in the 74th minute before Lucio rose highest to ensure that Brazil retained the title.  Brazil went into the game seeking a massive improvement on their dismal showing in labori