In [42]:
!nvidia-smi

Tue Mar 22 13:43:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    34W / 250W |  10145MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
# clone repro and install deps
!git clone -b main https://github.com/dleve123/topics-in-nlp-repro-project
%cd /content/topics-in-nlp-repro-project/
!pip install -r requirements.txt

# mount validation data
from google.colab import drive
drive.mount('/content/drive')

In [41]:
%cd /content/topics-in-nlp-repro-project/

/content/topics-in-nlp-repro-project


In [4]:
! git fetch
! git reset --hard origin/main

HEAD is now at 528cc61 Changed default bath size to 2


In [None]:
# TODO fix later, for now run in cell
# ! python inference_correction_model.py /content/drive/MyDrive/CS6741/replication/data/tokenized/val.tokenized.jsonl

In [5]:
import argparse
from time import perf_counter
from model.correction_model import CorrectionModel
from preprocessing.prepare_train_dataset import tensors_from_jsonl_filepath


# Load the tokenized test split from the mounted Drive folder; every later cell
# samples from or iterates over `test_data`.
test_data = tensors_from_jsonl_filepath("/content/drive/MyDrive/CS6741/replication/data/tokenized/test.bart.tokenized.jsonl")

In [43]:
import random
# Seed the global RNG so the same evaluation subset is drawn on every run.
random.seed(42)
# NOTE(review): random.choices samples WITH replacement, so the 250 entries may
# contain duplicate test examples. Confirm this is intended; random.sample would
# draw without replacement.
test_subset = random.choices(test_data, k=250)

## Utils

In [7]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch

def eval_get_corrected_indices(prediction_logits):
  """Find the examples whose best-scoring candidate is not candidate 0.

  Args:
    prediction_logits: sized sequence with one entry per example. Each entry is
      indexed as ``logits[candidate, class]``; column ``FAITHFUL_CLASS`` holds
      the score of the "faithful" class for every candidate.

  Returns:
    A list of ``(example_index, candidate_index)`` tuples, one for each example
    whose highest faithful-class score belongs to a candidate other than
    index 0. Examples where candidate 0 ranks first are omitted.

  Side effects:
    Prints the number and percentage of corrected examples. An empty input
    prints ``0.00%`` instead of raising ``ZeroDivisionError``.
  """
  FAITHFUL_CLASS = 1

  corrected_indices = []
  for idx, logits in enumerate(prediction_logits):
    # Position of the candidate with the highest faithful-class logit.
    max_ranked = logits[:, FAITHFUL_CLASS].argmax().item()
    if max_ranked != 0:
      corrected_indices.append((idx, max_ranked))

  total = len(prediction_logits)
  # Guard the ratio so an empty batch of predictions does not divide by zero.
  ratio = len(corrected_indices) / total if total else 0.0
  print(f"Corrected: {len(corrected_indices)} ({ratio:.2%})")
  return corrected_indices

def decode_summary(model, tensor):
    """Split a token-id tensor at its first EOS token and decode both parts.

    Args:
        model: object exposing a ``tokenizer`` with ``eos_token_id`` and
            ``decode(ids, skip_special_tokens=...)``.
        tensor: token ids; assumed to be 1-D and to contain at least one EOS
            token (indexing the first match fails otherwise) -- TODO confirm.

    Returns:
        A ``(before_eos, from_eos)`` tuple of decoded strings. The second slice
        starts at the EOS token itself; special tokens are skipped when
        decoding both slices.
    """
    tokenizer = model.tokenizer
    eos_positions = (tensor == tokenizer.eos_token_id).nonzero()
    split_at = eos_positions[0].item()
    leading_text = tokenizer.decode(tensor[:split_at], skip_special_tokens=True)
    trailing_text = tokenizer.decode(tensor[split_at:], skip_special_tokens=True)
    return leading_text, trailing_text
  
def decode_corrected_summaries(dset, model, corrected_indices):
  """Decode the selected and the original summary for every corrected example.

  Args:
    dset: indexable dataset; ``dset[i][j]`` is the token tensor of candidate
      ``j`` for example ``i``, and candidate 0 is treated as the original.
    model: passed through to ``decode_summary`` for tokenizer access.
    corrected_indices: iterable of ``(example_index, candidate_index)`` pairs.

  Returns:
    A list of dicts with keys ``"corrected"``, ``"original"`` and ``"source"``.
    The source text is the one decoded alongside the original summary.
  """
  decoded = []
  for example_pos, candidate_pos in tqdm(corrected_indices):
    candidates = dset[example_pos]
    corrected_text, _ = decode_summary(model, candidates[candidate_pos])
    original_text, source_text = decode_summary(model, candidates[0])
    decoded.append({
        "corrected": corrected_text,
        "original": original_text,
        "source": source_text,
    })
  return decoded

## EVAL - Run FEQA

In [44]:
feqa_score_cache = {}

In [45]:
# Resolve paths from root project directory
import os
import sys
from eval_feqa import download_models, evaluate as evaluate_feqa

download_models()

def evaluate_feqa_with_cache(docs, sums):
  """Score summaries with FEQA, reusing scores already in `feqa_score_cache`.

  Args:
    docs: source documents, aligned position-by-position with `sums`.
    sums: summaries to score.

  Returns:
    A list of scores in the same order as `sums`.

  Side effects:
    Every newly computed score is stored in the module-level
    `feqa_score_cache`. The cache is keyed by summary text only, so a summary
    seen earlier with a different document reuses its earlier score.
  """
  scores = []
  misses = []  # (position, document, summary) for every uncached summary
  for position, (summary, doc) in enumerate(zip(sums, docs)):
    is_cached = summary in feqa_score_cache
    scores.append(feqa_score_cache[summary] if is_cached else None)
    if not is_cached:
      misses.append((position, doc, summary))

  if misses:
    # Only the uncached pairs go through the (slow) FEQA pipeline.
    fresh_scores = evaluate_feqa(
        [doc for _, doc, _ in misses],
        [summary for _, _, summary in misses],
        "/content/drive/MyDrive/CS6741/replication/feqa-assets/squad1.0",
        "/content/drive/MyDrive/CS6741/replication/feqa-assets/checkpoints"
    )
    for fresh, (position, _, summary) in zip(fresh_scores, misses):
      scores[position] = fresh
      feqa_score_cache[summary] = fresh
  return scores

def run_feqa(summaries, N=100):
  """Compute FEQA scores for the original and corrected summaries.

  Args:
    summaries: list of dicts with keys ``"original"``, ``"corrected"`` and
      ``"source"``.
    N: only the first `N` entries of `summaries` are scored.

  Returns:
    ``(orig_feqa_scores, corr_feqa_scores)``: two score lists aligned with the
    first `N` summaries, both scored against the same source documents.
  """
  selected = summaries[:N]
  # Extract all three text columns before any scoring starts.
  columns = {
      field: [entry[field] for entry in selected]
      for field in ("original", "corrected", "source")
  }

  original_scores = evaluate_feqa_with_cache(columns["source"], columns["original"])
  corrected_scores = evaluate_feqa_with_cache(columns["source"], columns["corrected"])
  return original_scores, corrected_scores

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Caching FEQA scores

In [87]:
import pickle
# Load the FEQA score cache persisted on Drive.
# NOTE: unpickling can execute arbitrary code, so only load a cache file that
# this project wrote itself.
with open("/content/drive/MyDrive/CS6741/replication/feqa-assets/feqa_score_cache.pickle", "rb") as f:
  persisted_feqa_score_cache = pickle.load(f)

# Report the sizes of the in-memory and on-disk caches.
print("Local Cache", len(feqa_score_cache))
print("Persisted Cache", len(persisted_feqa_score_cache))

def cache_update(source, target):
  """Copy into `target` every entry of `source` whose key it lacks.

  Existing entries in `target` are never overwritten.

  Returns:
    The number of entries that were added to `target`.
  """
  missing = {key: value for key, value in source.items() if key not in target}
  target.update(missing)
  return len(missing)

# Two-way merge: first pull persisted scores into the in-memory cache...
local_updates = cache_update(
    source=persisted_feqa_score_cache,
    target=feqa_score_cache
)
# ...then push scores that exist only in memory into the persisted cache.
persisted_updates = cache_update(
    source=feqa_score_cache,
    target=persisted_feqa_score_cache
)

if local_updates > 0:
  print(f"Added {local_updates} sums to local cache")

# The pickle on Drive is rewritten only when the persisted cache gained entries.
if persisted_updates > 0:
  print(f"Added {persisted_updates} sums to persisted cache, writing...")

  with open("/content/drive/MyDrive/CS6741/replication/feqa-assets/feqa_score_cache.pickle", "wb") as f:
    pickle.dump(persisted_feqa_score_cache, f)

Local Cache 4798
Persisted Cache 4798


## Random baseline

In [None]:
# Baseline: a CorrectionModel built without a checkpoint, run on the sampled
# test subset.
baseline_model = CorrectionModel()
baseline_predictions = baseline_model.batch_inference(test_subset)
baseline_corrected_indices = eval_get_corrected_indices(baseline_predictions)
baseline_summaries = decode_corrected_summaries(
    test_subset,
    baseline_model,
    baseline_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
baseline_summaries[0]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 97%|█████████▋| 242/250 [00:39<00:00,  8.73it/s]

In [35]:
import numpy as np
# Score the baseline's original and corrected summaries with FEQA. N is the
# subset size, so the [:N] slice inside run_feqa does not drop any summary.
orig_feqa_scores, corr_feqa_scores = run_feqa(baseline_summaries, len(test_subset))
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/1648)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/1648)...
Generating questions (batch 128/1648)...
Generating questions (batch 192/1648)...
Generating questions (batch 256/1648)...
Generating questions (batch 320/1648)...
Generating questions (batch 384/1648)...
Generating questions (batch 448/1648)...
Generating questions (batch 512/1648)...
Generating questions (batch 576/1648)...
Generating questions (batch 640/1648)...
Generating questions (batch 704/1648)...
Generating questions (batch 768/1648)...
Generating questions (batch 832/1648)...
Generating questions (batch 896/1648)...
Generating questions (batch 960/1648)...
Generating questions (batch 1024/1648)...
Generating questions (batch 1088/1648)...
Generating questions (batch 1152/1648)...
Generating questions (batch 1216/1648)...
Generating questions (batch 1280/1648)...
Generating questions (batch 1344/1648)...
Generating questions (batch 1408/1648)...
Generating questions (batch 1472/1648)...
Generating questions (batch 1536/1648)...
Generati

# Train v1

## Checkpoint 15k steps

In [48]:
# Run the train_v1 checkpoint saved at 15,000 total steps on the sampled test
# subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-0_totalsteps-15000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 250/250 [00:40<00:00,  6.22it/s]


Corrected: 73 (29.20%)


100%|██████████| 73/73 [00:02<00:00, 30.04it/s]


{'corrected': 'A restaurant owner in southern Spain has described a "stampede" in which a group of ¬2,000 diners left his restaurant in the middle of a meal.',
 'original': 'A restaurant owner in southern Spain has described a "stampede" in which a group of 20 diners left his restaurant in the middle of a meal.',
 'source': 'The Romanian diners, who had paid a deposit of â‚¬900 ($950; Â£770), left the El Carmen restaurant in Bembibre as dessert was due to be served, Antonio Rodriguez said.\n"It happened in the space of a minute," he said. "It was something they had planned and they left in a stampede."\nThe diners owe â‚¬2,000 more, he said.\nMr Rodriguez gave police the details on the reservation but said he held out little hope of being repaid. Police told El Pais newspaper they had not yet been able to contact any of the diners.\nThe diners had consumed starters, a main course and 30 bottles of various alcoholic drinks, he said, adding that it was the first time in 35 years of worki

In [49]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. N is the subset size, so
# the [:N] slice inside run_feqa does not drop any summary.
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=len(test_subset)
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Original sums FEQA scores 0.2892756464769482
Corrected sums FEQA scores 0.24504024798606372
DIFF -0.04423539849088448


## Checkpoint 40k steps

In [None]:
# Run the train_v1 checkpoint saved at 40,000 total steps on the sampled test
# subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-0_totalsteps-40000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
# decode_corrected_summaries takes (dset, model, corrected_indices); the dataset
# argument was missing here, which raises TypeError on a fresh run.
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 895/895 [02:19<00:00,  6.44it/s]


Corrected: 872 (97.43%)


100%|██████████| 872/872 [00:30<00:00, 28.95it/s]


{'corrected': 'A judge in the US has denied bail to one people accused of beating a black man and forcing him to kiss the floor in a Facebook Live video.',
 'original': 'A judge in the US has denied bail to four people accused of beating a black man and forcing him to kiss the floor in a Facebook Live video.',
 'source': 'Jordan Hill, Brittany Covington and Tesfaye Cooper, all 18, and Tanishia Covington, 24, appeared in a Chicago court on Friday.\nThe four have been charged with hate crimes and aggravated kidnapping and battery, among other things.\nAn online fundraiser for their victim has collected $51,000 (Â£42,500) so far.\nDenying the four suspects bail, Judge Maria Kuriakos Ciesil asked: "Where was your sense of decency?"\nProsecutors told the court the beating started in a van and continued at a house, where the suspects allegedly forced the 18-year-old white victim, who suffers from schizophrenia and attention deficit disorder, to drink toilet water and kiss the floor.\nPolice 

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/872)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/872)...
Generating questions (batch 128/872)...
Generating questions (batch 192/872)...
Generating questions (batch 256/872)...
Generating questions (batch 320/872)...
Generating questions (batch 384/872)...
Generating questions (batch 448/872)...
Generating questions (batch 512/872)...
Generating questions (batch 576/872)...
Generating questions (batch 640/872)...
Generating questions (batch 704/872)...
Generating questions (batch 768/872)...
Generating questions (batch 832/872)...
Getting answers...
Computing metrics...
Generating questions...
Tokenizing summaries for q-gen...
Generating questions (batch 0/835)...
Generating questions (batch 64/835)...
Generating questions (batch 128/835)...
Generating questions (batch 192/835)...
Generating questions (batch 256/835)...
Generating questions (batch 320/835)...
Generating questions (batch 384/835)...
Generating questions (batch 448/835)...
Generating questions (batch 512/835)...
Generating questions (batc

## Checkpoint 60k steps

In [None]:
# Run the train_v1 checkpoint saved at 60,000 total steps on the sampled test
# subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-0_totalsteps-60000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
# decode_corrected_summaries takes (dset, model, corrected_indices); the dataset
# argument was missing here, which raises TypeError on a fresh run.
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

100%|██████████| 1000/1000 [02:33<00:00,  6.51it/s]


Corrected: 987 (98.70%)


100%|██████████| 987/987 [00:35<00:00, 27.63it/s]


{'corrected': 'World number one Magnus Norman was beaten in five sets by Stan Wawrinka in the French Open final.',
 'original': 'World number one Novak Djokovic was beaten in five sets by Stan Wawrinka in the French Open final.',
 'source': 'The Swiss eighth seed played magnificently in a 4-6 6-4 6-3 6-4 victory at Roland Garros.\nWawrinka, 30, brought an end to Djokovic\'s 28-match winning streak as he claimed his second Grand Slam title.\nMedia playback is not supported on this device\nDjokovic had been hoping to become the eighth man to complete the set of all four major titles.\nBut the Serb will have to wait at least another 12 months before attempting to match the full set of major titles collected by Andre Agassi, Don Budge, Roy Emerson, Roger Federer, Rod Laver, Rafael Nadal and Fred Perry.\nThe 28-year-old had been a strong favourite to finally get his hands on the Coupe des Mousquetaires after beating nine-time champion Rafael Nadal and third seed Andy Murray, but he was outp

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
1042301B [00:00, 7343195.28B/s]
456318B [00:00, 9604064.61B/s]


Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/855)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/855)...
Generating questions (batch 128/855)...
Generating questions (batch 192/855)...
Generating questions (batch 256/855)...
Generating questions (batch 320/855)...
Generating questions (batch 384/855)...
Generating questions (batch 448/855)...
Generating questions (batch 512/855)...
Generating questions (batch 576/855)...
Generating questions (batch 640/855)...
Generating questions (batch 704/855)...
Generating questions (batch 768/855)...
Generating questions (batch 832/855)...
Getting answers...
Computing metrics...
Generating questions...
Tokenizing summaries for q-gen...
Generating questions (batch 0/830)...
Generating questions (batch 64/830)...
Generating questions (batch 128/830)...
Generating questions (batch 192/830)...
Generating questions (batch 256/830)...
Generating questions (batch 320/830)...
Generating questions (batch 384/830)...
Generating questions (batch 448/830)...
Generating questions (batch 512/830)...
Generating questions (batc

In [None]:
# Same 60,000-step checkpoint, but inference is run with max_examples_per_doc=3.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-0_totalsteps-60000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset, max_examples_per_doc=3)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
# decode_corrected_summaries takes (dset, model, corrected_indices); the dataset
# argument was missing here, which raises TypeError on a fresh run.
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

## Checkpoint 100k steps (epoch 1)

In [60]:
# Run the train_v1 checkpoint saved at 100,000 total steps (epoch 1) on the
# sampled test subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-1_totalsteps-100000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 250/250 [00:39<00:00,  6.26it/s]


Corrected: 64 (25.60%)


100%|██████████| 64/64 [00:02<00:00, 27.99it/s]


{'corrected': 'When Josephine broke her nose on the London Underground, it was the beginning of the end for Oh Wonder, the songwriting duo who went on to become a global sensation.',
 'original': 'When Josephine Skousen broke her nose on the London Underground, it was the beginning of the end for Oh Wonder, the songwriting duo who went on to become a global sensation.',
 'source': 'It forced the band to remain anonymous for almost a year: No photographs, no interviews, no videos.\nBut there was an upside. Last year, as she caught a train home from Heathrow, she sat next to a passenger "covered in blood [with] no teeth, looking sorry for himself".\n"I tentatively went up to him and his girlfriend and said, \'I just wanted to let you know you\'ll be fine\'," recalls the 27-year-old.\n"\'Go to the dentist tomorrow, don\'t panic, you\'ll be great\'. And he was like, \'Oh, thank you so much!\'.\n"And then this guy opposite us piped up, \'I broke my nose, too!\'. And suddenly this whole litt

In [61]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. N is the subset size, so
# the [:N] slice inside run_feqa does not drop any summary.
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=len(test_subset)
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/179)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/179)...
Generating questions (batch 128/179)...
Getting answers...
Computing metrics...
Original sums FEQA scores 0.23779247539548914
Corrected sums FEQA scores 0.252162264758953
DIFF 0.014369789363463836


## Checkpoint 130k steps (epoch 1)

In [None]:
# Run the train_v1 checkpoint saved at 130,000 total steps (epoch 1) on the
# sampled test subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-1_totalsteps-130000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 1000/1000 [02:23<00:00,  6.97it/s]


Corrected: 976 (97.60%)


100%|██████████| 976/976 [00:33<00:00, 29.30it/s]


{'corrected': 'World number one Wawrinka was beaten in five sets by Stan Wawrinka in the French Open final.',
 'original': 'World number one Novak Djokovic was beaten in five sets by Stan Wawrinka in the French Open final.',
 'source': 'The Swiss eighth seed played magnificently in a 4-6 6-4 6-3 6-4 victory at Roland Garros.\nWawrinka, 30, brought an end to Djokovic\'s 28-match winning streak as he claimed his second Grand Slam title.\nMedia playback is not supported on this device\nDjokovic had been hoping to become the eighth man to complete the set of all four major titles.\nBut the Serb will have to wait at least another 12 months before attempting to match the full set of major titles collected by Andre Agassi, Don Budge, Roy Emerson, Roger Federer, Rod Laver, Rafael Nadal and Fred Perry.\nThe 28-year-old had been a strong favourite to finally get his hands on the Coupe des Mousquetaires after beating nine-time champion Rafael Nadal and third seed Andy Murray, but he was outplayed

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/353)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/353)...
Generating questions (batch 128/353)...
Generating questions (batch 192/353)...
Generating questions (batch 256/353)...
Generating questions (batch 320/353)...
Getting answers...
Computing metrics...
Original sums FEQA scores 0.24659687865571164
Corrected sums FEQA scores 0.17105397829380842
DIFF -0.07554290036190323


## Checkpoint 140k steps (epoch 1)

In [None]:
# Run the train_v1 checkpoint saved at 140,000 total steps (epoch 1) on the
# sampled test subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-1_totalsteps-140000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 1000/1000 [02:23<00:00,  6.96it/s]


Corrected: 349 (34.90%)


100%|██████████| 349/349 [00:11<00:00, 30.30it/s]


{'corrected': 'An inquest into the death of a woman who died in a house fire in Aberdare has been told carbon monoxide gas levels were "16 times the acceptable level".',
 'original': 'An inquest into the death of a woman who died in a house fire in Denbighshire has been told carbon monoxide gas levels were "16 times the acceptable level".',
 'source': 'The body of Kimberley Jones, 25, was found by a paramedic at the house in Cwmbach, Aberdare, on 9 August, 2013.\nMother-of-one Ms Jones was due to move into the house the following day, but was allowed to stay the night before.\nTests on the fire showed the flue was ineffective allowing smoke to leak into the room, the Aberdare inquest heard.\nDuring those tests carbon monoxide gas was at 16 times the acceptable level.\nMs Jones\' father, Andrew Jones, told the inquest he knew the owner of the house, Ms Linda Parfitt, as a family friend for 25 years and he arranged to buy the house from her to use as a home for his daughter.\nMs Parfitt 

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/77)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/77)...
Getting answers...
Computing metrics...
Generating questions...
Tokenizing summaries for q-gen...
Generating questions (batch 0/219)...
Generating questions (batch 64/219)...
Generating questions (batch 128/219)...
Generating questions (batch 192/219)...
Getting answers...
Computing metrics...
Original sums FEQA scores 0.2623776855556351
Corrected sums FEQA scores 0.2663767549918951
DIFF 0.003999069436259994


### with `max_examples_per_doc=3`


In [None]:
# Same 140,000-step checkpoint, but inference is run with
# max_examples_per_doc=3.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-1_totalsteps-140000"
)
checkpoint_predictions = checkpoint_model.batch_inference(
    test_subset,
    max_examples_per_doc=3
)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 1000/1000 [01:16<00:00, 13.06it/s]


Corrected: 254 (25.40%)


100%|██████████| 254/254 [00:08<00:00, 30.85it/s]


{'corrected': 'An inquest into the death of a woman who died in a house fire in Aberdare has been told carbon monoxide gas levels were "16 times the acceptable level".',
 'original': 'An inquest into the death of a woman who died in a house fire in Denbighshire has been told carbon monoxide gas levels were "16 times the acceptable level".',
 'source': 'The body of Kimberley Jones, 25, was found by a paramedic at the house in Cwmbach, Aberdare, on 9 August, 2013.\nMother-of-one Ms Jones was due to move into the house the following day, but was allowed to stay the night before.\nTests on the fire showed the flue was ineffective allowing smoke to leak into the room, the Aberdare inquest heard.\nDuring those tests carbon monoxide gas was at 16 times the acceptable level.\nMs Jones\' father, Andrew Jones, told the inquest he knew the owner of the house, Ms Linda Parfitt, as a family friend for 25 years and he arranged to buy the house from her to use as a home for his daughter.\nMs Parfitt 

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. Only the first 100
# corrected summaries are evaluated (run_feqa slices to [:N]).
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=100
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/650)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/650)...
Generating questions (batch 128/650)...
Generating questions (batch 192/650)...
Generating questions (batch 256/650)...
Generating questions (batch 320/650)...
Generating questions (batch 384/650)...
Generating questions (batch 448/650)...
Generating questions (batch 512/650)...
Generating questions (batch 576/650)...
Generating questions (batch 640/650)...
Getting answers...
Computing metrics...
Generating questions...
Tokenizing summaries for q-gen...
Generating questions (batch 0/832)...
Generating questions (batch 64/832)...
Generating questions (batch 128/832)...
Generating questions (batch 192/832)...
Generating questions (batch 256/832)...
Generating questions (batch 320/832)...
Generating questions (batch 384/832)...
Generating questions (batch 448/832)...
Generating questions (batch 512/832)...
Generating questions (batch 576/832)...
Generating questions (batch 640/832)...
Generating questions (batch 704/832)...
Generating questions (batc

## Checkpoint 205k steps (epoch 2)

In [56]:
# Run the train_v1 checkpoint saved at 205,000 total steps (epoch 2) on the
# sampled test subset and decode every summary it changed.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# results are stored for this checkpoint.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/epoch-2_totalsteps-205000"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

KeyboardInterrupt: ignored

In [None]:
import numpy as np
# Score the current `checkpoint_summaries` with FEQA. N is the subset size, so
# the [:N] slice inside run_feqa does not drop any summary.
orig_feqa_scores, corr_feqa_scores = run_feqa(
  checkpoint_summaries, N=len(test_subset)
)
print("Original sums FEQA scores", np.mean(orig_feqa_scores))
print("Corrected sums FEQA scores", np.mean(corr_feqa_scores))

# A positive DIFF means the corrected summaries scored higher on average.
print("DIFF", np.mean(corr_feqa_scores) - np.mean(orig_feqa_scores))

## Done training (~283k steps)

In [57]:
# Run the final train_v1 checkpoint (283,380 total steps, epoch 2) on the
# sampled test subset and decode every summary it changed.
checkpoint_model = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/final-epoch-2_totalsteps-283380"
)
checkpoint_predictions = checkpoint_model.batch_inference(test_subset)
checkpoint_corrected_indices = eval_get_corrected_indices(checkpoint_predictions)
checkpoint_summaries = decode_corrected_summaries(
    test_subset,
    checkpoint_model,
    checkpoint_corrected_indices
)
# Display the first corrected/original/source triple as a sanity check.
checkpoint_summaries[0]

100%|██████████| 250/250 [00:39<00:00,  6.25it/s]


Corrected: 75 (30.00%)


100%|██████████| 75/75 [00:02<00:00, 26.10it/s]


{'corrected': 'When Josephine broke her nose on the London Underground, it was the beginning of the end for Oh Wonder, the songwriting duo who went on to become a global sensation.',
 'original': 'When Josephine Skousen broke her nose on the London Underground, it was the beginning of the end for Oh Wonder, the songwriting duo who went on to become a global sensation.',
 'source': 'It forced the band to remain anonymous for almost a year: No photographs, no interviews, no videos.\nBut there was an upside. Last year, as she caught a train home from Heathrow, she sat next to a passenger "covered in blood [with] no teeth, looking sorry for himself".\n"I tentatively went up to him and his girlfriend and said, \'I just wanted to let you know you\'ll be fine\'," recalls the 27-year-old.\n"\'Go to the dentist tomorrow, don\'t panic, you\'ll be great\'. And he was like, \'Oh, thank you so much!\'.\n"And then this guy opposite us piped up, \'I broke my nose, too!\'. And suddenly this whole litt

In [58]:
import numpy as np

# Score the original and the corrected summaries with FEQA.
# run_feqa is defined elsewhere in this notebook; it yields one collection of
# scores for the originals and one for the corrections.
# NOTE(review): N is the size of test_subset here, while the "Final Eval" cell
# passes the length of the summaries list itself -- confirm which one
# run_feqa expects.
orig_feqa_scores, corr_feqa_scores = run_feqa(checkpoint_summaries, N=len(test_subset))

# Compute each mean once and reuse it for the report and the delta.
orig_feqa_mean = np.mean(orig_feqa_scores)
corr_feqa_mean = np.mean(corr_feqa_scores)

print("Original sums FEQA scores", orig_feqa_mean)
print("Corrected sums FEQA scores", corr_feqa_mean)

# Positive DIFF means the corrected summaries scored higher on average.
print("DIFF", corr_feqa_mean - orig_feqa_mean)

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/42)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Getting answers...
Computing metrics...
Generating questions...
Tokenizing summaries for q-gen...
Generating questions (batch 0/358)...
Generating questions (batch 64/358)...
Generating questions (batch 128/358)...
Generating questions (batch 192/358)...
Generating questions (batch 256/358)...
Generating questions (batch 320/358)...
Getting answers...
Computing metrics...
Original sums FEQA scores 0.2547294884640526
Corrected sums FEQA scores 0.2529357562174006
DIFF -0.001793732246651969


# Final Eval

In [62]:
# Final evaluation: same end-of-training checkpoint, but run over all of
# test_data instead of a subset.
final_model = CorrectionModel(model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/final-epoch-2_totalsteps-283380")

# Inference over the full test set, followed by selection of the corrected examples.
final_predictions = final_model.batch_inference(test_data)
final_corrected_indices = eval_get_corrected_indices(final_predictions)

# Decode the corrected examples back into text records.
final_summaries = decode_corrected_summaries(
    test_data, final_model, final_corrected_indices
)

# Display the first decoded record as the cell's output.
final_summaries[0]

100%|██████████| 6792/6792 [17:13<00:00,  6.57it/s]


Corrected: 1946 (28.65%)


100%|██████████| 1946/1946 [01:05<00:00, 29.80it/s]


{'corrected': "Gulls manager Kevin Nicholson says he will not receive any money from the sale of Kane O'Kane to Bournemouth.",
 'original': "Torquay United manager Kevin Nicholson says he will not receive any money from the sale of Kane O'Kane to Bournemouth.",
 'source': 'The National League sold the Republic of Ireland midfielder to the Cherries for £175,000 in 2012 and had a 15% sell-on clause included in the deal.\nO\'Kane moved for an undisclosed fee, but Nicholson says any money will go to help the cash-strapped club.\n"I don\'t think I\'ll be getting anything," Nicholson told BBC Devon.\n"There\'s more important things."\nThe Gulls are still looking for new owners having been taken over by a consortium of local business people last summer.\nThey were forced to close down the club\'s academy and drastically reduce the playing budget after millionaire former owner Thea Bristow left the club.'}

In [85]:
import numpy as np

# Score the full-test-set summaries with FEQA. run_feqa is defined elsewhere
# in this notebook; it yields one collection of scores for the originals and
# one for the corrections.
orig_feqa_scores, corr_feqa_scores = run_feqa(final_summaries, N=len(final_summaries))

# Compute each mean once and reuse it for the report and the delta.
orig_feqa_mean = np.mean(orig_feqa_scores)
corr_feqa_mean = np.mean(corr_feqa_scores)

print("Original sums FEQA scores", orig_feqa_mean)
print("Corrected sums FEQA scores", corr_feqa_mean)

# Positive DIFF means the corrected summaries scored higher on average.
print("DIFF", corr_feqa_mean - orig_feqa_mean)

Generating questions...
Tokenizing summaries for q-gen...


  'with `validate_args=False` to turn off validation.')


Generating questions (batch 0/7581)...


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


Generating questions (batch 64/7581)...
Generating questions (batch 128/7581)...
Generating questions (batch 192/7581)...
Generating questions (batch 256/7581)...
Generating questions (batch 320/7581)...
Generating questions (batch 384/7581)...
Generating questions (batch 448/7581)...
Generating questions (batch 512/7581)...
Generating questions (batch 576/7581)...
Generating questions (batch 640/7581)...
Generating questions (batch 704/7581)...
Generating questions (batch 768/7581)...
Generating questions (batch 832/7581)...
Generating questions (batch 896/7581)...
Generating questions (batch 960/7581)...
Generating questions (batch 1024/7581)...
Generating questions (batch 1088/7581)...
Generating questions (batch 1152/7581)...
Generating questions (batch 1216/7581)...
Generating questions (batch 1280/7581)...
Generating questions (batch 1344/7581)...
Generating questions (batch 1408/7581)...
Generating questions (batch 1472/7581)...
Generating questions (batch 1536/7581)...
Generati

In [77]:
import json

# Number of corrected summaries reported by the full-test-set run
# ("Corrected: 1946"). The guard below keeps a partial or different run
# (e.g. the max-examples-per-doc variant) from overwriting the saved file.
EXPECTED_FINAL_SUMMARY_COUNT = 1946
FINAL_SUMMARIES_PATH = "/content/drive/MyDrive/CS6741/replication/data/corrected/train_v1_final_280k.jsonl"

if len(final_summaries) == EXPECTED_FINAL_SUMMARY_COUNT:
  # One JSON object per line (JSONL).
  with open(FINAL_SUMMARIES_PATH, "w") as f:
    for summary in final_summaries:
      f.write(json.dumps(summary) + "\n")
  print("Persisted summaries")
else:
  # Previously the cell did nothing and printed nothing here, so a skipped
  # write looked the same as a cell that produced no output.
  print(
    "Skipped persisting summaries: expected "
    f"{EXPECTED_FINAL_SUMMARY_COUNT} records, found {len(final_summaries)}"
  )

Persisted summaries


### Max examples per doc: 3

In [83]:
# Repeat the final evaluation with at most 3 examples per document.
final_model_limit_samples = CorrectionModel(
  model_checkpoint="/content/drive/MyDrive/CS6741/replication/model_checkpoints/train_v1/final-epoch-2_totalsteps-283380"
)

# Use the model loaded in this cell for both inference and decoding.
# The original cell loaded final_model_limit_samples and then called
# final_model, so the new model was never used and the cell only worked if
# the "Final Eval" cell had already run. Both names load the same checkpoint
# path, so the model behind the call is unchanged.
final_predictions_limit_samples = final_model_limit_samples.batch_inference(
    test_data,
    max_examples_per_doc=3
)
final_corrected_indices_limit_samples = eval_get_corrected_indices(final_predictions_limit_samples)
final_summaries_limit_samples = decode_corrected_summaries(
    test_data,
    final_model_limit_samples,
    final_corrected_indices_limit_samples
)

# Display the first decoded record as the cell's output.
final_summaries_limit_samples[0]

Corrected: 1327 (19.54%)


100%|██████████| 1327/1327 [00:44<00:00, 29.79it/s]


{'corrected': "Britain's Ennis-Hill remains on course to qualify for the Rio Olympics after the second day of the Hypo-Meeting in Gotzis, Germany.",
 'original': "Britain's Jessica Ennis-Hill remains on course to qualify for the Rio Olympics after the second day of the Hypo-Meeting in Gotzis, Germany.",
 'source': "The Olympic champion, 29, was third overall at the end of a promising first day - traditionally her strongest - with a score of 3,928 points.\nOn Sunday she leapt a respectable 6.16m in the long jump but threw a disappointing 42.60m in the javelin.\nWith the 800m remaining, she has 5,544 points, still on course for the 6,200 needed to qualify for the Rio Olympics.\nEnnis-Hill is competing in her first heptathlon since winning gold at London 2012.\nA top-12 finish and score of 6,075 points would also secure qualification for this summer's World Championships.\nCanada's Commonwealth champion and world silver medallist Brianne Theisen-Eaton leads ahead of the final event, remar