# WNC Full Dataset Evaluation

In [1]:
# Back-of-the-envelope training-run sizing used by the next cell.
train_samples = 154_197  # presumably the row count of the WNC train split -- confirm
epochs = 10
batch_size = 8

In [3]:
# Batches per epoch times number of epochs. The result is fractional because
# train_samples is not an exact multiple of batch_size.
(train_samples / batch_size) * epochs

192746.25

## Load Fine-Tuned Model

In [1]:
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_from_disk, load_metric

%load_ext lab_black
%load_ext tensorboard

In [2]:
# Locations of the processed dataset and the fine-tuned checkpoint. The defaults
# are the original workspace paths; set WNC_DATASETS_PATH / WNC_MODEL_PATH to
# run the notebook from a different machine without editing this cell.
DATASETS_PATH = os.environ.get(
    "WNC_DATASETS_PATH", "/home/cdsw/data/processed/WNC_full"
)
MODEL_PATH = os.environ.get("WNC_MODEL_PATH", "/home/cdsw/models/bart-tst-full")

# The tokenizer and the model are loaded from the same checkpoint directory.
wnc_datasets = load_from_disk(DATASETS_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

## Evaluation Walkthrough

In [3]:
# First 10 rows of the test split. Slicing returns a dict of column name -> list
# of values, as the displayed output shows.
examples = wnc_datasets["test"][:10]
examples

{'rev_id': ['582489816',
  '660141066',
  '789327615',
  '375171903',
  '286179975',
  '229377529',
  '202866053',
  '818668820',
  '121439225',
  '12637366'],
 'source_text': ['allegations of apocrypha by opponents of the ppaca',
  'mcgowan had an excellent rookie year and was named golf digest / rolex rookie-of-the-year in 1978.',
  'hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their amazing talents.',
  'in the north-east of aunis there is a huge forest of hardwood trees, the forest of benon, which fortunately has been protected because it is unique to the region.',
  'on march 8, 2009, the impeccable, while monitoring submarine activity 75\xa0miles south of hainan, china, was harassed by several chinese naval ships.',
  'the new term is designed to be neutral, to avoid the social stigma associated with the conflation of "manic" and "depression."',
  'the news-press was sold by the new york times company in

In [6]:
# Hand-written single-row batch for ad-hoc experiments. Every column is a
# one-element list so the dict has the same column -> list layout as a slice of
# the dataset. "target_text" is only a placeholder: preprocess_function reads
# that key, so it must be present.
testing = {
    "rev_id": ["123"],
    "source_text": ["Sir Alex Ferguson is the greatest football manager of all time."],
    "target_text": ["yada"],
}

In [7]:
# Token-length caps: max_source_length / max_target_length bound tokenization in
# preprocess_function, and max_target_length also bounds generation in
# generate_text.
# NOTE(review): 1024 presumably matches the model's maximum input length -- confirm.
max_source_length = 1024
max_target_length = 1024


def preprocess_function(examples: dict):
    """Tokenize a batch of source/target sentences.

    Args:
        examples: Batch dict with a ``"source_text"`` and a ``"target_text"``
            entry; both are handed to the tokenizer as-is.

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under an extra ``"labels"`` key. Both sides are tokenized with
        ``padding=True``, ``truncation=True`` and ``return_tensors="pt"``,
        capped at ``max_source_length`` / ``max_target_length`` respectively.
    """
    sources = examples["source_text"]
    references = examples["target_text"]

    # Options shared by the source and target tokenizer calls.
    common_kwargs = dict(padding=True, truncation=True, return_tensors="pt")

    model_inputs = tokenizer(sources, max_length=max_source_length, **common_kwargs)

    # Targets are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            references, max_length=max_target_length, **common_kwargs
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [8]:
# Tokenize the single hand-written example. Swap in the commented-out line to
# tokenize the 10-row test slice instead.
# model_inputs = preprocess_function(examples)
model_inputs = preprocess_function(testing)

In [9]:
# Sanity check on the tokenized batch: one row for the single sentence in `testing`.
model_inputs["input_ids"].shape

torch.Size([1, 14])

### Generate Text

In [10]:
def generate_text(examples: dict):
    """Generate model rewrites for a batch of source sentences.

    Args:
        examples: Batch dict forwarded to ``preprocess_function``, which reads
            both ``"source_text"`` and ``"target_text"``. The tokenized targets
            are not used for generation, but the key must be present.

    Returns:
        list[str]: One decoded sequence per input row, with special tokens
        stripped.
    """
    model_inputs = preprocess_function(examples)

    # Run generation on whatever device the model lives on.
    input_ids = model_inputs["input_ids"].to(model.device)
    # The batch is padded to a common length, so pass the attention mask
    # explicitly instead of relying on generate() to work out which positions
    # are padding.
    attention_mask = model_inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_target_length,
        min_length=4,
        length_penalty=2,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [20]:
# Model rewrites for the 10-row test slice; compared against the references below.
preds = generate_text(examples)
preds

['opposition to apocrypha by opponents of the ppaca',
 'mcgowan had a good rookie year and was named golf digest / rolex rookie-of-the-year in 1978.',
 'hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their talents.',
 'in the north-east of aunis there is a huge forest of hardwood trees, the forest of benon, which has been protected because it is unique to the region.',
 'on march 8, 2009, the impeccable, while monitoring submarine activity 75\xa0miles south of hainan, china, was attacked by several chinese naval ships.',
 'the new term is designed to avoid the social stigma associated with the conflation of "manic" and "depression."',
 'the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident.',
 'the area in which the palestine pound circulated was divided into several political entities: the state of israel, the hashemite kingdom of trans

In [11]:
# Kept under its own name so the 10-row `preds` used by the comparison loop
# further down is not overwritten when the notebook is run top to bottom.
testing_preds = generate_text(testing)
testing_preds

['Sir Alex Ferguson is one of the greatest football managers of all time.']

In [21]:
# Reference (target) sentences for the same 10 rows.
labels = examples["target_text"]
labels

['allegations by opponents of the ppaca',
 'mcgowan was named golf digest / rolex rookie-of-the-year in 1978.',
 'hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their talents.',
 'in the north-east of aunis there is a huge forest of hardwood trees, the forest of benon, which has been protected because it is unique to the region.',
 'on march 8, 2009, the impeccable, while monitoring submarine activity 75\xa0miles south of hainan, china, was engaged by several chinese naval ships.',
 'the new term is designed to avoid the social stigma associated with the conflation of "manic" and "depression."',
 'the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident and outspoken environmentalist .',
 'the area in which the palestine pound circulated was divided into several political entities: the state of israel, the hashemite kingdom of transjordan, 

In [25]:
# Original (source) sentences for the same 10 rows.
inputs = examples["source_text"]
inputs

['allegations of apocrypha by opponents of the ppaca',
 'mcgowan had an excellent rookie year and was named golf digest / rolex rookie-of-the-year in 1978.',
 'hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their amazing talents.',
 'in the north-east of aunis there is a huge forest of hardwood trees, the forest of benon, which fortunately has been protected because it is unique to the region.',
 'on march 8, 2009, the impeccable, while monitoring submarine activity 75\xa0miles south of hainan, china, was harassed by several chinese naval ships.',
 'the new term is designed to be neutral, to avoid the social stigma associated with the conflation of "manic" and "depression."',
 'the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident and a biased and eerily thick-headed woman .',
 'the area in which the palestine pound circulated was divide

In [28]:
# Print source, reference and model output for each row, one blank line after
# every field and a dashed rule between rows.
for i, (inp, pred, label) in enumerate(zip(inputs, preds, labels)):
    print(f"{i} \n")
    print(f"INPUT: {inp}\n")
    print(f"TRUTH: {label}\n")
    print(f"PREDI: {pred}\n")
    print("-------------------------------------------------")

0 

INPUT: allegations of apocrypha by opponents of the ppaca

TRUTH: allegations by opponents of the ppaca

PREDI: opposition to apocrypha by opponents of the ppaca

-------------------------------------------------
1 

INPUT: mcgowan had an excellent rookie year and was named golf digest / rolex rookie-of-the-year in 1978.

TRUTH: mcgowan was named golf digest / rolex rookie-of-the-year in 1978.

PREDI: mcgowan had a good rookie year and was named golf digest / rolex rookie-of-the-year in 1978.

-------------------------------------------------
2 

INPUT: hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their amazing talents.

TRUTH: hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their talents.

PREDI: hcis also organizes a yearly talent competition called the innofest where students in hcis get an opportunity to showcase their talents.



**Interesting Ones**: 1, 3, 6

### Open Questions:
1. Why does eval loss increase while eval accuracy/bleu also increase?
    - "In typical text generation settings, there exists a discrepancy between the training objective and evaluation criteria." 
    - [this paper](https://sailinglab.github.io/pgm-spring-2019/assets/project/final-reports/project3.pdf) introduces DEBLEU, a differentiable version of BLEU, as a loss function specifically for TST
2. What does performance look like across cohorts (length_delta, generation length)??   
3. [This paper] (TODO: add link) analyzes and discusses various TST evaluation metrics and how they correlate with human-level evaluation.