### Offline/Google COLAB Training script for Bart-Base Predicate Model.

All cells should run, however we first need to point to the dataset that is saved in the
the cs287_project folder in Google Drive.

(Uncomment linking to Google Drive Folder if running on COLAB)

In [1]:
# !pip install torch
# !pip install transformers datasets rouge_score
import logging
import os
import re
import sys
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd
import nltk
import torch
import transformers
from datasets import load_dataset, load_metric, load_from_disk
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBartTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import is_main_process
from datasets import load_dataset

transformers.logging.set_verbosity_info()

DATA_PATH = "data/"

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd "/content/drive/My Drive/cs287_project/"

In [3]:
!nvidia-smi

Wed Dec  8 04:07:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 496.74       Driver Version: 496.74       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0 Off |                  N/A |
|  0%   41C    P8     2W / 400W |    180MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ... WDDM  | 00000000:04:00.0  On |                  N/A |
|  0%   26C    P8    21W / 320W |    945MiB / 10240MiB |     11%      Default |
|       

In [4]:
"""
Load model
"""

MODEL_NAME = "facebook/bart-base"
max_target_length = 128
max_source_length = 1024
padding = False

config = AutoConfig.from_pretrained(
    MODEL_NAME, cache_dir=None, revision="main", use_auth_token=False,
    max_position_embeddings = max_source_length

)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, cache_dir=None, use_fast=True, revision="main", use_auth_token=False,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    config=None,
    cache_dir=None,
    revision="main",
    use_auth_token=False,
)

# Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
    model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
if model.config.decoder_start_token_id is None:
    raise ValueError


loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at C:\Users\Geoff/.cache\huggingface\transformers\f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.da0f3c0e2dc1c2fecc46738a1ebf4806f2fc36aae3d5c1947f21e063e7cab34b
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointin

In [5]:
# Metric
metric = load_metric("rouge")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if data_args.ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [30]:
from datasets import load_dataset, load_metric, load_from_disk
dataset = load_from_disk("data/xsum_corrupted_predicate_untokenized/train")
rm_so_filter = np.array(dataset['corrupted_flag']) != 's_o'
dataset = dataset.select(list(rm_so_filter))
dataset = dataset.train_test_split(train_size=0.95, seed=0)
xsum_train = dataset['train']
xsum_eval = dataset['test']

In [31]:
"""
Load Data
"""

# xsum_train = load_dataset("xsum", split="train")
# xsum_eval = load_dataset("xsum", split="validation")

column_names = xsum_train.column_names
text_column, corrupted_column, target_column = "document", "corrupted_summary", "summary"
# max_source_length = 1024

sep_token = '</s>'
def preprocess_function(examples):

    inputs = examples[text_column]
    corruptions = examples[corrupted_column]
    targets = examples[target_column]

    # Tokenize Input
    model_inputs = tokenizer(
        text = inputs, text_pair=corruptions, max_length=max_source_length, 
        padding=padding, truncation='only_first', add_special_tokens=True
    )
    

    # # Remove EOS tag
    # # model_inputs['input_ids'] =  model_inputs['input_ids'][:-1][:max_source_length-len(model_inputs_summary)] + model_inputs_summary
    # model_inputs['input_ids'] =  model_inputs['input_ids'][:max_source_length-len(model_inputs_summary)] + model_inputs_summary
    # model_inputs['attention_mask'] = [1]*len(model_inputs['input_ids'])
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    
    """
    Uncomment following strings to check if we've encoded properly
    """
    # model_inputs['input_string'] = inputs
    # model_inputs['target_string'] = targets

    return model_inputs

train_dataset = xsum_train
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=None,
    remove_columns=column_names,
    load_from_cache_file=False,
)

eval_dataset = xsum_eval
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=None,
    remove_columns=column_names,
    load_from_cache_file=False,
)

  0%|          | 0/129 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [32]:
from transformers import Seq2SeqTrainingArguments

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id)

NUM_GPU = 1
check_val = True

args = Seq2SeqTrainingArguments(
    output_dir="exp/bart_corrupted_predicate_no_so/results",
    do_train=True,
    do_eval=check_val,
    evaluation_strategy="steps" if check_val else 'no',
    # eval_steps=100000000,
    logging_dir="exp/bart_corrupted_predicate_no_so/logs",
    num_train_epochs=5,
    logging_steps=1000,
    eval_steps = 5000 if check_val else None,
    # max_steps=int(10000 * 8 / NUM_GPU),
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=None,
    lr_scheduler_type='polynomial',
    learning_rate=5e-05,
    warmup_steps=2000,
    save_steps=20000,
    generation_max_length=64,
)


# Initialize our Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics if check_val == True else None
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
trainer.args._n_gpu = 1

In [34]:
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 128962
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 161205


Step,Training Loss,Validation Loss
5000,0.0005,0.0
10000,0.0022,1e-06
15000,0.0029,0.0
20000,0.0016,0.0
25000,0.0,0.0
30000,0.0,0.006628
35000,0.0001,0.014408
40000,0.0001,0.010345
45000,0.0,0.034755
50000,0.0,0.034669


***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
Saving model checkpoint to exp/bart_corrupted_predicate_no_so/results\checkpoint-20000
Configuration saved in exp/bart_corrupted_predicate_no_so/results\checkpoint-20000\config.json
Model weights saved in exp/bart_corrupted_predicate_no_so/results\checkpoint-20000\pytorch_model.bin
tokenizer config file saved in exp/bart_corrupted_predicate_no_so/results\checkpoint-20000\tokenizer_config.json
Special tokens file saved in exp/bart_corrupted_predicate_no_so/results\checkpoint-20000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
***** Running Evaluation *****
  Num examples = 6788
  Batch size = 4
*

In [None]:
# from accelerate import Accelerator

# accelerator = Accelerator()
# device = accelerator.device

# model = torch.nn.Transformer().to(device)
# optimizer = torch.optim.Adam(model.parameters())

# # dataset = load_from_disk("data/xsum_corrupted_predicate_untokenized/train")
# data = torch.utils.data.DataLoader(eval_dataset, shuffle=True)

# model, optimizer, data = accelerator.prepare(model, optimizer, data)

# model.train()
# for epoch in range(10):
#     for _, source, targets in data:
#         print(source)
#         source = source.to(device)
#         targets = targets.to(device)

#         optimizer.zero_grad()

#         output = model(source)
#         loss = F.cross_entropy(output, targets)

#         loss.backward()
#         accelerator.backward(loss)

#     optimizer.step()