<a href="https://colab.research.google.com/github/duanchi1230/NLP_Project_AI2_Reasoning_Challenge/blob/master/t5_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 4.7MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 20.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |██████████

In [0]:
from pathlib import Path
import csv
import random
import json

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import AdamW

In [0]:
# Directory the fine-tuned weights and config are written to / reloaded from.
output_dir = './out/'
# Passed as `cache_dir` when loading the pretrained model.
cache_dir = './cache/'
# Seed applied to Python's `random`, NumPy and PyTorch.
seed = 2020
# Name of the pretrained checkpoint used for both the tokenizer and the model.
t5_model = 't5-base'
# Weight decay for parameters that are not on the optimizer's no-decay list.
weight_decay = 0.01
# `eps` and `lr` arguments of the AdamW optimizer.
adam_epsilon = 1e-6
learning_rate = 2e-5
# Number of passes over the training set performed by one call to main().
num_train_epochs = 10
# Batch size for every split other than 'train', and for 'train' respectively.
eval_batch_size = 32
train_batch_size = 16
# `max_length` used (together with pad_to_max_length=True) when tokenizing
# both the input texts and the target texts.
max_seq_length = 128
# Run mode read by get_model() and main():
#   'train' / 'test_without_train' start from the pretrained checkpoint,
#   'test' / 'continue_train' reload the files saved under `output_dir`.
# main() only runs the training loop for 'train' and 'continue_train'.
mode = 'train'
# Offset added to the epoch index in the training log line only; it does not
# change how many epochs are run.
start_epoch = 1

In [0]:
# Create the output and cache directories. `parents=True` lets the configured
# paths be nested; `exist_ok=True` makes re-running this cell harmless.
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(cache_dir).mkdir(parents=True, exist_ok=True)

# Files the fine-tuned weights and the model config are saved to after
# training and reloaded from in the 'test' / 'continue_train' modes.
output_model_file = Path.cwd() / output_dir / "model.bin"
output_config_file = Path.cwd() / output_dir / 'config.bin'

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA instead of failing
# on the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator used by the notebook and ask cuDNN for
# deterministic kernels so that runs are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def get_tokenizer():
  """Return the T5 tokenizer for the checkpoint named by `t5_model`."""
  # The extra answer-choice tokens are intentionally left disabled:
  # tokenizer.add_tokens(['<choice1>', '<choice2>', '<choice3>', '<choice4>', '<choice5>'])
  return T5Tokenizer.from_pretrained(t5_model)


def get_model(tokenizer_len=None):
  """Build the T5 model for the current `mode` and move it to `device`.

  Args:
    tokenizer_len: optional vocabulary size. When given (and the model is
      created from the pretrained checkpoint) the token embeddings are
      resized to this length. Ignored when reloading a saved model.

  Returns:
    A `T5ForConditionalGeneration` instance placed on `device`.

  Raises:
    NotImplementedError: if `mode` is not one of 'train',
      'test_without_train', 'test' or 'continue_train'.
  """
  if mode in ('train', 'test_without_train'):
    # Start from the published pretrained weights.
    model = T5ForConditionalGeneration.from_pretrained(
        t5_model, cache_dir=cache_dir)
    if tokenizer_len is not None:
      model.resize_token_embeddings(tokenizer_len)
  elif mode in ('test', 'continue_train'):
    # Rebuild the architecture from the saved config, then restore weights.
    model = T5ForConditionalGeneration(
        T5Config.from_json_file(output_config_file))
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be restored on a different device.
    model.load_state_dict(
        torch.load(output_model_file, map_location=device))
  else:
    raise NotImplementedError(
        f'No such mode called {mode}, error raised from get_model.')
  return model.to(device)


def get_optimizer(model):
  """Create an AdamW optimizer with weight decay disabled for norms/biases.

  Parameters whose name contains one of the `no_decay` patterns are placed in
  a group with `weight_decay=0.0`; every other parameter uses the global
  `weight_decay`. Learning rate and epsilon come from `learning_rate` and
  `adam_epsilon`.

  Args:
    model: module whose `named_parameters()` are to be optimized.

  Returns:
    The configured `AdamW` optimizer.
  """
  # 'LayerNorm.*' is the BERT-style naming. The Hugging Face T5 implementation
  # names its layer norms `layer_norm` / `final_layer_norm`, so without the
  # 'layer_norm.weight' pattern those weights would silently receive weight
  # decay. The original patterns are kept for other architectures.
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.weight']

  # Single pass over the parameters, splitting them into the two groups.
  decay_params, no_decay_params = [], []
  for name, param in model.named_parameters():
    if any(pattern in name for pattern in no_decay):
      no_decay_params.append(param)
    else:
      decay_params.append(param)

  optimizer_grouped_parameters = [
      {'params': decay_params, 'weight_decay': weight_decay},
      {'params': no_decay_params, 'weight_decay': 0.0},
  ]
  return AdamW(optimizer_grouped_parameters,
               lr=learning_rate, eps=adam_epsilon)


def convert_string_to_ids(tokenizer, strings):
  """Tokenize a batch of strings into PyTorch tensors.

  Every string is encoded with `max_length=max_seq_length` and
  `pad_to_max_length=True`.

  Args:
    tokenizer: tokenizer exposing `batch_encode_plus`.
    strings: list of texts to encode.

  Returns:
    Tuple `(input_ids, attention_mask)` taken from the tokenizer output.
  """
  encode_options = dict(
      max_length=max_seq_length,
      pad_to_max_length=True,
      return_attention_masks=True,
      return_tensors='pt',
  )
  encoded = tokenizer.batch_encode_plus(strings, **encode_options)
  ids = encoded['input_ids']
  mask = encoded['attention_mask']
  return ids, mask


def load_dataset(train_test_dev, tokenizer):
  """Load one split from `data4T5/<split>.jsonl` and wrap it in a DataLoader.

  Each non-empty line of the file must be a JSON object with the keys
  'input_text' and 'target_text'. Both texts are tokenized with
  `convert_string_to_ids`; padded target positions are replaced by -100 so
  that they are ignored by the language-model loss.

  Args:
    train_test_dev: split name, used as the file stem. 'train' yields a
      shuffled loader with `train_batch_size`; any other value yields an
      unshuffled loader with `eval_batch_size`.
    tokenizer: tokenizer forwarded to `convert_string_to_ids`.

  Returns:
    A `DataLoader` over `(input_ids, input_attention_mask, target_ids)`.
  """
  input_texts, target_texts = [], []
  path = Path('data4T5/') / (train_test_dev + '.jsonl')
  # Read as UTF-8 explicitly so the result does not depend on the platform's
  # default encoding.
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      d = json.loads(line)
      input_texts.append(d['input_text'])
      target_texts.append(d['target_text'])

  input_ids, input_attention_mask = convert_string_to_ids(
      tokenizer, input_texts)
  target_ids, target_attention_mask = convert_string_to_ids(
      tokenizer, target_texts)

  # Turn target_ids into lm_labels: padded positions become -100 (ignored by
  # the loss). Writing the value directly, instead of subtracting 100 from the
  # padded ids, keeps this correct whatever the tokenizer's pad id is.
  target_ids = target_ids.masked_fill(target_attention_mask == 0, -100)

  dataset = TensorDataset(input_ids, input_attention_mask, target_ids)
  if train_test_dev == 'train':
    return DataLoader(
        dataset, shuffle=True, batch_size=train_batch_size, pin_memory=True, num_workers=4)
  else:
    return DataLoader(
        dataset, shuffle=False, batch_size=eval_batch_size, num_workers=4)


# def save_results_to_tsv(train_or_test, all_doc_id, all_inputs, all_labels, all_preds):
#   label_map_invert = {value: key for key, value in label_map.items()}
#   with open(Path.cwd() / output_dir / (train_or_test + '_result.tsv'), 'w') as file:
#     writer = csv.writer(file, delimiter='\t')
#     writer.writerow(['DocID', 'Example', 'Ground_truth', 'Prediction'])

#     for doc_id, inputs, labels, preds in zip(all_doc_id, all_inputs, all_labels, all_preds):
#       writer.writerow(
#           [doc_id, inputs, label_map_invert[labels], label_map_invert[preds]])

def compare_output(target, output):
  """Count how many generated answers match the reference answers.

  The comparison looks at a single token per example: column 1 of `target`
  against column 2 of `output`.
  NOTE(review): the one-column offset presumably accounts for the start token
  that generation prepends — confirm against the data format.

  Args:
    target: 2-D tensor of reference token ids, one row per example.
    output: 2-D tensor of generated token ids, one row per example.

  Returns:
    Tuple `(n_total, n_correct)`: number of examples in the batch and number
    of examples whose compared tokens are equal.
  """
  n_total = len(target)

  # A batch whose sequences are too short to contain the compared column
  # (e.g. generation ended early for every row) has no answer token to score.
  # Count it as all-wrong instead of raising IndexError mid-evaluation.
  if target.size(1) < 2 or output.size(1) < 3:
    return n_total, 0

  n_correct = torch.eq(target[:, 1], output[:, 2]).sum().item()
  return n_total, n_correct

def _evaluate(model, dataloader, compute_loss=True):
  """Score `model` on every batch of `dataloader`.

  The caller is expected to have put the model in eval mode and to run this
  under `torch.no_grad()`.

  Args:
    model: model exposing the forward call (with `lm_labels`) and `generate`.
    dataloader: yields `(input_ids, attention_mask, target_ids)` batches.
    compute_loss: when False the forward pass is skipped and the returned
      loss is 0.

  Returns:
    Tuple `(mean_loss, accuracy)`: loss averaged over batches, and the
    fraction of examples `compare_output` counts as correct.
  """
  loss_sum, n_totals, n_corrects = 0, 0, 0
  for input_ids, attention_mask, target_ids in dataloader:
    # Move the inputs once and reuse them for both the loss and generation.
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    if compute_loss:
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      lm_labels=target_ids.to(device))
      loss_sum += outputs[0].item()

    generated = model.generate(input_ids, attention_mask=attention_mask).cpu()
    n_total, n_correct = compare_output(target_ids, generated)
    n_totals += n_total
    n_corrects += n_correct
  return loss_sum / len(dataloader), n_corrects / n_totals


def main():
  """Fine-tune the model and save it (for the training modes).

  Loads the tokenizer and the model for the current `mode`. When `mode` is
  'train' or 'continue_train', runs `num_train_epochs` epochs; after each
  epoch it prints the mean training loss, the dev loss and dev accuracy, then
  the training-set accuracy. Finally the weights and the config are written
  to `output_model_file` and `output_config_file`. For any other mode only
  the tokenizer and model are loaded.
  """
  # Get tokenizer
  tokenizer = get_tokenizer()

  # Create model
  model = get_model()

  # Training phase
  if mode == 'train' or mode == 'continue_train':
    # Prepare train and dev sets
    train_dataloader = load_dataset('train', tokenizer)
    dev_dataloader = load_dataset('dev', tokenizer)

    # Prepare optimizer
    optimizer = get_optimizer(model)

    # Start training
    print('Start training...')

    for epoch in range(num_train_epochs):
      # Train for one epoch
      model.train()
      train_loss = 0

      for input_ids, attention_mask, output_ids in train_dataloader:
        outputs = model(input_ids=input_ids.to(device),
                        attention_mask=attention_mask.to(device),
                        lm_labels=output_ids.to(device))

        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()

      # Evaluate on the dev set, then measure accuracy on the training set
      model.eval()
      with torch.no_grad():
        eval_loss, dev_accuracy = _evaluate(model, dev_dataloader)
        print(f'Train epoch {start_epoch + epoch} loss: {(train_loss / len(train_dataloader)):.3f}, val loss: {eval_loss:.3f}, accuracy: {dev_accuracy:.3f}')

        _, train_accuracy = _evaluate(
            model, train_dataloader, compute_loss=False)
        print(f'train accuracy: {train_accuracy:.3f}')

    # save final model (unwrap a parallel wrapper if there is one)
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(),  output_model_file)
    # save config file
    model_to_save.config.to_json_file(output_config_file)

In [0]:
main()

RuntimeError: ignored

In [0]:
main()

NameError: ignored

In [0]:
main()

Start training...
Train epoch 31 loss: 0.609, val loss: 0.686, accuracy: 0.371
train accuracy: 0.533
Train epoch 32 loss: 0.607, val loss: 0.693, accuracy: 0.374
train accuracy: 0.545
Train epoch 33 loss: 0.592, val loss: 0.690, accuracy: 0.379
train accuracy: 0.551
Train epoch 34 loss: 0.579, val loss: 0.693, accuracy: 0.360
train accuracy: 0.565
Train epoch 35 loss: 0.572, val loss: 0.708, accuracy: 0.380
train accuracy: 0.577
Train epoch 36 loss: 0.565, val loss: 0.697, accuracy: 0.378
train accuracy: 0.589
Train epoch 37 loss: 0.554, val loss: 0.701, accuracy: 0.379
train accuracy: 0.596
Train epoch 38 loss: 0.545, val loss: 0.711, accuracy: 0.385
train accuracy: 0.611
Train epoch 39 loss: 0.542, val loss: 0.726, accuracy: 0.385
train accuracy: 0.614
Train epoch 40 loss: 0.534, val loss: 0.705, accuracy: 0.384
train accuracy: 0.628


In [0]:
main()

Start training...
Train epoch 21 loss: 0.519, val loss: 0.716, accuracy: 0.394
train accuracy: 0.640
Train epoch 22 loss: 0.515, val loss: 0.741, accuracy: 0.393
train accuracy: 0.647
Train epoch 23 loss: 0.500, val loss: 0.727, accuracy: 0.399
train accuracy: 0.653
Train epoch 24 loss: 0.485, val loss: 0.739, accuracy: 0.386
train accuracy: 0.665
Train epoch 25 loss: 0.475, val loss: 0.773, accuracy: 0.389
train accuracy: 0.668
Train epoch 26 loss: 0.470, val loss: 0.749, accuracy: 0.388
train accuracy: 0.678
Train epoch 27 loss: 0.459, val loss: 0.751, accuracy: 0.389
train accuracy: 0.688
Train epoch 28 loss: 0.449, val loss: 0.780, accuracy: 0.392
train accuracy: 0.695
Train epoch 29 loss: 0.447, val loss: 0.793, accuracy: 0.390
train accuracy: 0.697
Train epoch 30 loss: 0.440, val loss: 0.768, accuracy: 0.399
train accuracy: 0.708


In [0]:
main()

RuntimeError: ignored