<a href="https://colab.research.google.com/github/h5ng/GNN/blob/master/cmod_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install fairseq==0.9 transformers==2.9

Collecting fairseq==0.9
[?25l  Downloading https://files.pythonhosted.org/packages/67/bf/de299e082e7af010d35162cb9a185dc6c17db71624590f2f379aeb2519ff/fairseq-0.9.0.tar.gz (306kB)
[K     |████████████████████████████████| 307kB 10.4MB/s 
[?25hCollecting transformers==2.9
[?25l  Downloading https://files.pythonhosted.org/packages/cd/38/c9527aa055241c66c4d785381eaf6f80a28c224cae97daa1f8b183b5fabb/transformers-2.9.0-py3-none-any.whl (635kB)
[K     |████████████████████████████████| 645kB 22.0MB/s 
Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |████████████████████████████████| 61kB 5.3MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/ea/59/bb06dd5ca53547d523422d32735585493e0103c992a52a97ba3aa3be33bf/tokenizers-0.7.0-cp37-cp37m-manylinux1_x86_64.whl (5.6MB)
[K     |████████████████████████████

In [3]:
import csv
import os
import logging
import argparse
import random
from tqdm import tqdm, trange
import json

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers.tokenization_bert import BertTokenizer
from transformers.modeling_bert import BertForMaskedLM, BertOnlyMLMHead

from transformers import AdamW

# Colab-only: mount Google Drive at /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Add the project's bert_aug directory on Drive to the import path; the
# data_processors module imported below is presumably located there.
import sys
sys.path.append('/content/drive/MyDrive/transformers-data-augmentation/bert_aug')
from data_processors import get_task_processor

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Configure root logging: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

# Name of the pretrained checkpoint loaded by the tokenizer and the model.
BERT_MODEL = 'bert-base-uncased'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
from argparse import Namespace

# The parser is created but no arguments are registered on it in this cell;
# the settings below are supplied directly instead of being parsed.
parser = argparse.ArgumentParser()

# Experiment settings exposed as attributes (args.data_dir, args.seed, ...).
# The training cell reads data_dir, output_dir, task_name, max_seq_length,
# cache, train_batch_size, learning_rate, num_train_epochs and seed; the
# remaining fields are not read by the cells visible in this notebook section.
args = Namespace(
    data_dir='/content/drive/MyDrive/transformers-data-augmentation/datasets/TREC',
    output_dir='aug_data',
    task_name='trec',
    max_seq_length=64,
    cache='transformers_cache',
    train_batch_size=8,
    learning_rate=4e-5,
    num_train_epochs=10.0,
    warmup_proportion=0.1,
    seed=42,
    sample_num=1,
    sample_ratio=7,
    gpu=0,
    temp=1.0,
)

In [40]:
class InputFeatures:
    """Holds the encoded fields produced for one example.

    The constructor stores its four arguments unchanged as the attributes
    ``init_ids``, ``input_ids``, ``input_mask`` and ``masked_lm_labels``.
    """

    def __init__(self, init_ids, input_ids, input_mask, masked_lm_labels):
        (self.init_ids,
         self.input_ids,
         self.input_mask,
         self.masked_lm_labels) = (init_ids, input_ids, input_mask, masked_lm_labels)

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, seed=12345):
    """Encode examples as masked-LM features with the class label prepended.

    For each example the text ``label + " " + text_a`` is tokenized, wrapped in
    "[CLS]" / "[SEP]", and a random subset of positions is selected for
    prediction (about 15% of the tokens, at least 1 and at most 20).  A
    selected position is replaced by "[MASK]" 80% of the time, left unchanged
    10% of the time, and otherwise replaced by a token drawn from another
    candidate position of the same sequence.

    Args:
        examples: iterable of objects exposing ``label``, ``text_a`` and ``guid``.
        label_list: label strings; a token at position 1 that is in this list
            is never selected for masking.
        max_seq_length: length every returned id/mask/label list is padded to.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        seed: seed for the private random generator that drives the masking.

    Returns:
        A list with one ``InputFeatures`` per example.  ``masked_lm_labels``
        holds -100 everywhere except at the selected positions, where it holds
        the id of the original token.
    """
    masked_lm_prob = 0.15
    max_predictions_per_seq = 20
    rng = random.Random(seed)
    # Three positions are held back from the tokenized text ([CLS], [SEP] and,
    # per the original note, the label).
    token_budget = max_seq_length - 3

    features = []
    for ex_index, example in enumerate(examples):
        # The label is prepended to the text before tokenization.
        tokens_a = tokenizer.tokenize(example.label + " " + example.text_a)
        if len(tokens_a) > token_budget:
            tokens_a = tokens_a[0:token_budget]

        tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
        masked_lm_labels = [-100] * max_seq_length

        # Positions eligible for masking: everything except the special tokens
        # and the prepended label at position 1.
        cand_indexes = [
            pos for pos, tok in enumerate(tokens)
            if not (tok == "[CLS]" or tok == "[SEP]" or (tok in label_list and pos == 1))
        ]
        rng.shuffle(cand_indexes)
        len_cand = len(cand_indexes)

        num_to_predict = min(max_predictions_per_seq,
                             max(1, int(round(len(tokens) * masked_lm_prob))))

        output_tokens = list(tokens)
        # Candidate positions are unique, so the first `num_to_predict` of the
        # shuffled list are exactly the positions that get selected.
        for index in cand_indexes[:num_to_predict]:
            if rng.random() < 0.8:
                # 80% of the time, replace with [MASK]
                replacement = "[MASK]"
            elif rng.random() < 0.5:
                # 10% of the time, keep original
                replacement = tokens[index]
            else:
                # 10% of the time, replace with a token from a random candidate position
                replacement = tokens[cand_indexes[rng.randint(0, len_cand - 1)]]

            masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0]
            output_tokens[index] = replacement

        init_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids = tokenizer.convert_tokens_to_ids(output_tokens)

        # 1 marks a real token, 0 marks padding.
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to the sequence length.
        pad_len = max_seq_length - len(input_ids)
        if pad_len > 0:
            padding = [0] * pad_len
            init_ids.extend(padding)
            input_ids.extend(padding)
            input_mask.extend(padding)

        assert len(init_ids) == max_seq_length
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        # Log the first two examples for a quick visual check.
        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(map(str, tokens)))
            logger.info("init_ids: %s" % " ".join(map(str, init_ids)))
            logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
            logger.info("input_mask: %s" % " ".join(map(str, input_mask)))
            logger.info("masked_lm_labels: %s" % " ".join(map(str, masked_lm_labels)))

        features.append(
            InputFeatures(init_ids=init_ids,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          masked_lm_labels=masked_lm_labels))
    return features

def prepare_data(features):
    """Stack per-example feature lists into a ``TensorDataset``.

    Args:
        features: sequence of objects exposing ``init_ids``, ``input_ids``,
            ``input_mask`` and ``masked_lm_labels``.

    Returns:
        A ``TensorDataset`` of four ``torch.long`` tensors, in the order
        (init_ids, input_ids, input_mask, masked_lm_labels).
    """
    field_order = ("init_ids", "input_ids", "input_mask", "masked_lm_labels")
    stacked = []
    for field in field_order:
        rows = [getattr(feature, field) for feature in features]
        stacked.append(torch.tensor(rows, dtype=torch.long))
    return TensorDataset(*stacked)


In [53]:
def train_cmodbert_and_augment(args, example_index):
  """Fine-tune a label-conditioned BERT masked LM and keep the best checkpoint.

  Loads the train/dev examples for ``args.task_name`` from ``args.data_dir``,
  adds the task labels to the tokenizer vocabulary, fine-tunes
  ``BertForMaskedLM`` on the masked training features, and after every epoch
  saves the model's ``state_dict`` to ``<args.output_dir>/best_cmodbert.pt``
  whenever the dev loss improves.  Despite the name, no augmentation step is
  performed in this function.

  Args:
    args: namespace providing ``task_name``, ``output_dir``, ``seed``,
      ``data_dir``, ``cache``, ``max_seq_length``, ``train_batch_size``,
      ``num_train_epochs`` and ``learning_rate``.
    example_index: index of the training example whose fields are printed
      before training starts.

  Returns:
    None.
  """
  task_name = args.task_name
  os.makedirs(args.output_dir, exist_ok=True)

  # Seed every random source used here.
  random.seed(args.seed)
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)

  processor = get_task_processor(task_name, args.data_dir)
  label_list = processor.get_labels(task_name)

  # load train and dev data
  train_examples = processor.get_train_examples()
  dev_examples = processor.get_dev_examples()

  # Show one training example as a sanity check.
  print(train_examples[example_index].guid)
  print(train_examples[example_index].text_a)
  print(train_examples[example_index].text_b)
  print(train_examples[example_index].label)

  tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                            do_lower_case=True,
                                            cache_dir=args.cache)
  model = BertForMaskedLM.from_pretrained(BERT_MODEL,
                                          cache_dir=args.cache)

  # Original author's note (translated): "this part looks a bit suspicious".
  tokenizer.add_tokens(label_list)
  model.resize_token_embeddings(len(tokenizer))
  # NOTE(review): this swaps in a freshly constructed MLM head built from the
  # model config, presumably so it matches the resized vocabulary -- confirm
  # that discarding the loaded head is intended.
  model.cls = BertOnlyMLMHead(model.config)

  model.to(device)

  # train data
  train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, args.seed)
  train_data = prepare_data(train_features)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data,
                                sampler=train_sampler,
                                batch_size=args.train_batch_size)

  # dev data
  dev_features = convert_examples_to_features(dev_examples,
                                              label_list,
                                              args.max_seq_length,
                                              tokenizer,
                                              args.seed)
  dev_data = prepare_data(dev_features)
  dev_sampler = SequentialSampler(dev_data)
  dev_dataloader = DataLoader(dev_data,
                              sampler=dev_sampler,
                              batch_size=args.train_batch_size)

  num_train_steps = int(len(train_features) / args.train_batch_size * args.num_train_epochs)
  logger.info("***** Running training *****")
  logger.info("  Num examples = %d", len(train_features))
  logger.info("  Batch size = %d", args.train_batch_size)
  logger.info("  Num steps = %d", num_train_steps)

  # optimizer: parameters whose name matches `no_decay` get no weight decay
  no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01},
      {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)

  best_dev_loss = float('inf')
  print(best_dev_loss)
  for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
    # Loss accumulated since the last report; averaged over 50 steps below.
    running_loss = 0.
    model.train()
    for step, batch in enumerate(train_dataloader):
      # Batch layout: (init_ids, input_ids, input_mask, masked_lm_labels).
      _, input_ids, input_mask, masked_ids = (t.to(device) for t in batch)
      inputs = {'input_ids': input_ids,
                'attention_mask': input_mask,
                'masked_lm_labels': masked_ids}

      outputs = model(**inputs)
      loss = outputs[0]
      loss.backward()
      running_loss += loss.item()
      optimizer.step()
      model.zero_grad()
      if (step + 1) % 50 == 0:
        print("avg_loss: {}".format(running_loss / 50))
        # Reset only after reporting, so the printed value really is the mean
        # over the last 50 steps rather than the last step's loss divided by 50.
        running_loss = 0.

    # eval on dev after every epoch
    # compute_dev_loss is not defined in this part of the notebook; it must be
    # available in the kernel before this function is called.
    dev_loss = compute_dev_loss(model, dev_dataloader)
    print("Epoch {}, Dev loss {}".format(epoch, dev_loss))
    if dev_loss < best_dev_loss:
      best_dev_loss = dev_loss
      print("Saving model. Best dev so far {}".format(best_dev_loss))
      save_model_path = os.path.join(args.output_dir, 'best_cmodbert.pt')
      torch.save(model.state_dict(), save_model_path)

In [54]:
# Start training with the settings in `args`; the second argument selects the
# training example (index 1) whose fields are printed before training begins.
train_cmodbert_and_augment(args, 1)

/content/drive/MyDrive/transformers-data-augmentation/datasets/TREC
train-1
How long is human gestation ?
None
5


03/10/2021 16:21:30 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at transformers_cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/10/2021 16:21:31 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at transformers_cache/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
03/10/2021 16:21:31 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
 

inf


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


KeyboardInterrupt: ignored