In [None]:
# Delete any earlier checkout so the `git clone` cell further down starts from a clean directory.
!rm -rf /content/tapt

In [None]:
# Work from /content; the repository is cloned relative to this directory later on.
%cd /content

/content


In [None]:
!pip install dict2obj
!pip install trl
!pip install transformers==2.6.0
!pip install git+https://github.com/huggingface/nlp.git

Collecting dict2obj
  Downloading https://files.pythonhosted.org/packages/4c/e3/96f55f458f853923eb33d37fac3849f195b6d5f8c3057c4a154f61926672/dict2obj-1.2.0.tar.gz
Building wheels for collected packages: dict2obj
  Building wheel for dict2obj (setup.py) ... [?25l[?25hdone
  Created wheel for dict2obj: filename=dict2obj-1.2.0-cp36-none-any.whl size=2919 sha256=cab2a3297fb8f53179c646cb6e595dba4cb3dff9cf395b62a7ac8e5fede5e3c4
  Stored in directory: /root/.cache/pip/wheels/0c/67/8d/85e44c85b54f061c997bbab6e7f7340892f925a9f9d39e3711
Successfully built dict2obj
Installing collected packages: dict2obj
Successfully installed dict2obj-1.2.0
Collecting trl
  Downloading https://files.pythonhosted.org/packages/ea/b5/179e2a8b33cdd78b76ee8df03a7235bc38f9de37c1d88f312fd2c1dee9a0/trl-0.0.2-py3-none-any.whl
Collecting transformers==2.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (5

In [None]:
# Fetch the project sources, then switch into src/ — presumably where the `utils`,
# `classifier` and `generator` modules imported by later cells live (verify against the repo).
!git clone https://github.com/chrisliu298/tapt.git
%cd /content/tapt/src

Cloning into 'tapt'...
remote: Enumerating objects: 254, done.[K
remote: Counting objects: 100% (254/254), done.[K
remote: Compressing objects: 100% (179/179), done.[K
remote: Total 701 (delta 132), reused 182 (delta 65), pack-reused 447[K
Receiving objects: 100% (701/701), 147.67 MiB | 25.37 MiB/s, done.
Resolving deltas: 100% (341/341), done.
Checking out files: 100% (61/61), done.
/content/tapt/src


In [None]:
# Same target directory as the `%cd` in the clone cell; repeated here as its own cell.
%cd /content/tapt/src

/content/tapt/src


## GPT-2 Train and Evaluation Test

In [None]:
import collections
import math
import os

from dict2obj import Dict2Obj
from transformers import AutoConfig
from transformers import AutoModelWithLMHead
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import TrainingArguments
from transformers import set_seed

from utils.data_pipeline import get_dataset
from utils.metrics import evaluate_gpt2


# Special tokens registered with the tokenizer inside train_gpt2():
# START_TOKEN is added as the `bos_token`, SEP_TOKEN as the `sep_token`.
START_TOKEN = "<|startoftext|>"
SEP_TOKEN = "<|sep|>"


def train_gpt2(run_training=False):
    """Build a GPT-2 (medium) causal-LM ``Trainer`` and print its test-set score.

    The function loads the ``gpt2-medium`` config, tokenizer and weights, adds
    the ``SEP_TOKEN`` / ``START_TOKEN`` special tokens (resizing the embedding
    matrix to match), builds the train and validation datasets with
    ``get_dataset`` and finally prints whatever ``evaluate_gpt2`` returns for
    ``/content/test.txt`` (presumably a perplexity -- confirm in
    ``utils.metrics``).

    Args:
        run_training: When True, fine-tune the model first and save the model
            and tokenizer to ``training_args.output_dir``. Defaults to False,
            which keeps the earlier behaviour of this cell (the training calls
            were commented out, so only the evaluation ran).

    Returns:
        None. The evaluation result is printed.
    """
    # Model arguments. Only `model_name_or_path` and `cache_dir` are read below;
    # `config_name`, `model_type` and `tokenizer_name` are not used in this function.
    model_args = dict(
        config_name="gpt2",
        model_name_or_path="gpt2-medium",
        model_type="gpt2",
        tokenizer_name="gpt2",
        cache_dir=None,
    )

    # Data arguments
    data_args = dict(
        train_data_file="/content/train.txt",
        eval_data_file="/content/val.txt",
        line_by_line=False,
        mlm=False,
        mlm_probability=0.15,
        block_size=512,
        overwrite_cache=False,
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="/content",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluate_during_training=True,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=1,
        learning_rate=2e-5,
        weight_decay=0.0,
        adam_epsilon=1e-08,
        max_grad_norm=1.0,
        num_train_epochs=5,
        max_steps=-1,
        warmup_steps=0,
        logging_dir=None,
        logging_first_step=False,
        logging_steps=1000,
        eval_steps=1000,
        save_steps=2000,
        save_total_limit=100000,
        no_cuda=False,
        seed=42,
        fp16=False,
        fp16_opt_level="O1",
        local_rank=-1,
    )

    # Expose the plain dicts through attribute access (model_args.cache_dir, ...)
    model_args = Dict2Obj(model_args)
    data_args = Dict2Obj(data_args)

    # Set seed before any weights are created
    set_seed(training_args.seed)

    # Load config, tokenizer and model from the same checkpoint
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )
    model = AutoModelWithLMHead.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Add special tokens (separator first, then start token, as before) and grow
    # the embedding matrix so the new token ids have rows.
    tokenizer.add_special_tokens({"sep_token": SEP_TOKEN})
    tokenizer.add_special_tokens({"bos_token": START_TOKEN})
    model.resize_token_embeddings(len(tokenizer))

    # Load datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
        if training_args.do_eval
        else None
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    if run_training:
        # Pass a checkpoint path to `train` only when the model was loaded from
        # a local directory; for a hub name such as "gpt2-medium" this is None.
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None
            and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluate the model
    ppl = evaluate_gpt2("/content/test.txt", training_args, data_args, trainer, tokenizer)
    print(ppl)

ImportError: ignored

In [None]:
# Run the GPT-2 routine defined in the previous cell.
train_gpt2()

## BERT Train and Evaluation Test

In [None]:
import pandas as pd
from pprint import pprint
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments

from utils.data_pipeline import prepare_data
from utils.data_pipeline import prepare_custom_data
from utils.metrics import compute_metrics



def train_bert():
    """Fine-tune DistilRoBERTa for sequence classification and print its metrics.

    The model is trained on the custom dataset loaded from
    ``/content/nlp_yelp_train.tsv`` and validated on the ``yelp_polarity``
    validation split during training. After training, the metric dictionaries
    for the ``yelp_polarity`` train, validation and test splits are
    pretty-printed, in that order.

    Returns:
        None. The three metric dictionaries are printed.
    """
    # Imported locally because this cell's import list does not include it.
    from transformers import set_seed

    # One checkpoint name for both tokenizer and model so the two cannot drift apart.
    checkpoint = "distilroberta-base"
    seed = 42

    # Seed before the model is built: the classification head is newly
    # initialised, so without this every run starts from different weights.
    set_seed(seed)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    def tokenize(batch):
        """Tokenize a batch of data (with padding and truncation).

        Arg:
            batch: A batch of training data; its ``"text"`` entry is tokenized.
        """
        return tokenizer(
            batch["text"], padding="max_length", truncation=True, max_length=512
        )

    # Load dataset
    train_dataset, val_dataset, test_dataset = prepare_data(
        tokenize_func=tokenize,
        dataset_name="yelp_polarity",
        train_count=10,
        train_size=5,
        val_size=5,
        use_all_test=False,
        test_count=10,
        test_size=5,
        others=5,
        seed=seed,
    )
    # Load custom data
    augmented = prepare_custom_data(
        tokenize_func=tokenize, dataset_name="/content/nlp_yelp_train.tsv"
    )

    # Define training arguments
    training_args = TrainingArguments(
        adam_epsilon=1e-08,
        eval_steps=1000,
        logging_steps=1000,
        evaluate_during_training=True,
        gradient_accumulation_steps=1,
        learning_rate=5e-05,
        logging_dir="/content/logs",
        max_grad_norm=1.0,
        num_train_epochs=4,
        output_dir="/content/drive/My Drive/models/distilroberta",
        per_device_eval_batch_size=32,
        per_device_train_batch_size=32,
        save_steps=1000,
        seed=seed,
        warmup_steps=0,
        weight_decay=0.0,
    )

    # Define trainer. Note that the training set is the custom data, not
    # `train_dataset`; the latter is only scored after training.
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=augmented,
        eval_dataset=val_dataset,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    train_score = trainer.evaluate(eval_dataset=train_dataset)
    val_score = trainer.evaluate(eval_dataset=val_dataset)
    test_score = trainer.evaluate(eval_dataset=test_dataset)

    pprint(train_score)
    pprint(val_score)
    pprint(test_score)


In [None]:
# Run the fine-tuning routine defined in the previous cell; it prints three metric dictionaries.
train_bert()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…





HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1.0, style=ProgressStyle(description_wid…


{'epoch': 4.0,
 'eval_accuracy': 0.8,
 'eval_f1': 0.8571428571428571,
 'eval_loss': 0.6770969033241272,
 'eval_precision': 0.75,
 'eval_recall': 1.0}
{'epoch': 4.0,
 'eval_accuracy': 0.2,
 'eval_f1': 0.3333333333333333,
 'eval_loss': 0.7054764032363892,
 'eval_precision': 0.25,
 'eval_recall': 0.5}
{'epoch': 4.0,
 'eval_accuracy': 0.4,
 'eval_f1': 0.4,
 'eval_loss': 0.6909704804420471,
 'eval_precision': 0.25,
 'eval_recall': 1.0}


## Classifier Test

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

from classifier import Classifier

# Load the tokenizer (fast implementation) and the classification model from the
# same saved checkpoint directory, then wrap both in the project's Classifier.
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/My Drive/models/distilroberta_yelp", use_fast=True
)
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/My Drive/models/distilroberta_yelp"
)
clf = Classifier(tokenizer=tokenizer, model=model)

# Smoke test: classify two sentences and print each result.
for review in ("The restaurant is really bad", "The restaurant is really good"):
    print(clf.classify(review))

[{'label': 'LABEL_0', 'score': 0.9998082518577576}]
[{'label': 'LABEL_1', 'score': 0.9994127750396729}]


## Generator Test

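The GPT-2 PPO generator requires `transformers==2.6.0`, so the next cell reinstalls that version. If a different `transformers` version has already been imported in this runtime, restart the runtime afterwards so the downgrade takes effect.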

In [None]:
!pip uninstall transformers
!pip install transformers==2.6.0

Uninstalling transformers-3.0.2:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.6/dist-packages/transformers-3.0.2.dist-info/*
    /usr/local/lib/python3.6/dist-packages/transformers/*
Proceed (y/n)? y
  Successfully uninstalled transformers-3.0.2
Collecting transformers==2.6.0
  Using cached https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl
Collecting tokenizers==0.5.2
  Using cached https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: tokenizers, transformers
  Found existing installation: tokenizers 0.8.1rc2
    Uninstalling tokenizers-0.8.1rc2:
      Successfully uninstalled tokenizers-0.8.1rc2
Successfully installed tokenizers-0.5.2 transformers-2.6.0


### GPT-2 Generator

In [None]:
import torch
from pprint import pprint
from transformers import GPT2LMHeadModel, GPT2Tokenizer

from generator import GPT2Generator

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gpt2_generator = GPT2Generator(device)

# Stock GPT-2 tokenizer plus the saved language model, moved onto `device`.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("/content/drive/My Drive/models/gpt2_imdb")
model.to(device)

# Generate once per prompt and pretty-print the first returned sample.
for prompt in (
    "[positive] <|sep|> The movie is really",
    "[negative] <|sep|> The movie is really",
):
    pprint(gpt2_generator.generate(tokenizer, model, prompt)[0])

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


('<|startoftext|> [positive] <|sep|> The movie is really quite gripping and '
 'extremely interesting. The entire movie is really well done. The acting was '
 'very good. The music really worked and added a little bit to the movie. The '
 "shooting was really well done. The make-up job was good, although it wasn't "
 'great, but could have been better. The horror factor was pretty intense and '
 "didn't appear to me to be just some sloppy work by the director. The "
 'effects, for me at least, were just what I expected, with a few hits of the '
 '"extreme" FX one could expect. I enjoyed the movie because it was really '
 "well done. There wasn't too much I didn't like, I just thought some of the "
 'plot holes were pretty big (Like the green-hued pyramid structure which I '
 "don't see in many movies) for what it was, but I was glad that the movie "
 "wasn't too cleverly written as a whole.<br /><br />-End of Spelling "
 'section.<br /><br />PS: I also liked the main character. I think

### GPT-2 PPO Generator

In [None]:
from trl.gpt2 import GPT2HeadWithValueModel
from trl.gpt2 import respond_to_batch

from generator import GPT2PPOGenerator

# `device`, `GPT2Tokenizer` and `pprint` come from the GPT-2 generator cell above.
gpt2_ppo_generator = GPT2PPOGenerator(device)

# Tokenizer and value-head model are both loaded from the same saved directory.
tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/My Drive/models/gpt2_ppo_imdb")
model = GPT2HeadWithValueModel.from_pretrained("/content/drive/My Drive/models/gpt2_ppo_imdb")
model.to(device)

# Generate once per prompt and pretty-print the returned text.
for prompt in (
    "[positive] The movie is really",
    "[negative] The movie is really",
):
    pprint(gpt2_ppo_generator.generate(tokenizer, model, prompt))

('[positive] The movie is really good.\n'
 '\n'
 'It really amazes me.\n'
 '\n'
 "It is very different. It's a very funny film about guys who are trying to "
 "behave for their common interests. It's very different in scale and "
 "direction. I'm not a lot of sex story theorists. In fact I'm looking for "
 'growth on the train!\n'
 '\n'
 'Alexandra "Gemme" I saw here! The baby is really very extraordinary!\n'
 '\n'
 'It is really incredible! The movie is really funny! And probably in other '
 'folks the entire time but you will pick it up that it is just me talking (in '
 'a separate part of it will be my love show in all of it, because I really '
 "enjoy what she's changed I did as an out of contract – what they did last "
 'time. One of the biggest things I love about me being from that family - I '
 "didn't think I would talk to women. The one gave me that coal, probably "
 "curious one of the little out there! That's absolutely absorbing!\n"
 '\n'
 "For years I always thoughtful! T