In [None]:
# (Re)load IPython's autoreload extension and set it to mode 2, so imported
# modules are reloaded before code is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
# !rm *.txt
# !rm cached*
# !rm -rf wandb/
# !rm -rf runs/

In [1]:
!pip install transformers
!pip install dict_to_obj
!pip install wandb

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.6MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 13.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 30.0MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K 

In [2]:
# Copy the IMDB train/val/test text files from the project folder under
# /content/drive (presumably a mounted Google Drive -- confirm) into the
# current working directory, where the data arguments expect to find them.
!cp /content/drive/My\ Drive/UCSC/Courses/CSE115A/cse115a_group_project/data/imdb/train.txt .
!cp /content/drive/My\ Drive/UCSC/Courses/CSE115A/cse115a_group_project/data/imdb/val.txt .
!cp /content/drive/My\ Drive/UCSC/Courses/CSE115A/cse115a_group_project/data/imdb/test.txt .

In [None]:
import os
import math
import random
import logging
import warnings
import collections
import wandb
from dict_to_obj import DictToObj

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Log in to Weights & Biases; this runs before transformers is imported.
# NOTE(review): presumably prompts for an API key when none is stored -- confirm.
wandb.login()

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Separator token string; it is registered as the tokenizer's `sep_token`
# further down in this cell, after the tokenizer has been loaded.
sep = "<|sep|>"


def get_dataset(args, tokenizer, evaluate=False):
    """Build the language-modeling dataset for the train or eval split.

    Args:
        args: Object exposing ``train_data_file``, ``eval_data_file``,
            ``line_by_line``, ``block_size`` and (for the non line-by-line
            case) ``overwrite_cache``.
        tokenizer: Tokenizer forwarded to the dataset class.
        evaluate: When true, read ``args.eval_data_file``; otherwise read
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is truthy,
        otherwise a ``TextDataset``.
    """
    # Pick the source file for the requested split.
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    if not args.line_by_line:
        # Only this dataset class is given the cache-overwrite flag.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer, file_path=source_path, block_size=args.block_size
    )


# Fine-tuning script: configure arguments, load GPT-2 and its tokenizer,
# register the separator token, build the datasets and run the Trainer.

# Logging
logger = logging.getLogger(__name__)
# Model classes
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# These arguments could have been handled by a CLI, but they are set inline
# to keep the notebook simple.

# Model arguments
# Plain dicts are used on purpose: a defaultdict created without a
# default_factory behaves exactly like a dict, so that name was only misleading.
# NOTE(review): config_name, model_type and tokenizer_name are not read in this
# cell -- only model_name_or_path and cache_dir are.
model_args = dict(
    config_name="gpt2",
    model_name_or_path="gpt2",
    model_type="gpt2",
    tokenizer_name="gpt2",
    cache_dir=None,
)
# Data arguments
data_args = dict(
    train_data_file="train.txt",
    eval_data_file="val.txt",
    line_by_line=False,
    mlm=False,
    mlm_probability=0.15,
    block_size=512,
    overwrite_cache=False,
)
# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/models/gpt2_imdb",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluate_during_training=True,
    # per_device_* replaces the deprecated per_gpu_* batch-size arguments.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    weight_decay=0.0,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    num_train_epochs=5,
    max_steps=-1,
    warmup_steps=0,
    logging_dir=None,
    logging_first_step=False,
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=100000,
    no_cuda=False,
    seed=42,
    fp16=False,
    fp16_opt_level="O1",
    local_rank=-1,
)
# Convert dicts to objects so the settings can be read as attributes
model_args = DictToObj(model_args)
data_args = DictToObj(data_args)

# Logging: INFO when local_rank is -1 or 0, WARN for any other rank
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    training_args.local_rank != -1,
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Seed
set_seed(training_args.seed)

# Load config, tokenizer and model from the same name or path
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path, cache_dir=model_args.cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, cache_dir=model_args.cache_dir
)
model = AutoModelWithLMHead.from_pretrained(
    model_args.model_name_or_path,
    # Treat the source as a TensorFlow checkpoint when its name contains ".ckpt"
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

# Add the separator as a special token, then resize the embedding matrix so it
# matches the enlarged vocabulary
tokenizer.add_special_tokens({"sep_token": sep})
model.resize_token_embeddings(len(tokenizer))

# Load datasets (None for a split that is disabled in training_args)
train_dataset = (
    get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
)

eval_dataset = (
    get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
    if training_args.do_eval
    else None
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

# Define model path: only a local directory is passed on to trainer.train();
# a hub model name such as "gpt2" yields None
model_path = (
    model_args.model_name_or_path
    if model_args.model_name_or_path is not None
    and os.path.isdir(model_args.model_name_or_path)
    else None
)

# Train the model. Guarded by do_train because train_dataset is None when
# training is disabled; train_results stays None in that case.
train_results = None
if training_args.do_train:
    train_results = trainer.train(model_path=model_path)
    trainer.save_model()
    # Save the tokenizer (including the added separator token) to the output directory
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
07/07/2020 16:53:37 - INFO - transformers.training_args -   PyTorch: setting up devices
07/07/2020 16:53:37 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/drive/My Drive/models/gpt2_imdb', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=True, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=8, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=1000, save_steps=1000, save_total_limit=100000, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=1000, past_index=-1)
07/07/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…

07/07/2020 16:53:39 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json in cache at /root/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d809a9cbcac6798e24e
07/07/2020 16:53:39 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d809a9cbcac6798e24e
07/07/2020 16:53:39 - INFO - filelock -   Lock 140102306230680 released on /root/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d809a9cbcac6798e24e.lock
07/07/2020 16:53:39 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /root/.cache/torch/transform




07/07/2020 16:53:39 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /root/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d809a9cbcac6798e24e
07/07/2020 16:53:39 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-gene

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

07/07/2020 16:53:42 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json in cache at /root/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
07/07/2020 16:53:42 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
07/07/2020 16:53:42 - INFO - filelock -   Lock 140099895503112 released on /root/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock





07/07/2020 16:53:43 - INFO - filelock -   Lock 140099895503672 acquired on /root/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
07/07/2020 16:53:43 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpe_xrk24t


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

07/07/2020 16:53:45 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt in cache at /root/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
07/07/2020 16:53:45 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
07/07/2020 16:53:45 - INFO - filelock -   Lock 140099895503672 released on /root/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
07/07/2020 16:53:45 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /root/.cache/torch/transformers/f2808208




07/07/2020 16:53:46 - INFO - filelock -   Lock 140099739692392 acquired on /root/.cache/torch/transformers/d71fd633e58263bd5e91dd3bde9f658bafd81e11ece622be6a3c2e4d42d8fd89.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1.lock
07/07/2020 16:53:46 - INFO - transformers.file_utils -   https://cdn.huggingface.co/gpt2-pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpgpjooaig


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…

07/07/2020 16:53:52 - INFO - transformers.file_utils -   storing https://cdn.huggingface.co/gpt2-pytorch_model.bin in cache at /root/.cache/torch/transformers/d71fd633e58263bd5e91dd3bde9f658bafd81e11ece622be6a3c2e4d42d8fd89.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
07/07/2020 16:53:52 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/d71fd633e58263bd5e91dd3bde9f658bafd81e11ece622be6a3c2e4d42d8fd89.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
07/07/2020 16:53:52 - INFO - filelock -   Lock 140099739692392 released on /root/.cache/torch/transformers/d71fd633e58263bd5e91dd3bde9f658bafd81e11ece622be6a3c2e4d42d8fd89.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1.lock
07/07/2020 16:53:52 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/gpt2-pytorch_model.bin from cache at /root/.cache/torch/transformers/d71fd633e58263bd5e91dd3bde9f658bafd81e11ece62




07/07/2020 16:53:56 - INFO - transformers.modeling_utils -   All model checkpoint weights were used when initializing GPT2LMHeadModel.

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/07/2020 16:53:56 - INFO - transformers.tokenization_utils_base -   Assigning <|sep|> to the sep_token key of the tokenizer
07/07/2020 16:53:56 - INFO - transformers.tokenization_utils -   Adding <|sep|> to the vocabulary
07/07/2020 16:53:57 - INFO - filelock -   Lock 140099745740000 acquired on cached_lm_GPT2Tokenizer_512_train.txt.lock
07/07/2020 16:53:57 - INFO - transformers.data.datasets.language_modeling -   Creating features from dataset file at 
07/07/2020 16:54:22 - INFO - transformers.data.datasets.language_modeling -   Saving features into cached file cached_lm_GPT2Tokenizer_512_train.txt [took 0.138 s]
07/07/2020 16:54:22 - INFO - filelock -   Lock 140099745740000 released on cached_lm_GPT2Tokenizer_512_train.txt.lock
07/07/2020 

07/07/2020 16:54:44 - INFO - wandb.run_manager -   system metrics and metadata threads started
07/07/2020 16:54:44 - INFO - wandb.run_manager -   checking resume status, waiting at most 10 seconds
07/07/2020 16:54:44 - INFO - wandb.run_manager -   resuming run from id: UnVuOnYxOmtpOG85eGYxOmh1Z2dpbmdmYWNlOnlsaXUyOTg=
07/07/2020 16:54:44 - INFO - wandb.run_manager -   upserting run before process can begin, waiting at most 10 seconds


In [None]:
# Run the wandb CLI's `off` command before the evaluation cell below.
# NOTE(review): presumably this disables Weights & Biases logging for
# subsequent runs in this directory -- confirm against the wandb CLI docs.
!wandb off

In [None]:
def load_eval_dataset(file_path):
    """Build an evaluation dataset from ``file_path``.

    ``get_dataset`` reads the file name from ``data_args.eval_data_file``, so
    that attribute is overridden only for the duration of the call and then
    restored. The shared configuration is therefore unchanged afterwards and
    this cell can be re-run safely.

    Args:
        file_path: Path of the text file to evaluate on.

    Returns:
        The dataset object produced by ``get_dataset``.
    """
    previous_file = data_args.eval_data_file
    data_args.eval_data_file = file_path
    try:
        return get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
    finally:
        data_args.eval_data_file = previous_file


def evaluate_split(split, dataset):
    """Evaluate ``dataset``, log its perplexity and write it to a results file.

    The dataset is passed straight to ``trainer.evaluate`` instead of being
    assigned to ``trainer.eval_dataset``, so the trainer keeps its original
    validation set.

    Args:
        split: Short split name ("valid", "test" or "train"); used in the log
            messages, the result key and the output file name.
        dataset: Dataset to evaluate on.

    Returns:
        Tuple ``(eval_output, perplexity, result)``: the raw output of
        ``trainer.evaluate``, ``exp(eval_loss)``, and the single-entry dict
        ``{"<split>_perplexity": perplexity}`` that was written to disk.
    """
    title = split.capitalize()
    logger.info("*** %s Evaluate ***", title)
    eval_output = trainer.evaluate(eval_dataset=dataset)
    perplexity = math.exp(eval_output["eval_loss"])
    result = {"%s_perplexity" % split: perplexity}
    output_eval_file = os.path.join(
        training_args.output_dir, "%s_eval_results_lm.txt" % split
    )

    with open(output_eval_file, "w") as writer:
        logger.info("***** %s Eval results *****", title)
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return eval_output, perplexity, result


# Evaluation on validation set (the dataset built alongside the trainer)
valid_eval_output, valid_perplexity, valid_result = evaluate_split(
    "valid", eval_dataset
)

# Evaluation on test set
test_dataset = load_eval_dataset("test.txt")
test_eval_output, test_perplexity, test_result = evaluate_split("test", test_dataset)

# Evaluation on training set
train_eval_dataset = load_eval_dataset(data_args.train_data_file)
train_eval_output, train_perplexity, train_result = evaluate_split(
    "train", train_eval_dataset
)


print(f"Train loss: {train_eval_output['eval_loss']}")
print(f"Valid loss: {valid_eval_output['eval_loss']}")
print(f"Test loss: {test_eval_output['eval_loss']}")
print(f"Train PPL: {train_perplexity}")
print(f"Valid PPL: {valid_perplexity}")
print(f"Test PPL: {test_perplexity}")
