In [1]:
import pandas as pd
from pathlib import Path
import logging
from box import Box
from datetime import datetime
import sys
import torch

In [2]:
# !pip uninstall fastai

In [3]:
from fast_bert import BertLearner, BertDataBunch, accuracy

In [4]:
pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [5]:
PATH = Path("../sample_data/imdb_movie_reviews")
DATA_PATH = PATH/'data'
LABEL_PATH = PATH/'label'
OUT_PATH = PATH/'.output'
OUT_PATH.mkdir(exist_ok=True)

MODEL_PATH=OUT_PATH/'model'
MODEL_PATH.mkdir(exist_ok=True)

LOG_PATH = OUT_PATH/'logs/'
LOG_PATH.mkdir(exist_ok=True)

In [6]:
args = Box({
    "run_text": "ibdm_reviews",
    "max_seq_length": 512,
    "batch_size": 8,
    "learning_rate": 5e-3,
    "num_train_epochs": 6,
    "fp16": False,
    "model_name": 'distilroberta-base',
    "model_type": 'roberta'
})

device = torch.device('cuda') if torch.cuda.device_count() else torch.device('cpu')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

In [7]:
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

In [8]:
device

device(type='cpu')

In [9]:
databunch = BertDataBunch(DATA_PATH, LABEL_PATH, args.model_name, 
                          train_file="train_sample.csv", val_file="val_sample.csv",
                          batch_size_per_gpu=args.batch_size, 
                          max_seq_length=args.max_seq_length, 
                          multi_gpu=args.multi_gpu,
                          multi_label=False,
                          model_type=args.model_type
                         )

12/15/2019 01:52:02 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json from cache at /Users/kaushaltrivedi/.cache/torch/transformers/5f11352d3c3e932888f3ba75bc24579eacb5d1596d39ce56166aeae8fd363df8.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
12/15/2019 01:52:02 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt from cache at /Users/kaushaltrivedi/.cache/torch/transformers/01f63a14ad93494c050af2090c59930fb787bdfb347c4cad7ce9063e1a5fe140.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
12/15/2019 01:52:02 - INFO - root -   Loading features from cached file ../sample_data/imdb_movie_reviews/data/cache/cached_roberta_train_multi_class_512_train_sample.csv
12/15/2019 01:52:02 - INFO - root -   Loading features from cached file ../sample_data/imdb_movie_reviews/data/cache/cached_r

In [10]:
metrics = [{"name": "accuracy", "function": accuracy}]

In [11]:
learner = BertLearner.from_pretrained_model(databunch, args.model_name, metrics=metrics, 
                                            device=device, multi_gpu=args.multi_gpu, is_fp16=args.fp16,
                                            multi_label=False, logging_steps=0,
                                            output_dir=OUT_PATH, logger=logger
                                           )

12/15/2019 01:52:03 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json from cache at /Users/kaushaltrivedi/.cache/torch/transformers/d52ced8fd31ba6aa311b6eeeae65178cca00ddd6333c087be4601dc46c20bd96.0cc9c825897545d1d8c78f2343db2a450d3531eb9b0fb77a96cc487ebbb74210
12/15/2019 01:52:03 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_siz

In [12]:
learner.validate()

12/15/2019 01:52:08 - INFO - root -   Running evaluation
12/15/2019 01:52:08 - INFO - root -     Num examples = 50
12/15/2019 01:52:08 - INFO - root -     Batch size = 16


{'loss': 0.6959984004497528, 'accuracy': 0.38}

In [13]:
learner.fit(2, args.learning_rate, validate=True)

../sample_data/imdb_movie_reviews/.output/tensorboard
12/15/2019 01:52:41 - INFO - root -   ***** Running training *****
12/15/2019 01:52:41 - INFO - root -     Num examples = 100
12/15/2019 01:52:41 - INFO - root -     Num Epochs = 2
12/15/2019 01:52:41 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
12/15/2019 01:52:41 - INFO - root -     Gradient Accumulation steps = 1
12/15/2019 01:52:41 - INFO - root -     Total optimization steps = 26


12/15/2019 01:56:18 - INFO - root -   Running evaluation
12/15/2019 01:56:18 - INFO - root -     Num examples = 50
12/15/2019 01:56:18 - INFO - root -     Batch size = 16


12/15/2019 01:56:57 - INFO - root -   eval_loss after epoch 1: 0.47291456162929535: 
12/15/2019 01:56:57 - INFO - root -   eval_accuracy after epoch 1: 0.82: 
12/15/2019 01:56:57 - INFO - root -   lr after epoch 1: 0.0025
12/15/2019 01:56:57 - INFO - root -   train_loss after epoch 1: 0.6882386070031387
12/15/2019 01:56:57 - INFO - root -   

12/15/2019 02:01:45 - INFO - root -   Running evaluation
12/15/2019 02:01:45 - INFO - root -     Num examples = 50
12/15/2019 02:01:45 - INFO - root -     Batch size = 16


12/15/2019 02:02:21 - INFO - root -   eval_loss after epoch 2: 0.3704464165493846: 
12/15/2019 02:02:21 - INFO - root -   eval_accuracy after epoch 2: 0.84: 
12/15/2019 02:02:21 - INFO - root -   lr after epoch 2: 0.0
12/15/2019 02:02:21 - INFO - root -   train_loss after epoch 2: 0.23913450768360725
12/15/2019 02:02:21 - INFO - root -   



(26, 0.46368655734337294)

In [14]:
learner.save_model()

12/15/2019 02:02:22 - INFO - transformers.configuration_utils -   Configuration saved in ../sample_data/imdb_movie_reviews/.output/model_out/config.json
12/15/2019 02:02:22 - INFO - transformers.modeling_utils -   Model weights saved in ../sample_data/imdb_movie_reviews/.output/model_out/pytorch_model.bin


In [15]:
from fast_bert.prediction import BertClassificationPredictor

In [16]:
predictor = BertClassificationPredictor(OUT_PATH/'model_out', LABEL_PATH, multi_label=False, model_type=args.model_type)

12/15/2019 02:02:22 - INFO - transformers.tokenization_utils -   Model name '../sample_data/imdb_movie_reviews/.output/model_out' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming '../sample_data/imdb_movie_reviews/.output/model_out' is a path or url to a directory containing tokenizer files.
12/15/2019 02:02:22 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/vocab.json
12/15/2019 02:02:22 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/merges.txt
12/15/2019 02:02:22 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/added_tokens.json
12/15/2019 02:02:22 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/special_tokens_ma

In [18]:
predictor.predict_batch(["i hate you"])

12/15/2019 02:03:05 - INFO - root -   Writing example 0 of 1


[[('0', 0.934996485710144), ('1', 0.06500352919101715)]]