## 1. One-time installs

In [None]:
# Report the NVIDIA GPU(s) visible to this runtime before installing anything.
!nvidia-smi

#### Download and install NVIDIA Apex

In [None]:
!git clone https://github.com/NVIDIA/apex
% cd apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
% cd /content

#### Clone & install pretrained BERT repo

In [None]:
# Fetch the pytorch-pretrained-BERT sources; they are used directly from the
# checkout (the package itself is not pip-installed in this cell).
! git clone https://github.com/huggingface/pytorch-pretrained-BERT

import sys
# Make the checkout importable. The path is relative, so it resolves against
# the current working directory at import time.
sys.path.insert(0, 'pytorch-pretrained-BERT')

# NOTE(review): presumably a runtime dependency of the cloned package — confirm.
! pip install regex

## 2. Main routine

In [None]:
import os
import sys
import shutil
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import logging
from datetime import datetime
import multiprocessing
import requests
import zipfile

import torch
from torch.utils.data import TensorDataset, DataLoader

from apex import amp

sys.path.insert(0, 'pytorch-pretrained-BERT')
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig

print(f'n_cpus={multiprocessing.cpu_count()}')

# Prefer the GPU but fall back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    # The torch.cuda.* queries raise on a non-CUDA device, so they are only
    # issued when the CUDA branch above was actually taken.
    print(f'device={device}, '+
          f'type: {torch.cuda.get_device_name(device)}, ' +
          f'CUDA capability: {torch.cuda.get_device_capability(device)}')
else:
    print(f'device={device}')

# One log file per session, named with the start time (month-day, hour-minute).
log_date = datetime.now().strftime('%m%d-%H%M')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(message)s',
                    datefmt='%H:%M:%S',
                    filename='/content/BERT-' + log_date + '.txt',
                    filemode='w')

# Root logger; every later cell reports through it.
logger1 = logging.getLogger('')

#### Retrieve and init model

In [None]:
MODELS_DIR = '/content/models/BERT/'
SOURCE_PATH = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'

  # BASE models: 12 layers, Hdim 768, 12 heads
  # 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'
  # 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip'

  # LARGE models: 24 layers, Hdim 1024, 16 heads
  # 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'
  # 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip'

  # large, whole word masking
  # https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
  # https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip

# Derive every local path from the archive name so that switching SOURCE_PATH
# to another model needs no other edits.
model_fname = SOURCE_PATH.split('/')[-1]
model_path = os.path.join(MODELS_DIR, model_fname.split('.')[0])
zip_path = os.path.join(MODELS_DIR, model_fname)
config_path = os.path.join(model_path, 'config.json')
weights_path = os.path.join(model_path, 'pytorch_model.bin')

os.makedirs(MODELS_DIR, exist_ok=True)

# Key the download on this model's own directory, not on MODELS_DIR: otherwise
# a second model, or a run interrupted after the directory was created, would
# never be fetched.
if not os.path.exists(model_path):
    with requests.get(SOURCE_PATH, stream=True) as r:
        # Fail loudly on an HTTP error instead of saving an error page as a zip.
        r.raise_for_status()
        with open(zip_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as f:
        f.extractall(MODELS_DIR)
    os.remove(zip_path)

# from_pretrained() is later pointed at this directory and the converter below
# reads 'config.json', so the archive's 'bert_config.json' is renamed once.
if not os.path.exists(config_path):
    shutil.move(os.path.join(model_path, 'bert_config.json'), config_path)

# Convert the TensorFlow checkpoint only once; re-running the cell is then cheap.
if not os.path.exists(weights_path):
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        model_path + '/bert_model.ckpt',
        config_path,
        weights_path)

#### Load & process data

In [None]:
def get_inputs(df_in, train_val_split):
    """Split the source data into training and validation texts and labels.

    NOTE(review): the loader body is elided in this notebook, so the contract
    below is taken from the author's original comments and must be confirmed
    against the real implementation.

    Args:
        df_in: source data to split — presumably a pandas DataFrame; TODO confirm.
        train_val_split: presumably the fraction of records assigned to the
            training set (the call below passes 0.8); TODO confirm.

    Returns:
        train_texts, train_labels, val_texts, val_labels, where the texts are
        np.array of str and the labels are np.array of np.int64 (per the
        author's notes).
    """

    # ...LOADER CODE...
    
    return train_texts, train_labels, val_texts, val_labels

# NOTE(review): `train1` is not defined in any visible cell; it has to be
# loaded before this call for the notebook to run top to bottom.
train_texts, train_labels, val_texts, val_labels = get_inputs(train1, 0.8)

print(f'train records: {len(train_texts)} val records: {len(val_texts)}')

#### Tokenize train text samples

In [None]:
# Width of every tokenized sample, passed to tokenize() as max_len.
MAX_LEN = 256

# Tokenizer files are read from the local model directory; text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    model_path,
    do_lower_case=True,
    cache_dir=None,
)

def tokenize(sentences, tokenizer, max_len):
    """Convert raw sentences into a zero-padded matrix of token ids.

    Each sentence is tokenized, truncated so that it still fits once the
    '[CLS]' and '[SEP]' markers are added, converted to ids and written
    left-aligned into its row. Unused trailing positions stay 0.

    Args:
        sentences: sequence of str to encode.
        tokenizer: object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_len: number of columns per row; must be at least 2 to hold the
            two marker tokens.

    Returns:
        np.ndarray of shape (len(sentences), max_len) with dtype int64.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # A negative slice bound below would silently drop the wrong tokens.
        raise ValueError(f'max_len must be >= 2 to fit [CLS] and [SEP], got {max_len}')

    # Ids are integers; an integer matrix avoids a float64 round trip before
    # the later conversion to a long tensor.
    ret = np.zeros((len(sentences), max_len), dtype=np.int64)
    for i, sentence in enumerate(tqdm(sentences, mininterval=10)):
        tokens = tokenizer.tokenize(sentence)[:max_len-2]
        indexed_tokens = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
        ret[i, :len(indexed_tokens)] = indexed_tokens
    return ret

# Build the training tensors from the arrays returned by get_inputs(). No cell
# in this notebook defines `train_df`, so reading from it only worked through
# leftover kernel state.
Xt = tokenize(train_texts, tokenizer, MAX_LEN)
yt = train_labels
# Labels are stored as float because the loss used for training is a
# binary cross-entropy on a single logit.
train_ds = TensorDataset(torch.tensor(Xt, dtype=torch.long), torch.tensor(yt, dtype=torch.float))

#### Init model

In [None]:
# Load the converted weights with a single-output classification head and
# move the network to the selected device.
model = BertForSequenceClassification.from_pretrained(model_path, cache_dir=None, num_labels=1)
model = model.to(device)

model_params = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# Partition the parameters in their original order: names containing any of
# the `no_decay` fragments get weight_decay 0.0, everything else gets 0.01.
decay_params, no_decay_params = [], []
for name, param in model_params:
    if any(fragment in name for fragment in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0}]

#### Train

In [None]:
BATCH_SIZE = 32
N_EPOCHS = 1
ETA = 0.000004
OPT_LEVEL = 'O1'
accumulation_steps = 2   # micro-batches accumulated per optimizer step
log_interval = 500       # micro-batches between log lines

n_train = len(train_ds)
batches_per_epoch = int(np.ceil(n_train / BATCH_SIZE))
# BertAdam advances its warmup/decay schedule once per optimizer.step(), so
# t_total must count optimizer steps, not micro-batches. Counting micro-batches
# would leave the schedule only 1/accumulation_steps of the way through.
steps_per_epoch = int(np.ceil(batches_per_epoch / accumulation_steps))

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=ETA,
                     warmup=0.05,
                     t_total=N_EPOCHS * steps_per_epoch)

model, optimizer = amp.initialize(model, optimizer, opt_level=OPT_LEVEL, verbosity=1)

model = model.train()

xentropy = torch.nn.functional.binary_cross_entropy_with_logits

# Unscaled per-micro-batch losses across all epochs (plotted later).
losses = []

logger1.info(f'train hparams:\n   train recs: {n_train:,}\n'
             + f'   max_len: {MAX_LEN}\n   n_epochs: {N_EPOCHS}\n'
             + f'   batch size: {BATCH_SIZE}\n   eta: {ETA}\n'
             + f'   accumulation steps: {accumulation_steps}\n'
             + f'   opt_level: {OPT_LEVEL}'
           )

for epoch in range(N_EPOCHS):
    logger1.info(f'epoch {epoch+1} of {N_EPOCHS} training:')

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    n_batches = len(train_loader)

    optimizer.zero_grad()
    tq = tqdm(enumerate(train_loader), total=n_batches, mininterval=30, maxinterval=60)
    for step, batch in tq:
        X_batch = batch[0].to(device)
        y_batch = batch[1].view(-1,1).to(device)
        # Id 0 is the padding value written by tokenize(); mask it out.
        mask = X_batch > 0
        y_pred = model(X_batch, attention_mask=mask, labels=None)

        loss = xentropy(y_pred, y_batch)

        # Divide by accumulation_steps so the accumulated gradient is the mean
        # over the group rather than the sum.
        with amp.scale_loss(loss / accumulation_steps, optimizer) as scaled_loss:
            scaled_loss.backward()

        # Also step on the last micro-batch of the epoch so a trailing partial
        # group is applied instead of being discarded by the next zero_grad().
        if (step+1) % accumulation_steps == 0 or (step+1) == n_batches:
            optimizer.step()
            optimizer.zero_grad()

        batch_loss = loss.item()

        losses.append(batch_loss)

        if (step+1) % log_interval == 0:
            # `losses` spans every epoch, so take the window from the end of
            # the list rather than by the per-epoch step index.
            loss_mean  = sum(losses[-log_interval:]) / log_interval
            logstr = f'step {step+1} of {n_batches} loss {loss_mean:.4f} lr {optimizer.get_lr()[0]}'
            logger1.info(logstr)
            tq.set_postfix(loss=loss_mean)

logger1.info('train complete.')  #0.169 38%  0.165 46%

#### Save trained model
(optional)

In [None]:
# Persist the fine-tuned weights so they can be evaluated in a later session.
FNAME = '/content/saved_models/bert_pytorch.bin'
# Plain Python instead of a bare `mkdir` line: it does not depend on IPython's
# automagic and does not fail when the directory already exists.
os.makedirs(os.path.dirname(FNAME), exist_ok=True)
torch.save(model.state_dict(), FNAME)
# To restore into an already constructed model:
#   model.load_state_dict(torch.load(FNAME))

#### Plot losses

In [None]:
# train 438k eta 0.000004 batch 32 accum 2 loss 0.165 AUC 0.9536
# One dot per training batch; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(losses, 'o', color='b', ms=1)
ax.set(title='Training loss per batch', xlabel='batch', ylabel='loss');

#### Tokenize eval set text samples

In [None]:
# Build the validation tensors from the arrays returned by get_inputs(), so
# the prediction order matches `val_labels`, against which the metrics are
# computed. No cell in this notebook defines `val_df`.
Xv = tokenize(val_texts, tokenizer, MAX_LEN)
yv = val_labels
val_ds = TensorDataset(torch.tensor(Xv, dtype=torch.long), torch.tensor(yv, dtype=torch.float))

#### Load trained model
Skip this step if training and evaluating in the same runtime.

In [None]:
SAVED_MODEL_FNAME = 'bert_pytorch.bin'
SAVED_MODEL_DIR = '/content/saved_models'
saved_model_path = os.path.join(SAVED_MODEL_DIR, SAVED_MODEL_FNAME)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a runtime without one.
model.load_state_dict(torch.load(saved_model_path, map_location=device))
logger1.info(f'Model loaded from {saved_model_path}')

#### Generate eval set predictions

In [None]:
model.to(device)
model.eval()

# shuffle=False keeps predictions in dataset order.
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

val_preds = []

tq = tqdm(enumerate(val_loader), total=len(val_loader), mininterval=10)
# no_grad() disables gradient tracking for inference only; flipping
# requires_grad on the parameters would leave the model frozen for any
# later training in the same session.
with torch.no_grad():
    for i, batch in tq:
        X_batch = batch[0].to(device)
        logits_batch = model(X_batch,
                             attention_mask=(X_batch > 0),
                             labels=None)
        # [:, 0] is already 1-D. A further squeeze() would turn a final batch
        # of size 1 into a 0-d tensor whose tolist() is a bare float, which
        # extend() rejects.
        preds_batch = torch.sigmoid(logits_batch[:, 0])
        val_preds.extend(preds_batch.cpu().tolist())

#### Eval metrics

In [None]:
val_preds = np.array(val_preds)
# NOTE(review): `metrics` is not imported in any visible cell — presumably
# `from sklearn import metrics`; add it to the import cell to run on a fresh kernel.
val_AUC = metrics.roc_auc_score(val_labels, val_preds)
# Threshold the probabilities at 0.5 to get class labels. Casting with
# astype('int') truncates every value below 1.0 to 0, which would report the
# share of negative examples instead of the accuracy.
val_pred_labels = (val_preds >= 0.5).astype('int')
val_acc = metrics.accuracy_score(val_labels.astype('int'), val_pred_labels)
logger1.info(f'Eval set AUC = {val_AUC:.4f}')
logger1.info(f'Eval set accuracy = {val_acc:.4f}')
print(f'AUC {val_AUC:.4f}, acc {val_acc:.4f}')

In [None]:
# Distribution of predicted probabilities on the eval set, labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(val_preds, bins=50)
ax.set(title='Eval set predictions', xlabel='predicted probability', ylabel='count');