# Train BERT-base from scratch

I'm trying to train my own BERT-base model from scratch, and then adjust parts of its architecture.

The code is mostly adapted from: https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

## Download Dataset

In [1]:
import ast
import os
import random
import shutil
import urllib.request
import zipfile
from pathlib import Path

import torch
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import DataLoader
from transformers import BertTokenizer

from bert_base import BERTDataset, BERTLM, BERT, BERTTrainer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## DOWNLOAD DATASET ##
# Fetch the Cornell Movie-Dialogs corpus archive and keep only the two text
# files that the dataset-preparation cell reads. Everything is done with the
# standard library, so any failure raises an exception instead of being
# silently ignored, and the cell does not depend on wget/unzip/mv/rm.

DATASET_DIR = Path('../datasets')
CORPUS_URL = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
CORPUS_FILES = ('movie_conversations.txt', 'movie_lines.txt')

# Guard on the required files rather than on the directory: an interrupted
# earlier run can leave the directory behind without the corpus files in it.
if not all((DATASET_DIR / name).exists() for name in CORPUS_FILES):
    DATASET_DIR.mkdir(parents=True, exist_ok=True)
    archive_path = DATASET_DIR / 'cornell_movie_dialogs_corpus.zip'
    # Folder names created by unpacking the archive.
    extracted_dir = DATASET_DIR / 'cornell movie-dialogs corpus'
    macos_metadata_dir = DATASET_DIR / '__MACOSX'
    try:
        urllib.request.urlretrieve(CORPUS_URL, str(archive_path))
        with zipfile.ZipFile(str(archive_path)) as archive:
            archive.extractall(str(DATASET_DIR))
        for name in CORPUS_FILES:
            # Path.replace overwrites a stale copy left by a partial run.
            (extracted_dir / name).replace(DATASET_DIR / name)
    finally:
        # Clean up temporary artefacts whether or not the steps above succeeded.
        if archive_path.exists():
            archive_path.unlink()
        shutil.rmtree(str(extracted_dir), ignore_errors=True)
        shutil.rmtree(str(macos_metadata_dir), ignore_errors=True)

print('Dataset Prepared!')
  

Dataset Prepared!


In [3]:
## SET HYPERPARAMS ##

MAX_LEN = 64            # max words kept per utterance; also passed as seq_len to BERTDataset
BATCH_SIZE = 32         # batch size for both DataLoaders
LEARNING_RATE = 1e-4    # passed to BERTTrainer as lr
WEIGHT_DECAY = 0.01     # passed to BERTTrainer as weight_decay
BETAS = (0.9, 0.999)    # passed to BERTTrainer as betas
TRAIN_RATIO = 0.9       # fraction of sentence pairs used for training
SEED = 42               # fixed seed so a Restart & Run All is repeatable

## SET SEEDS ##
# Later cells shuffle the pairs and draw a random sample with `random`, and
# the training DataLoader shuffles; seed both libraries once, up front.
random.seed(SEED)
torch.manual_seed(SEED)

## SET DEVICE ##
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', DEVICE)
if DEVICE != 'cuda':
    print('Warning: CPU is not recommended!')

Device: cuda


In [4]:
## PREPARE DATASET ##
# Build [utterance, next utterance] pairs from the two corpus files, then
# split them into train and test subsets.

# Field separator used by both corpus files.
FIELD_SEP = " +++$+++ "

### loading all data into memory
corpus_movie_conv = '../datasets/movie_conversations.txt'
corpus_movie_lines = '../datasets/movie_lines.txt'
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conv = conv_file.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines = lines_file.readlines()

### splitting text using special lines
# Map the first field of every record (the line id) to its last field (the text).
lines_dic = {}
for line in lines:
    objects = line.split(FIELD_SEP)
    lines_dic[objects[0]] = objects[-1]

### generate question answer pairs
pairs = []
for con in conv:
    if not con.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # The last field is a Python-style list literal of line ids. Parse it with
    # literal_eval so downloaded file content is never executed as code.
    ids = ast.literal_eval(con.split(FIELD_SEP)[-1].strip())
    # Pair every utterance with the one that follows it in the conversation.
    for first_id, second_id in zip(ids, ids[1:]):
        first = lines_dic[first_id].strip()
        second = lines_dic[second_id].strip()
        # Cap each side at MAX_LEN whitespace-separated words.
        pairs.append([
            ' '.join(first.split()[:MAX_LEN]),
            ' '.join(second.split()[:MAX_LEN]),
        ])

### check some pairs
print(pairs[20])

### split dataset into train and test
random.shuffle(pairs)
train_size = int(len(pairs) * TRAIN_RATIO)
train_pairs = pairs[:train_size]
test_pairs = pairs[train_size:]


["I really, really, really wanna go, but I can't. Not unless my sister goes.", "I'm workin' on it. But she doesn't seem to be goin' for him."]


In [5]:
## PREPARE TRAINING ##
# Tokenize with the pretrained bert-base-uncased vocabulary and wrap the
# train/test pairs in datasets and batch loaders.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_data = BERTDataset(train_pairs, seq_len=MAX_LEN, tokenizer=tokenizer)
test_data = BERTDataset(test_pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

# Pinned host memory only helps host-to-GPU copies; requesting it on a
# CPU-only machine just produces a warning.
pin_memory = DEVICE == 'cuda'

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=pin_memory)
# Evaluation gains nothing from shuffling; a fixed order keeps test logs comparable.
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin_memory)

# Pull one batch as a smoke test of the loading pipeline.
# NOTE(review): sample_data is not used in the cells visible here; confirm
# whether anything else relies on it.
sample_data = next(iter(train_loader))
print(train_data[random.randrange(len(train_data))])

{'bert_input': tensor([ 101, 1045, 1005, 1049, 2183, 2000, 2681,  103,  103,  102, 2848, 1010,
        2017, 2481, 1005, 1056, 2130, 8081, 2256, 2694,  999,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'bert_label': tensor([   0,    0,    0,    0,    0,    0,    0, 2085, 1012,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'segment_label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
      

In [6]:
## TRAINING ##
# Build the encoder, wrap it in the language-model head, and run the
# train/test loop for a fixed number of epochs.

# The encoder and the LM wrapper are both sized from the tokenizer vocabulary.
vocab_size = len(tokenizer.vocab)

bert_model = BERT(
    vocab_size=vocab_size,
    d_model=768,
    n_layers=2,
    heads=12,
    dropout=0.1,
    device=DEVICE,
)

bert_lm = BERTLM(bert_model, vocab_size, device=DEVICE)
bert_trainer = BERTTrainer(
    bert_lm,
    train_loader,
    test_loader,
    device=DEVICE,
    lr=LEARNING_RATE,
    betas=BETAS,
    weight_decay=WEIGHT_DECAY,
    log_freq=1,
)
epochs = 20

# One training pass followed by one evaluation pass per epoch.
# NOTE(review): the saved output of this cell ends with
# "TypeError: test() missing 1 required positional argument: 'epoch'", yet the
# call below does pass `epoch`. The output may be stale, or BERTTrainer.test
# in bert_base may have an unexpected signature; confirm before a long run.
for epoch in range(epochs):
    bert_trainer.train(epoch)
    bert_trainer.test(epoch)


Total Parameters: 61088828


EP_train:0: 100%|| 6233/6233 [39:49<00:00,  2.61it/s, avg_loss=6.17, avg_acc=50, loss=6.35, acc=56.7]  


EP0, train:             avg_loss=6.167640882262865,             total_acc=49.98997262526698


TypeError: test() missing 1 required positional argument: 'epoch'

In [None]:
## SAVE MODEL ##
# Write the state_dict of `bert_model` (the BERT instance that was passed into
# BERTLM) to bert_base.pth in the current working directory.
# NOTE(review): parameters that exist only on `bert_lm` are presumably not
# part of this file; confirm that is intended.
encoder_weights = bert_model.state_dict()
torch.save(encoder_weights, 'bert_base.pth')