In [1]:
import transformers
import os
import numpy as np
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
dreams = pd.read_csv('C:\\Users\\bookw\\Dropbox\\Capstone\\DreamBERT\\00_data\\dream_text_corpus')
dream = dreams['dreams']

## First attempt from:
https://medium.com/analytics-vidhya/create-a-tokenizer-and-train-a-huggingface-roberta-model-from-scratch-f3ed1138180c

In [15]:
# Store values in a dataframe column (Series object) to files, one file per record
def column_to_files(column, prefix, txt_files_dir):
    # The prefix is a unique ID to avoid to overwrite a text file
    i=prefix
    #For every value in the df, with just one column
    for row in column.to_list():
      # Create the filename using the prefix ID
      file_name = os.path.join(txt_files_dir, str(i)+'.txt')
      try:
        # Create the file and write the column text to it
        f = open(file_name, 'wb')
        f.write(row.encode('utf-8'))
        f.close()
      except Exception as e:  #catch exceptions(for eg. empty rows)
        print(row, e) 
      i+=1
    # Return the last ID
    return i

In [17]:
path = '/Users/bookw/Dropbox/Capstone/data_files'
# Get the training data
data = dreams["dreams"]
# Removing the end of line character \n
data = data.replace("\n"," ")
# Set the ID to 0
prefix=0
print(len(data))
# Create a file for every description value
prefix = column_to_files(data, prefix, path)
# Print the last ID
print(prefix)

26401
26401


In [13]:
%%time 
paths = [str(x) for x in Path(".").glob("text_split/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Customize training
tokenizer.train(files=paths, vocab_size=8192, min_frequency=2,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])
#Save the Tokenizer to disk
tokenizer.save_model(path)

Wall time: 4.99 ms


['/Users/bookw/Dropbox/Capstone\\vocab.json',
 '/Users/bookw/Dropbox/Capstone\\merges.txt']

In [14]:
# Create the tokenizer using vocab.json and mrege.txt files
tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join(path,'vocab.json')),
    os.path.abspath(os.path.join(path,'merges.txt'))
)
# Prepare the tokenizer
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
# Test the tokenizer
tokenizer.encode("knit midi dress with vneckline straps.")
# Show the tokens created
tokenizer.encode("knit midi dress with vneckline straps.").tokens

['<s>',
 'k',
 'n',
 'i',
 't',
 'Ġ',
 'm',
 'i',
 'd',
 'i',
 'Ġ',
 'd',
 'r',
 'e',
 's',
 's',
 'Ġ',
 'w',
 'i',
 't',
 'h',
 'Ġ',
 'v',
 'n',
 'e',
 'c',
 'k',
 'l',
 'i',
 'n',
 'e',
 'Ġ',
 's',
 't',
 'r',
 'a',
 'p',
 's',
 '.',
 '</s>']

In [16]:
# Set a configuration for our RoBERTa model
config = RobertaConfig(
    vocab_size=8192,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize the model from a configuration without pretrained weights
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  49816064


In [34]:
# Create the tokenizer from a trained one
tokenizer = RobertaTokenizerFast.from_pretrained(path, max_length=128)

In [26]:
# Define the Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [32]:
train_dataset = dreams['dreams'][0:25000]
eval_dataset = dreams['dreams'][25000:]

In [33]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir=path,
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',
    num_train_epochs=100,
    learning_rate=.0001,
    weight_decay=.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    save_steps=8192,
    #eval_steps=4096,
    save_total_limit=1,
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #prediction_loss_only=True,
)
# Train the model
trainer.train()

AttributeError: 'str' object has no attribute 'size'

In [None]:
from transformers import pipeline
# Create a Fill mask pipeline
fill_mask = pipeline(
    "fill-mask",
    model=path,
    tokenizer=path
)
# Test some examples
# knit midi dress with vneckline
# =>
fill_mask("midi <mask> with vneckline.")
# The test text: Round neck sweater with long sleeves
fill_mask("Round neck sweater with <mask> sleeves.")

## Second attempt from:
https://huggingface.co/blog/how-to-train

In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [18]:
paths = [str(x) for x in Path("./data_files/").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk
tokenizer.save_model(".", "DREAMbert")

['.\\DREAMbert-vocab.json', '.\\DREAMbert-merges.txt']

In [19]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [21]:
tokenizer = ByteLevelBPETokenizer(
    "DREAMbert-vocab.json",
    "DREAMbert-merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode("I had a dream.")
)

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [22]:
from torch.utils.data import Dataset

In [24]:
class DREAMbertDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "DREAMbert-vocab.json",
            "DREAMbert-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data_files/").glob("*-eval.txt") if evaluate else Path("./data_files/").glob("*-train.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [27]:
--output_dir './models/DREAMbert'
--model_type bert
--mlm
--do_train
--do_eval
--learning_rate 1e-4
--num_train_epochs 5
--save_total_limit 2
--save_steps 2000
--per_gpu_train_batch_size 16
--evaluate_during_training
--seed 42

SyntaxError: invalid syntax (<ipython-input-27-2a154e99a426>, line 1)

In [25]:
from transformers import pipeline



In [26]:
fill_mask = pipeline(
    "fill-mask",
    model="./models/DREAMbert",
    tokenizer="./models/DREAMbert"
)

result = fill_mask("I had a <mask>.")

404 Client Error: Not Found for url: https://huggingface.co/models/DREAMbert/resolve/main/config.json


OSError: Can't load config for './models/DREAMbert'. Make sure that:

- './models/DREAMbert' is a correct model identifier listed on 'https://huggingface.co/models'

- or './models/DREAMbert' is the correct path to a directory containing a config.json file



## Third attempt?
https://huggingface.co/docs/tokenizers/python/latest/quicktour.html#build-a-tokenizer-from-scratch

In [120]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


tokenizer = ByteLevelBPETokenizer()
#trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()

In [43]:
filess = []

for root, dirs, files in os.walk('C:\\Users\\bookw\\Dropbox\\Capstone\\DreamBERT\\20_preliminary_modeling\\data_files'):
    for file in files:
        filess.append(os.path.join(root, file))

In [121]:
tokenizer.train(filess, vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

In [95]:
os.mkdir('./dreambert')
tokenizer.save_model('dreambert')

['dreambert\\vocab.json', 'dreambert\\merges.txt']

In [122]:
from transformers import BertModel, BertConfig, BertTokenizer, BertForMaskedLM, AutoTokenizer
tokenizer = BertTokenizer.from_pretrained('dreambert', max_len=512)
#tokenizer = ByteLevelBPETokenizer(
#    "dreambert/vocab.json",
#    "dreambert/merges.txt",
#)

OSError: Can't load tokenizer for 'dreambert'. Make sure that:

- 'dreambert' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'dreambert' is the correct path to a directory containing relevant tokenizer files



In [89]:
with open('data_files/0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [123]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)
len(batch)

TypeError: 'ByteLevelBPETokenizer' object is not callable

In [None]:
import torch

labels = torch.tensor([x.ids for x in batch])
mask = torch.tensor([x.attention_mask for x in batch])

In [None]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3
    
    

input_ids.shape

In [None]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [None]:
dataset = Dataset(encodings)

In [None]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [84]:
config = BertConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)
model = BertForMaskedLM(config)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

In [77]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
from tqdm.auto import tqdm

epochs = 10

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [87]:
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

result = fill_mask("I had a <mask>.")

TypeError: 'tokenizers.Tokenizer' object is not callable

## Four

In [None]:
DREAMbert_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents

DREAMbert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

from tokenizers.pre_tokenizers import Whitespace

DREAMbert_tokenizer.pre_tokenizer = Whitespace()


from tokenizers.processors import TemplateProcessing

DREAMbert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)



from tokenizers.trainers import WordPieceTrainer

trainer = BpeTrainer(
    vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

DREAMbert_tokenizer.train(filess, trainer)

In [109]:
DREAMbert_tokenizer.save("dreambert.json")

In [112]:
DREAMbert_tokenizer=Tokenizer.from_file("dreambert.json")

## Five: From Pretrained
https://huggingface.co/transformers/training.html

In [3]:
from transformers import AutoTokenizer, BertModel, BertConfig, BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
#inputs = tokenizer(filess, padding="max_length", truncation=True)
small_train_dataset = dreams['dreams'][0:100]
small_eval_dataset = dreams['dreams'][100:200]
train_dataset = dreams['dreams'][0:25000]
eval_dataset = dreams['dreams'][25000:]

In [5]:
def tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True)

tok_small_train_dataset = []
for i in small_train_dataset:
    tok_small_train_dataset.append(tokenize_function(i))
    
tok_small_eval_dataset = []
for i in small_eval_dataset:
    tok_small_eval_dataset.append(tokenize_function(i))
    
tok_train_dataset = []
for i in train_dataset:
    tok_train_dataset.append(tokenize_function(i))
    
tok_eval_dataset = []
for i in eval_dataset:
    tok_eval_dataset.append(tokenize_function(i))

In [6]:
model = BertForMaskedLM.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=tok_small_train_dataset, eval_dataset=tok_small_eval_dataset
)

In [9]:
#I get an error that just says "loss" (do I need to specify a loss function since there are no labels?) and then when I run it again I get a "CUDA out of memory" error
#this happens here, on Google colab, and in ACCRE
#am I actually utilizing my GPUs?
trainer.train()

RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 6.00 GiB total capacity; 4.66 GiB already allocated; 0 bytes free; 4.70 GiB reserved in total by PyTorch)