In [1]:
# Make tokens

In [2]:
import sys


In [3]:
for i in ['/opt/homebrew/Cellar/python@3.9/3.9.15/Frameworks/Python.framework/Versions/3.9/lib/python39.zip', 
 '/opt/homebrew/Cellar/python@3.9/3.9.15/Frameworks/Python.framework/Versions/3.9/lib/python3.9', 
 '/opt/homebrew/Cellar/python@3.9/3.9.15/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload', 
 '/Users/christianbromley/.virtualenvs/machine-learning-course/lib/python3.9/site-packages']:
    sys.path.insert(0, i)


In [4]:

import argparse
from pathlib import Path

#sys.path.insert(0,'./.env/lib/python3.8/site-packages')
from tokenizers import ByteLevelBPETokenizer
#from transformers import RobertaTokenizer
import os

from transformers import RobertaTokenizer, RobertaConfig, RobertaForMaskedLM, AdamW
import torch
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# constants
PDF_PATH='/Users/christianbromley/Documents/Personal/PhD/Christian_Bromley_Final_Thesis_20211221_cover.pdf'
OUTPUT_PATH='/Users/christianbromley/Documents/Personal/PhD/Christian_Bromley_Final_Thesis_20211221.csv'
TEXT_PARSE_DIR='/Users/christianbromley/Documents/Personal/PhD/thesis_parsed'
MODELS_DIR='/Users/christianbromley/Documents/Projects/machine-learning-course/bert-transformer/models'

In [6]:
def extract_file_paths(input_dir):
    paths = [str(x) for x in Path(input_dir).glob('**/*.txt')]
    return paths

def run(paths, output_dir):
    # intialise tokeniser
    tokenizer = ByteLevelBPETokenizer()
    # train
    tokenizer.train(files=paths,
                    vocab_size=30_522,
                    min_frequency=2,
                    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

    tokenizer.save_model(output_dir)

In [7]:
paths = extract_file_paths(input_dir=TEXT_PARSE_DIR)

paths

['/Users/christianbromley/Documents/Personal/PhD/thesis_parsed/text_0.txt']

In [8]:
run(paths, MODELS_DIR)






In [9]:
# initialize the tokenizer using the tokenizer we created
tokenizer = RobertaTokenizer.from_pretrained(MODELS_DIR, max_len=512)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [10]:
def masked_language_model(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i,selection] = 4
    return tensor

In [11]:
## input ids - token IDs with a % of tokens masked with the mask token ID which in our case is 4
input_ids = []
## mask - this is a binary tensor of 1 and 0 indicating where the masks are
mask = []
## labels - these are just the unmasked token IDs
labels = []
# initialise the output
tokenized_text = {}

In [12]:
# read file
with open(paths[0], 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [13]:
lines

['13Abstract  Cancer-related inflammation has classically been linked with tumour progression and therapy resistance.',
 'T cell infiltration, a hallmark of so-called ‘hot’ inflamed tumours, is however associated with better prognosis and response to immune checkpoint blockade (ICB).',
 'As such, inflammation can paradoxically promote and restrain tumour growth.',
 'Yet, whether and how cancer-promoting inflammation impacts T cell-mediated tumour immunity is unclear.',
 'Understanding the interplay of different inflammatory phenotypes is therefore a priority for the field.',
 'The adoption of next generation sequencing technologies has fuelled a rapid increase in the amount of publicly available data from cancer and healthy tissue samples as well as pre-clinical models.',
 'So far, pan-cancer meta-analyses using immunogenomic and transcriptomic data have focussed primarily on elucidating the characteristics of T cell-inflamed tumours, whilst systematic analyses of pro-tumourigenic infl

In [14]:
# get the file name
fname = paths[0].split('/')[-1]
fname

'text_0.txt'

In [15]:
# tokenize the text in these lines
sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')

In [16]:
sample.keys()

dict_keys(['input_ids', 'attention_mask'])

In [17]:
tokenized_text[fname] = sample

In [18]:
# the sample object contains some of our tensors - extract these
## get the input IDs and append to labels
labels.append(sample.input_ids)
## get the attention mask - the binary
mask.append(sample.attention_mask)


In [19]:
sample.input_ids.detach().clone()

tensor([[   0, 1202,   37,  ...,    1,    1,    1],
        [   0,   56,  309,  ...,    1,    1,    1],
        [   0,  892,  568,  ...,    1,    1,    1],
        ...,
        [   0, 1543,  261,  ...,    1,    1,    1],
        [   0, 3796,   93,  ...,    1,    1,    1],
        [   0,  564, 2787,  ...,    1,    1,    1]])

In [20]:
## now apply the masked language model function on the input IDs to mask 15% of tokens
mlm_on_input = masked_language_model(sample.input_ids.detach().clone())
input_ids.append(mlm_on_input)

In [21]:
# construct the output
encodings = {
    'input_ids': input_ids[0],
    'mask': mask[0],
    'labels': labels[0]
}

In [22]:
len(encodings['labels'])

1868

In [23]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {
            'input_ids': self.encodings['input_ids'][i]
        }

In [24]:
def create_data_loader(encodings):
    dataset = Dataset(encodings)
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
    return data_loader

In [25]:
loader = create_data_loader(encodings)

In [26]:
dir(loader)

['_DataLoader__initialized',
 '_DataLoader__multiprocessing_context',
 '_IterableDataset_len_called',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_auto_collation',
 '_dataset_kind',
 '_get_iterator',
 '_index_sampler',
 '_is_protocol',
 '_iterator',
 'batch_sampler',
 'batch_size',
 'check_worker_number_rationality',
 'collate_fn',
 'dataset',
 'drop_last',
 'generator',
 'multiprocessing_context',
 'num_workers',
 'persistent_workers',
 'pin_memory',
 'pin_memory_device',
 'prefetch_factor',
 'sampler',
 'timeout',
 'worker_init_fn']

In [27]:
loader.dataset.encodings

{'input_ids': tensor([[   0, 1202,   37,  ...,    1,    1,    1],
         [   0,   56,  309,  ...,    1,    1,    1],
         [   0,  892,  568,  ...,    1,    1,    1],
         ...,
         [   0, 1543,  261,  ...,    1,    1,    1],
         [   0, 3796,   93,  ...,    1,    1,    1],
         [   0,  564, 2787,  ...,    1,    1,    1]]),
 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[   0, 1202,   37,  ...,    1,    1,    1],
         [   0,   56,  309,  ...,    1,    1,    1],
         [   0,  892,  568,  ...,    1,    1,    1],
         ...,
         [   0, 1543,  261,  ...,    1,    1,    1],
         [   0, 3796,   93,  ...,    1,    1,    1],
         [   0,  564, 2787,  ...,    1,    1,    1]])}

In [28]:
# build RoBERTa config
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)


# init model
model = RobertaForMaskedLM(config)
# set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)



In [29]:
# set number of epochs
epochs = 1

In [None]:
for epoch in range(epochs):
    input_ids = loader.dataset.encodings['input_ids'].to(device)
    attention_mask = loader.dataset.encodings['mask'].to(device)
    labels = loader.dataset.encodings['labels'].to(device)
    print('go')
    # model
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    labels=labels)
    # extract the loss
    loss = outputs.loss
    # calculate loss for every parameter that needs grad update
    loss.backward()
    # update parameters
    optim.step()
    # print relevant info to progress bar
    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())
        

go


In [61]:
#loop = tqdm(loader, leave=True)
# loop through epochs
for epoch in range(epochs):
    print(f'Epoch {str(epoch)}')
    for batch, feed_dict in tqdm.tqdm(enumerate(loader)):
        print(i_batch)
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        #print(batch.keys())
        input_ids = batch['input_ids'][0].to(device)
        attention_mask = batch['mask'][0].to(device)
        labels = batch['labels'][0].to(device)
        # model
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        # extract the loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        
model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


0it [00:00, ?it/s]

116





AttributeError: 'int' object has no attribute 'keys'

In [60]:
# loop through epochs
for epoch in range(epochs):
    print(f'Epoch {str(epoch)}')
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    print(loop)
    for batch in loop:
        print(batch.keys())
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        print(batch.keys())
        input_ids = batch['input_ids'][0].to(device)
        attention_mask = batch['mask'][0].to(device)
        labels = batch['labels'][0].to(device)
        # model
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        # extract the loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        
model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


TypeError: 'module' object is not callable

In [47]:
def train_model(config, loader):
    # init model
    model = RobertaForMaskedLM(config)
    # set device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # and move our model over to the selected device
    model.to(device)
    # activate training mode
    model.train()
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=1e-4)
    # set number of epochs
    epochs = 2
    # loop through epochs
    for epoch in range(epochs):
        print(f'Epoch {str(epoch)}')
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            print(batch.keys())
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            print(batch.keys())
            input_ids = batch['input_ids'][0].to(device)
            attention_mask = batch['mask'][0].to(device)
            labels = batch['labels'][0].to(device)
            # model
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # extract the loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())

In [48]:

train_model(config, loader)

model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


  0%|                                                                                                                                                                                                           | 0/117 [00:00<?, ?it/s]

dict_keys(['input_ids'])
dict_keys(['input_ids'])





KeyError: 'mask'