In [1]:
import huggingface_hub, wandb, os

# Authenticate against the Hugging Face Hub and Weights & Biases.
#
# The access token is read from the HF_TOKEN environment variable rather than
# being embedded in the notebook, so it never ends up in version control or in
# a shared copy of this file. When HF_TOKEN is unset, `token` is None and
# `huggingface_hub.login` falls back to its interactive prompt.
# NOTE(review): a token that was previously committed in this cell should be
# revoked on the Hub and replaced with a fresh one.
token = os.environ.get("HF_TOKEN")
huggingface_hub.login(token=token, add_to_git_credential=True)

# Name under which wandb records this notebook.
os.environ["WANDB_NOTEBOOK_NAME"] = "train_KCroberta.py"
wandb.login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/sgolkar/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mgolkar[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Load the tokenizer from its JSON file and the pre-tokenized dataset
# (with numbers extracted) from disk.
from transformers import PreTrainedTokenizerFast
from datasets import DatasetDict

# "[END]" serves as both the beginning- and end-of-sequence token, "?" is the
# mask token and "[PAD]" the padding token.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="toKCenizer_lorenz.json",
    bos_token="[END]",
    eos_token="[END]",
    mask_token="?",
    pad_token="[PAD]",
)

# Number of entries in the tokenizer vocabulary; used below to size the model.
vocab_size = len(tokenizer.vocab)

# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
ds_path = "/mnt/home/sgolkar/ceph/datasets/microcosm/lorenz_world_xsmall/clean/"
tokenized_ds = DatasetDict.load_from_disk(ds_path + "toKCenized_xslorenz_ds")

In [3]:
# Build the number-aware masked-language-modelling collator from the local
# KCroberta module.

from KCroberta import KC_mlm_collator

# mlm_probability=0.2 — presumably the fraction of tokens selected for
# masking in each batch; confirm against KC_mlm_collator.
KC_coll = KC_mlm_collator(tokenizer=tokenizer, mlm_probability=0.2)

In [4]:
# defining the roberta derived model

from transformers import RobertaConfig
from KCroberta import KCRobertaForMaskedLM

# Width of the transformer; the feed-forward width is derived from it below.
hidden_size = 720

# RoBERTa configuration sized to the custom tokenizer's vocabulary.
config = RobertaConfig(
    vocab_size=vocab_size,
    type_vocab_size=2,
    max_position_embeddings=1150,
    hidden_size=hidden_size,
    intermediate_size=hidden_size * 4,
    num_hidden_layers=12,
    num_attention_heads=6,
)

# RoBERTa-derived masked-LM model from the local KCroberta module.
model = KCRobertaForMaskedLM(power_num=1 / 3, config=config)

# Report the hidden size next to the comma-grouped parameter count.
print(f"{hidden_size} {model.num_parameters():,}")

720 76,654,108


In [5]:
# Draw a fixed-size train/test subsample from the "train" split.

train_size, test_size = 800_000, 5000

# seed=42 keeps the sampled indices the same from run to run.
downsampled_dataset = tokenized_ds["train"].train_test_split(
    test_size=test_size,
    train_size=train_size,
    seed=42,
)

Loading cached split indices for dataset at /mnt/home/sgolkar/ceph/datasets/microcosm/lorenz_world_xsmall/clean/toKCenized_xslorenz_ds/train/cache-c9d887003ea829c7.arrow and /mnt/home/sgolkar/ceph/datasets/microcosm/lorenz_world_xsmall/clean/toKCenized_xslorenz_ds/train/cache-0599c8ac82cfb824.arrow


In [6]:
# defining the trainer

from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    # Output location; existing contents of the directory may be overwritten.
    output_dir="./KCroberta_xslorenz",
    overwrite_output_dir=True,
    # Optimisation settings.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=1e-4,
    warmup_steps=2000,
    # Logging, evaluation and checkpointing cadence (all counted in steps);
    # at most two checkpoints are kept.
    logging_steps=200,
    evaluation_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    save_total_limit=2,
    # Model selection: reload the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


def compute_metrics(eval_preds):
    """Average the two per-component losses found in the predictions.

    Args:
        eval_preds: Indexable object whose element 0 is the predictions
            container; its elements 0 and 1 must support ``.mean()`` and are
            reported as the masked-LM loss and the number loss respectively.

    Returns:
        dict with keys ``"loss_mlm"`` and ``"loss_numbers"`` holding the means.
    """
    predictions = eval_preds[0]
    mlm_losses, number_losses = predictions[0], predictions[1]
    metrics = {}
    metrics["loss_mlm"] = mlm_losses.mean()
    metrics["loss_numbers"] = number_losses.mean()
    return metrics


# Wire model, data, collator and metrics into a Hugging Face Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=KC_coll,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    compute_metrics=compute_metrics,
)

In [7]:
# Run the training loop using the settings in training_args.
trainer.train()



You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
# Rebuild the collator and collate training examples 3 and 4 into one batch.
KC_coll = KC_mlm_collator(tokenizer=tokenizer, mlm_probability=0.2)
masked_sample = KC_coll([tokenized_ds["train"][idx] for idx in (3, 4)])
# Move every entry of the batch onto the GPU; keys are unchanged, only the
# values are replaced.
for key, val in masked_sample.items():
    masked_sample[key] = val.cuda()

(tensor(1.6320, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(1.1180, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.5140, device='cuda:0', grad_fn=<MseLossBackward0>))

In [None]:
# Forward pass on the collated batch with the model switched to training mode.
model.train()
out = model(**masked_sample)
# Display the combined loss together with its masked-LM and number components.
out.loss, out.loss_mlm, out.loss_numbers

(tensor(1.6390, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(1.1216, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.5173, device='cuda:0', grad_fn=<MseLossBackward0>))

In [None]:
# Same forward pass with the model switched to evaluation mode, for comparison
# with the training-mode losses.
# NOTE(review): no torch.no_grad() here, so autograd state is still recorded.
model.eval()
out = model(**masked_sample)
# Display the combined loss together with its masked-LM and number components.
out.loss, out.loss_mlm, out.loss_numbers

(tensor(1.6100, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(1.1152, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.4948, device='cuda:0', grad_fn=<MseLossBackward0>))

In [None]:
import copy, torch


# Probe the model on one hand-masked example: mask a single position of a
# validation sample, run the model, and compare the predicted token / number
# at that position with the original values.

# Token ids of the '#' and '?' symbols. '?' is the tokenizer's mask token.
# num_token is not read in this cell; it is kept in case other cells use it.
num_token = tokenizer.encode('#')[0]
mask_token = tokenizer.encode('?')[0]

# Sequence position that is masked and inspected. Defined once here so the
# probe can be pointed at another position by editing a single line.
mask_pos = 10

sample = tokenized_ds["val"][1]

# Work on a deep copy so `sample` keeps the original, unmasked values.
masked_sample = copy.deepcopy(sample)
masked_sample['input_ids'][mask_pos] = mask_token
len_ = len(masked_sample['input_ids'])
# Number channel truncated to the sequence length, with the value at the
# masked position overwritten by 1.0.
masked_sample['masked_numbers'] = copy.deepcopy(sample['numbers'])[:len_]
masked_sample['masked_numbers'][mask_pos] = 1.0
# Original number at the masked position (not read below; kept for inspection).
ans = masked_sample['numbers'][mask_pos]
masked_sample['numbers'] = masked_sample['masked_numbers'][:len_]
masked_sample['labels'] = sample['input_ids']

# Turn every field into a (1, seq_len) tensor on the GPU.
for key, val in masked_sample.items():
    masked_sample[key] = torch.tensor(val).reshape(1, -1).cuda()
# Set every label to -100 except the masked position, which keeps its original
# token id. (-100 is presumably the ignore index of the loss — confirm in
# KCroberta.)
masked_sample['labels'] = torch.full_like(masked_sample['labels'], -100)
masked_sample['labels'][0, mask_pos] = sample['input_ids'][mask_pos]

# Inference only: skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    out = model(**masked_sample)

print('predicted token: ', tokenizer.decode([out.logits[0, mask_pos].argmax().item()]))
print('masked token:', tokenizer.decode(masked_sample['input_ids'][0, mask_pos]))
print('actual token:', tokenizer.decode(sample['input_ids'][mask_pos]))
print()
print('predicted number: {:.2f}'.format(out.numbers[0, mask_pos].item()))
print('masked number: {:.2f}'.format(masked_sample['masked_numbers'][0, mask_pos].item()))
print('actual number: {:.2f}'.format(sample['numbers'][mask_pos]))

predicted token:  #
masked token: ?
actual token: #

predicted number: 0.64
masked number: 1.00
actual number: 12.13


In [None]:
# Save the whole Hugging Face model object (not just its weights) with
# torch.save.
# NOTE(review): a whole-object save presumably needs the KCroberta class
# definitions to be importable when the file is loaded back — confirm before
# relying on this file outside this environment.

torch.save(model, 'model_torch_save.pkl')

In [None]:
import pickle

# Write the complete model object to disk with the standard pickle module.
with open("model_pickle_save.pkl", mode="wb") as pickle_file:
    pickle.dump(model, pickle_file)