In [1]:
from transformers import (
    RobertaForMaskedLM,
    RobertaConfig,
    PreTrainedTokenizerFast,
    DataCollatorForLanguageModeling,
)
from datasets import DatasetDict
# Loading the tokenizer

# Special tokens: "[END]" is used as both the beginning- and end-of-sequence
# marker, "?" is the mask token and "[PAD]" the padding token.
special_tokens = dict(
    pad_token="[PAD]",
    mask_token="?",
    bos_token="[END]",
    eos_token="[END]",
)
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_lorenz.json",
    **special_tokens,
)

# Size of the tokenizer vocabulary; reused below to size the model.
vocab_size = len(wrapped_tokenizer.vocab)

# Loading the saved tokenized dataset
path = "/mnt/home/sgolkar/ceph/datasets/microcosm/lorenz_world_xsmall/clean/"
tokenized_ds = DatasetDict.load_from_disk(f"{path}tokenized_ds")

In [2]:
# Collator that batches, pads and randomly masks examples for masked-language
# modelling (mlm_probability=0.2).
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.2,
    tokenizer=wrapped_tokenizer,
)

# Demonstrate the collator on the first training example: collate it, then
# decode the masked input ids back to text.
samples = [tokenized_ds["train"][0]]
collated = data_collator(samples)

for masked_ids in collated["input_ids"]:
    print(wrapped_tokenizer.decode(masked_ids))

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'description':{'num_sys':1,'sys0':{'params':[5?42,3.18,31.?],'init_point':[-7.555,0.4717,?.?],?700,'name':'lorenz','step_multip':4},'normalization':[[-9.12?,-14.95,14.?],[2.568,8.35,?.293]],'embedding':[[[0?964,-0.12445,0.2352]?[-0.?66,-0.9463,0.2644],[-0.1897??.29?,0.93?]],[1.0,1.0,1.0],?[-0.07?,0.01968,0.002796,?.549,-0.1758,0.?56,0?1421,0.1567],[0.0069,-0.181470.02716,-0.31?,-0?92?,0.?1493?0.0315,0?04797],[-0.2668,0.286,?0.?55?0.06305,?0?1399,0.?75,-0.3635,-?.7393]]?'sys3':?'data':[[-0.003998,0.34?,-0.001401,0.863,1.877?0.2966,0.4453,0.07??],[-0.003645,0.3186,?.0122,0?895,1.777,0.3828,0.075,?.11633],[-?.?3067,0.2966,0.02?,?.91?,1.68,0.?38,0.10?,0?1556]?[-?.006146,0.?5,0.03185?0.935,1?01???.52?,0.1252,0?1913],[-0.000?5?0?2537,?.04?8?0.944,1.49,?.574,0.1459,0.22?],75??001269,0.232,0.??5??.9453?1.?6,0.6177,0.1646,0?2512?,[0.00345,0.2112,0.06?6,0??,1.30?,0.6514,0.18],0.276173,[0.006035,0.1902,0.0748,0.928,?.21?,0??63,0.1941,0.298],[0.00894?0?1693,0.08276,?.91?1.125,0.?3,0.2057,0.317],[

In [3]:
# Architecture: 6 layers, 6 attention heads, vocabulary sized to the tokenizer.
# NOTE(review): pad/bos/eos token ids are not passed here, so RobertaConfig's
# own defaults apply — confirm they agree with the ids of `wrapped_tokenizer`.
config = RobertaConfig(
    vocab_size=vocab_size,
    type_vocab_size=1,
    max_position_embeddings=3000,
    num_hidden_layers=6,
    num_attention_heads=6,
)

# Build the masked-LM model from the config (no pretrained weights are loaded).
model = RobertaForMaskedLM(config=config)

# Print the parameter count with thousands separators.
n_params = model.num_parameters()
print(f"{n_params:,}")

45,534,862


In [4]:
from transformers import Trainer, TrainingArguments

# Trainer over the full train/val splits with a per-device batch size of 12.
# NOTE: a later cell rebinds `training_args` and `trainer` (batch size 8,
# down-sampled splits), which replaces the objects created here.
training_args = TrainingArguments(
    # where checkpoints are written (existing contents may be overwritten)
    output_dir="./roberta_lorenz_xsmall",
    overwrite_output_dir=True,
    # checkpointing
    save_steps=10_000,
    save_total_limit=2,
    # schedule
    num_train_epochs=1,
    per_device_train_batch_size=12,
    # evaluation reports the loss only
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=tokenized_ds["val"],
    train_dataset=tokenized_ds["train"],
)

In [6]:
# Carve a small, fixed-seed subset out of the training split: 10,000 examples
# for training and a tenth of that for evaluation.
train_size = 10_000
test_size = int(0.1 * train_size)

train_split = tokenized_ds["train"]
downsampled_dataset = train_split.train_test_split(
    seed=42,
    test_size=test_size,
    train_size=train_size,
)

In [12]:
from transformers import Trainer, TrainingArguments

# Trainer for the down-sampled run: one epoch, per-device batch size of 8.
training_args = TrainingArguments(
    # where checkpoints are written (existing contents may be overwritten)
    output_dir="./roberta_lorenz_xsmall",
    overwrite_output_dir=True,
    # checkpointing
    save_steps=10_000,
    save_total_limit=2,
    # schedule
    num_train_epochs=1,
    per_device_train_batch_size=8,
    # evaluation reports the loss only
    prediction_loss_only=True,
)

# Train on the subset's "train" part and evaluate on its "test" part.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=downsampled_dataset["test"],
    train_dataset=downsampled_dataset["train"],
)

In [13]:
import math

# Evaluate on the trainer's eval dataset ahead of the `trainer.train()` cell
# that follows, and report perplexity as exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

  0%|          | 0/125 [00:00<?, ?it/s]

>>> Perplexity: 28.46


In [17]:
# Run the training loop configured on `trainer`; the returned TrainOutput is
# displayed as the cell's result.
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 3.1438, 'learning_rate': 3e-05, 'epoch': 0.4}
{'loss': 3.0864, 'learning_rate': 1e-05, 'epoch': 0.8}
{'train_runtime': 1002.164, 'train_samples_per_second': 9.978, 'train_steps_per_second': 1.247, 'train_loss': 3.107066796875, 'epoch': 1.0}


TrainOutput(global_step=1250, training_loss=3.107066796875, metrics={'train_runtime': 1002.164, 'train_samples_per_second': 9.978, 'train_steps_per_second': 1.247, 'train_loss': 3.107066796875, 'epoch': 1.0})

In [18]:
# Re-evaluate after the `trainer.train()` cell above and report perplexity as
# exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

  0%|          | 0/125 [00:00<?, ?it/s]

>>> Perplexity: 21.64


In [19]:
# SECURITY: never record an access token in the notebook, not even in a
# comment. The token that was previously written here must be treated as
# compromised and revoked at https://huggingface.co/settings/tokens.
# `interpreter_login()` asks for the token at run time, so nothing needs to be
# stored in the file.

from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credent

In [20]:
# Upload the trainer's model files to its Hugging Face Hub repository; the
# call's return value is displayed as the cell's result.
# NOTE(review): presumably relies on the Hub login from the preceding cell — confirm.
trainer.push_to_hub()

Cloning https://huggingface.co/sgolkar/roberta_lorenz_xsmall into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/174M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

To https://huggingface.co/sgolkar/roberta_lorenz_xsmall
   80a457d..22c55b5  main -> main

To https://huggingface.co/sgolkar/roberta_lorenz_xsmall
   22c55b5..cde419b  main -> main



'https://huggingface.co/sgolkar/roberta_lorenz_xsmall/commit/22c55b5a0ebfd8b51fe2bca87505e069d2a08375'

In [49]:
from transformers import pipeline

# Fill-mask pipeline around the trained model.
# NOTE: `model.cpu()` moves the module in place, so the same `model` object
# held by `trainer` lives on the CPU after this cell runs.
mask_filler = pipeline("fill-mask", model=model.cpu(), tokenizer=wrapped_tokenizer)

# Decode the first validation example back to text.
text = wrapped_tokenizer.decode(tokenized_ds['val'][0]['input_ids'])

# Mask the leading "11" of the first "11.41" in the text.
# `str.replace(..., 1)` touches only the first occurrence and keeps the rest
# of the text intact. The earlier split()[0] + ... + split()[1] form silently
# dropped everything after a second occurrence and raised a bare IndexError
# when the value was missing.
target = '11.41'
replacement = '?.41'
if target not in text:
    raise ValueError(f"{target!r} not found in the decoded validation example")
text_masked = text.replace(target, replacement, 1)

preds = mask_filler(text_masked)

# Show each candidate's score followed by the filled-in sequence.
for pred in preds:
    print(pred['score'])
    print(f">>> {pred['sequence']}")

0.45499104261398315
>>> {'description':{'num_sys':1,'sys0':{'params':[[.41,4.098,28.53],'init_point':[0.832,-1.072,-0.2062],'step_size':700,'name':'lorenz','step_multip':8},'normalization':[[-1.067,-2.506,0.1667],[1.635,3.184,0.6104]],'embedding':[[[0.1676,-0.947,0.2747],[-0.02942,0.2737,0.9614],[0.9854,0.1692,-0.018]],[1.0,1.0,1.0],[[0.2646,-0.1175,-0.3567,-0.323,0.06305,0.03506,0.445,0.5986,-0.3508],[-0.4243,-0.187,-0.745,0.1343,-0.3396,-0.2294,-0.1536,-0.1101,-0.0902],[-0.3552,-0.394,0.4697,-0.4148,-0.1289,-0.412,-0.1455,0.0866,-0.3325]]]},'data':[[0.0758,-0.0493,1.313,-0.3257,0.2418,-0.08154,-0.1326,-0.06696,-0.008675],[-0.007286,-0.09,1.229,-0.3147,0.1847,-0.1327,-0.1709,-0.09314,-0.02744],[-0.068,-0.1207,1.16,-0.3074,0.142,-0.1703,-0.1971,-0.10986,-0.04288],[-0.11237,-0.144,1.107,-0.3027,0.1101,-0.1982,-0.2148,-0.1198,-0.05585],[-0.144,-0.1613,1.063,-0.2993,0.08636,-0.2181,-0.2261,-0.1251,-0.06635],[-0.1675,-0.1744,1.027,-0.297,0.06824,-0.233,-0.2332,-0.1276,-0.075],[-0.1836,-0.1

In [51]:
# Decode the first validation example back to text.
text = wrapped_tokenizer.decode(tokenized_ds['val'][0]['input_ids'])

# Mask the value of the first "'num_sys':1" entry.
# `str.replace(..., 1)` touches only the first occurrence and keeps the rest
# of the text intact. The earlier split()[0] + ... + split()[1] form silently
# dropped everything after a second occurrence and raised a bare IndexError
# when the key was missing.
target = "'num_sys':1"
replacement = "'num_sys':?"
if target not in text:
    raise ValueError(f"{target!r} not found in the decoded validation example")
text_masked = text.replace(target, replacement, 1)

preds = mask_filler(text_masked)

# Show each candidate's score followed by the filled-in sequence.
for pred in preds:
    print(pred['score'])
    print(f">>> {pred['sequence']}")

0.3684244751930237
>>> {'description':{'num_sys':1,'sys0':{'params':[11.41,4.098,28.53],'init_point':[0.832,-1.072,-0.2062],'step_size':700,'name':'lorenz','step_multip':8},'normalization':[[-1.067,-2.506,0.1667],[1.635,3.184,0.6104]],'embedding':[[[0.1676,-0.947,0.2747],[-0.02942,0.2737,0.9614],[0.9854,0.1692,-0.018]],[1.0,1.0,1.0],[[0.2646,-0.1175,-0.3567,-0.323,0.06305,0.03506,0.445,0.5986,-0.3508],[-0.4243,-0.187,-0.745,0.1343,-0.3396,-0.2294,-0.1536,-0.1101,-0.0902],[-0.3552,-0.394,0.4697,-0.4148,-0.1289,-0.412,-0.1455,0.0866,-0.3325]]]},'data':[[0.0758,-0.0493,1.313,-0.3257,0.2418,-0.08154,-0.1326,-0.06696,-0.008675],[-0.007286,-0.09,1.229,-0.3147,0.1847,-0.1327,-0.1709,-0.09314,-0.02744],[-0.068,-0.1207,1.16,-0.3074,0.142,-0.1703,-0.1971,-0.10986,-0.04288],[-0.11237,-0.144,1.107,-0.3027,0.1101,-0.1982,-0.2148,-0.1198,-0.05585],[-0.144,-0.1613,1.063,-0.2993,0.08636,-0.2181,-0.2261,-0.1251,-0.06635],[-0.1675,-0.1744,1.027,-0.297,0.06824,-0.233,-0.2332,-0.1276,-0.075],[-0.1836,-0.1

In [56]:
# `torch` is not imported by any earlier cell shown in this notebook, so import
# it here; otherwise this cell fails with a NameError on a fresh kernel.
import torch

# CUDA version string reported by the installed PyTorch build.
torch.version.cuda

'11.7'