In [1]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-96142f87-45d0-6f7f-daed-4e024801cf48)


In [2]:
%%capture install_log 

!pip install transformers datasets

In [3]:
from transformers import (
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    RobertaConfig,
    Trainer,
    TrainingArguments,
)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from datasets import load_dataset

In [4]:
raw_data = load_dataset('gngpostalsrvc/Tanakh')
texts = list(raw_data['train']['Text'])
texts.extend(list(raw_data['test']['Text']))

Downloading metadata:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading and preparing dataset csv/default (download: 2.22 MiB, generated: 4.51 MiB, post-processed: Unknown size, total: 6.73 MiB) to /root/.cache/huggingface/datasets/gngpostalsrvc___parquet/gngpostalsrvc--Tanakh-2f712127392b4242/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/237k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/2295 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20651 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/gngpostalsrvc___parquet/gngpostalsrvc--Tanakh-2f712127392b4242/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(vocab_size = 2000, special_tokens = ["<s>", "<pad>",  "</s>", "<unk>", "<mask>"])

tokenizer.post_processor = TemplateProcessing(
    single = "<s> $A </s>",
    special_tokens = [("<s>", 0), ("</s>", 2)])

tokenizer.train_from_iterator(texts, trainer)

derivational_morphs = ['הִתְ' ,'מוֹ' ,'מְ' ,'מַ' ,'הֵ' ,'הִ' ,'הֶ' ,'הֲ' ,'הוֹ' ,' ַת' ,' ֵי' ,' ָה' ,'וֹת' ,' ִים' ,' ִי' ,' ַי' ,'נִי' ,'ךָ' ,'ךְ' ,'וֹ' ,'ו' ,'הוּ' ,'נּוּ' ,' ָהּ' ,'הָ' ,'נָּה' ,'נוּ' ,'כֶם' ,'כֶן' ,'הֶם' ,' ָם' ,' ֵם' ,'ם' ,'הֵנָּה' ,'הֶן' ,' ֵן' ,' ָן' ,' ֵן' ,'נִתְ' ,'נִּתְ' ,'יִתְ' ,'יִּתְ' ,'אֶתְ' ,'תִּתְ' ,'תִתְ' ,'תּוֹ' ,'תוֹ' ,'אוֹ' ,'הוֹ' ,'נוֹ' ,'נּוֹ' ,'יוֹ' ,'יּוֹ' ,'אֲ' ,'אַ' ,'אֹ' ,'אֶ' ,'אִ' ,'אָ' ,'אֵ' ,'תֵּ' ,'תַּ' ,'תִּ' ,'תָּ' ,'תְּ' ,'תֹּ' ,'תֶּ' ,'תֵ' ,'תַ' ,'תִ' ,'תָ' ,'תְ' ,'תֹ' ,'תֶ' ,'יָ' ,'יִ' ,'יֶ' ,'יֹ' ,'יְ' ,'יַ' ,'יֵ' ,'יָּ' ,'יִּ' ,'יֶּ' ,'יֹּ' ,'יַּ' ,'יֵּ' ,'נֹ' ,'נָ' ,'נֵ' ,'נִ' ,'נֶ' ,'נַ' ,'נְ' ,'נֹּ' ,'נָּ' ,'נֵּ' ,'נִּ' ,'נֶּ' ,'נַּ' ,'וּ' ,'נָה' ,'תִּי' ,'תֶּם' ,'תֶּן' ,'תִי' ,'תֶם' ,'תֶן']

tokenizer.add_tokens(derivational_morphs)

tokenizer.save('BERiT_tokenizer_2000_enriched.json')

tokenizer = PreTrainedTokenizerFast(tokenizer_file = 'BERiT_tokenizer_2000_enriched.json')

tokenizer.add_special_tokens({'pad_token' : '<pad>', 'mask_token' : '<mask>', 'unk_token' : '<unk>', 'bos_token' : '<s>', 'eos_token' : '</s>'})

tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=.15)

def tokenize(sentence):
  return tokenizer(sentence['Text'], max_length=128, truncation=True, padding=True)

tokenized_data = raw_data.map(tokenize, batched=True, remove_columns=raw_data['train'].column_names)

tokenized_data.set_format("pt", columns=["input_ids", "attention_mask"], output_all_columns=True)


Map:   0%|          | 0/2295 [00:00<?, ? examples/s]

Map:   0%|          | 0/20651 [00:00<?, ? examples/s]

In [None]:
config = RobertaConfig.from_pretrained(
    "roberta-base", 
    model_type='roberta',
    attention_probs_dropout_prob=.5, 
    hidden_dropout_prob=.5, 
    hidden_size=256,
    intermediate_size=1024,
    max_position_embeddings=128, 
    num_attention_heads=4,
    num_hidden_layers=1,
    vocab_size=len(tokenizer.vocab)
    )

model = AutoModelForMaskedLM.from_pretrained("roberta-base", config=config, ignore_mismatched_sizes=True)

args = TrainingArguments(output_dir="BERiT_2000_custom_architecture_150_epochs_2", 
                         evaluation_strategy="steps",
                         save_strategy="epoch",
                         learning_rate=0.006543967976815992,
                         weight_decay=0.003608676285736417,
                         num_train_epochs=150,
                         per_device_train_batch_size=8, 
                         per_device_eval_batch_size=8,
                         hub_token='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',
                         push_to_hub=True,
                         seed=42,
                        )
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
  )

trainer.train()

trainer.push_to_hub()

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMaskedLM: ['roberta.encoder.layer.1.intermediate.dense.weight', 'roberta.encoder.layer.11.attention.self.query.bias', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.dense.bias', 'roberta.encoder.layer.7.attention.self.value.bias', 'roberta.encoder.layer.6.attention.self.key.bias', 'roberta.encoder.layer.11.attention.self.query.weight', 'roberta.encoder.layer.7.attention.output.LayerNorm.weight', 'roberta.encoder.layer.10.attention.self.value.weight', 'roberta.encoder.layer.2.attention.self.value.weight', 'roberta.encoder.layer.1.attention.self.query.weight', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.2.intermediate.dense.bias', 'roberta.encoder.layer.5.attention.self.key.bias', 'roberta.encoder.layer.5.attention.outp

Step,Training Loss,Validation Loss
500,9.3676,6.868522
1000,6.8296,6.81993
1500,6.7437,6.724422
2000,6.7472,6.726513
2500,6.7285,6.713015
3000,6.7129,6.675064
3500,6.6973,6.667241
4000,6.7048,6.678175
4500,6.7122,6.717774
5000,6.6792,6.675213
