## Install Everything You Need

In [None]:
%pip install tokenizers
%pip install transformers
%pip install datasets --upgrade

## Train a custom tokenizer


In [1]:
import torch
torch.cuda.is_available()

True

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=[    
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])

tokenizer.train(files=["twenty_years_of_baseball_structed_data.txt"], trainer=trainer)
tokenizer.save("../models/mlb_structured/tokenizer.json")

print(len('{"input": {"pitcher": "jered weaver", "batter": "alcides escobar", "p_throws": "R", "stand": "R", "inning_topbot": "Bot", "inning": 5, "outs_when_up": 1, "on_1b": "", "on_2b": "", "on_3b": "", "home_score": 0, "away_score": 2}, "result": {"event": "field_out", "type": "X", "zone": 14, "des": "Alcides Escobar grounds out softly, third baseman Maicer Izturis to first baseman Mark Trumbo.", "at_bat_number": 40, "pitch_number": 5, "pitch_name": "Slider", "hit_location": 5, "launch_speed": "", "launch_speed_angle": "", "runs_scored": 0, "at_bat": ["called_strike", "called_strike", "foul", "ball", "hit_into_play"], "pitch_type": ["FF", "FF", "SI", "SI", "SL"], "release_speed": [88.3, 90.2, 90.2, 88.6, 80.5]}}'))
# output = tokenizer.encode()
# print(output.tokens)


712


In [4]:
tokenizer.get_vocab_size()

11491

## Train a Transformer Model

In [1]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=11491,
    max_position_embeddings=1024,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [5]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast(tokenizer_file="../models/mlb_structured/tokenizer.json")

In [6]:
from transformers import RobertaForCausalLM, RobertaForMaskedLM

model = RobertaForCausalLM(config=config)
# model = RobertaForMaskedLM(config=config)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [3]:
model.num_parameters()

52744675

In [7]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./twenty_years_of_baseball_structed_data.txt",
    block_size=128,
)



In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
import torch
torch.cuda.is_available()

True

In [10]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    tf32=True,
    output_dir="../models/mlb_structured",
    overwrite_output_dir=True,
    num_train_epochs=500,
    per_device_train_batch_size=512,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [16]:
trainer.train()



  0%|          | 0/15570000 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.3084, 'learning_rate': 4.999839434810533e-05, 'epoch': 0.32}
{'loss': 0.5488, 'learning_rate': 4.999678869621066e-05, 'epoch': 0.64}
{'loss': 0.4676, 'learning_rate': 4.9995183044316e-05, 'epoch': 0.96}
{'loss': 0.3985, 'learning_rate': 4.9993577392421324e-05, 'epoch': 1.28}
{'loss': 0.2833, 'learning_rate': 4.999197174052666e-05, 'epoch': 1.61}
{'loss': 0.2373, 'learning_rate': 4.9990366088631985e-05, 'epoch': 1.93}


In [14]:
trainer.train(resume_from_checkpoint="../models/mlb_structured/checkpoint-26500")

  0%|          | 0/778500 [00:00<?, ?it/s]

{'loss': 0.1108, 'learning_rate': 4.826589595375723e-05, 'epoch': 17.34}
{'loss': 0.1107, 'learning_rate': 4.8233782915863844e-05, 'epoch': 17.66}
{'loss': 0.1109, 'learning_rate': 4.820166987797046e-05, 'epoch': 17.98}
{'loss': 0.1101, 'learning_rate': 4.816955684007707e-05, 'epoch': 18.3}
{'loss': 0.1103, 'learning_rate': 4.813744380218369e-05, 'epoch': 18.63}
{'loss': 0.1096, 'learning_rate': 4.81053307642903e-05, 'epoch': 18.95}
{'loss': 0.1092, 'learning_rate': 4.807321772639692e-05, 'epoch': 19.27}
{'loss': 0.1089, 'learning_rate': 4.8041104688503536e-05, 'epoch': 19.59}
{'loss': 0.1088, 'learning_rate': 4.800899165061015e-05, 'epoch': 19.91}
{'loss': 0.1091, 'learning_rate': 4.7976878612716764e-05, 'epoch': 20.23}
{'loss': 0.1085, 'learning_rate': 4.794476557482338e-05, 'epoch': 20.55}
{'loss': 0.1082, 'learning_rate': 4.791265253693e-05, 'epoch': 20.87}
{'loss': 0.1079, 'learning_rate': 4.7880539499036607e-05, 'epoch': 21.19}
{'loss': 0.1077, 'learning_rate': 4.784842646114323e

KeyboardInterrupt: 

In [9]:
trainer.save_model("./models/mlb/")