## Install Everything You Need

In [None]:
%pip install tokenizers
%pip install transformers
%pip install datasets --upgrade

## Train a custom tokenizer


In [1]:
import torch
torch.cuda.is_available()

True

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=[    
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])

tokenizer.train(files=["./twenty_years_of_baseball_2.txt"], trainer=trainer)
tokenizer.save("./models/mlb_short/tokenizer/tokenizer.json")

output = tokenizer.encode("what is the outcome of pitcher 461833 pitching to batter 435079")
print(output.tokens)


['what', 'is', 'the', 'outcome', 'of', 'pitcher', '461833', 'pitching', 'to', 'batter', '435079']


In [5]:
tokenizer.get_vocab_size()

11481

## Train a Transformer Model

In [2]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=11481,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [3]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast(tokenizer_file="./models/mlb/tokenizer/baseball.json")

In [4]:
from transformers import RobertaForCausalLM, RobertaForMaskedLM

model = RobertaForCausalLM(config=config)
# model = RobertaForMaskedLM(config=config)

bin c:\Users\NVIDIA\Downloads\text-generation-webui-1.6\text-generation-webui-1.6\installer_files\env\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [5]:
model.num_parameters()

52345305

In [120]:
from datasets import load_dataset
data_file = {"train":"./baseball_23_03.txt"}
# dataset = load_dataset("json", data_files=data_file)
dataset = load_dataset("text", data_files=data_file)
# raw_datasets = load_dataset(
#             "json",
#             data_files=data_file,
#         )

# raw_datasets = {}
# raw_datasets["validation"] = load_dataset(
#     "json",
#     data_files=data_file,
#     split=f"train[:{90}%]",
# )
# raw_datasets["train"] = load_dataset(
#     "json",
#     data_files=data_file,
#     split=f"train[:{10}%]",
# )


In [121]:
print(len(dataset["train"]))


3678821


In [6]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./baseball_23_03.txt",
    block_size=128,
)



In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
import torch
torch.cuda.is_available()

True

In [8]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    tf32=True,
    output_dir="./models/mlb",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=512,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()

In [None]:
trainer.train(resume_from_checkpoint="./models/mlb/checkpoint-503500")

In [9]:
trainer.save_model("./models/mlb/")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./models/mlb/checkpoint-498000",
    tokenizer=tokenizer,
    top_k=20,
)

fill_text = pipeline(
    "text-generation",
    model="./models/mlb/checkpoint-498000",
    tokenizer=tokenizer,
)

In [None]:
https://huggingface.co/blog/how-to-generate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"


# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="./models/mlb/checkpoint-498000", pad_token_id=tokenizer.eos_token_id).to(torch_device)

In [None]:
# encode context the generation is conditioned on
model_inputs = tokenizer(f'<s> ###instruction: what is the outcome of pitcher 663432 pitching to batter 596115 ###input: Top of the {math.floor(random.randrange(1, 10))} inning with {math.floor(random.randrange(0, 3))} outs ###output: 663432 throws a"', return_tensors='pt').to(torch_device)

# generate 40 new tokens
greedy_output = model.generate(**model_inputs, max_new_tokens=160)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

In [None]:
# activate beam search and early_stopping
beam_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# set no_repeat_ngram_size to 2
beam_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# set return_num_sequences > 1
beam_outputs = model.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=5,
    num_return_sequences=5,
    early_stopping=True,
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
from transformers import set_seed
set_seed(42)
for i in range(0, 15):
    model_inputs = tokenizer(f'<s> ###instruction: what is the outcome of pitcher 663432 pitching to batter 596115 ###input: Top of the {math.floor(random.randrange(1, 10))} inning with {math.floor(random.randrange(0, 3))} outs ###output: 663432 throws a"', return_tensors='pt').to(torch_device)

    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        **model_inputs,
        max_new_tokens=40,
        do_sample=True,
        top_k=100,
        temperature=0.6,
    )

    print("Output:\n" + 100 * '-')
    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
set_seed(42)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

In [14]:
import random
import math

In [None]:
for i in range(0, 50):
    text = fill_text(f"<s> ###instruction: what is the outcome of pitcher 663432 pitching to batter 596115 ###input: Top of the {math.floor(random.randrange(1, 10))} inning with {math.floor(random.randrange(0, 3))} outs ###output: 663432 throws a", max_new_tokens=25, top_k=10, temperature=1.25, do_sample=True, epsilon_cutoff=9e-4, encoder_repetition_penalty=0.1)
    for t in text:
        print(t)