## Install Everything You Need

In [None]:
!pip install tokenizers
!pip install transformers

## Load and Preprocess data

#### Helper Functions you might need



In [1]:
import regex as re
def basicPreprocess(text):
  """Lower-case ``text`` and squeeze "non-word character + spaces" runs.

  Each occurrence of one non-word character followed by one or more spaces
  is replaced by a single space.

  Returns the processed string. If anything raises (for example ``text`` is
  not a string), the exception and the offending input are printed and
  ``None`` is returned instead.
  """
  try:
    lowered = text.lower()
    cleaned = re.sub(r'\W +', ' ', lowered)
  except Exception as err:
    # Best-effort helper: report the problem and signal failure with None.
    print("Exception:",err,",on text:", text)
    cleaned = None
  return cleaned

### Convert CSV file to JSON

In [1]:
import csv
import json
def convert_csv_to_json(csv_file_path,
                        json_file_path='Lottery_Powerball_Winning_Numbers__Beginning_2010.json'):
    """Convert a CSV file with a header row into a JSON array of objects.

    Args:
        csv_file_path: Path of the CSV file to read. The first row supplies
            the keys of every resulting object.
        json_file_path: Path the JSON text is written to. Defaults to the
            Powerball file name this notebook has always used.

    Returns:
        The JSON document as a string (indented by 4 spaces); the same text
        is written to ``json_file_path``.
    """
    # newline='' leaves newline handling to the csv module, so quoted fields
    # that contain line breaks are read back unchanged.
    with open(csv_file_path, 'r', newline='') as csv_file:
        rows = list(csv.DictReader(csv_file))

    json_data = json.dumps(rows, indent=4)

    with open(json_file_path, 'w') as json_file:
        json_file.write(json_data)

    return json_data

# Source CSV holding the Powerball draws
csv_file_path = 'Lottery_Powerball_Winning_Numbers__Beginning_2010.csv'

# Run the conversion (it also writes the JSON file as a side effect) and
# echo the resulting JSON text below a short status line.
json_data = convert_csv_to_json(csv_file_path)
print("Conversion completed. JSON data:", json_data, sep="\n")


Conversion completed. JSON data:
[
    {
        "Draw Date": "09/26/2020",
        "Winning Numbers": "11 21 27 36 62 24",
        "Multiplier": "3"
    },
    {
        "Draw Date": "09/30/2020",
        "Winning Numbers": "14 18 36 49 67 18",
        "Multiplier": "2"
    },
    {
        "Draw Date": "10/03/2020",
        "Winning Numbers": "18 31 36 43 47 20",
        "Multiplier": "2"
    },
    {
        "Draw Date": "10/07/2020",
        "Winning Numbers": "06 24 30 53 56 19",
        "Multiplier": "2"
    },
    {
        "Draw Date": "10/10/2020",
        "Winning Numbers": "05 18 23 40 50 18",
        "Multiplier": "3"
    },
    {
        "Draw Date": "10/14/2020",
        "Winning Numbers": "21 37 52 53 58 05",
        "Multiplier": "2"
    },
    {
        "Draw Date": "10/17/2020",
        "Winning Numbers": "06 10 31 37 44 23",
        "Multiplier": "2"
    },
    {
        "Draw Date": "10/21/2020",
        "Winning Numbers": "01 03 13 44 56 26",
        "Multiplier": 

### Convert JSON to Prompt List and llama input JSON

In [7]:

import os

stats_file = "Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
output_dir = "./models/powerball"
powerball = []  # instruction/output records that go into prompts.json
test = []       # kept for compatibility; nothing in this cell fills it
lines = []      # "<draw date> : <winning numbers>" lines for prompts.txt

# Load every draw first so the input file is closed before any output is written.
with open(stats_file, 'r') as f:
    stats = json.load(f)

for data in stats:
    powerball.append(
        {
            'instruction': f"what are the powerball numbers from {data['Draw Date']}",
            "output": data['Winning Numbers']
        }
    )
    lines.append(f"{data['Draw Date']} : {data['Winning Numbers']}\n")

# On a fresh checkout the output directory may be missing, and open(..., 'w')
# does not create parent directories.
os.makedirs(output_dir, exist_ok=True)

# The `with` blocks close the files themselves; no explicit close() is needed.
with open(f'{output_dir}/prompts.json', 'w', encoding='utf-8') as prompts_json:
    json.dump(powerball, prompts_json, ensure_ascii=True, indent=4, allow_nan=True)

with open(f'{output_dir}/prompts.txt', 'w', encoding='utf-8') as prompts_txt:
    prompts_txt.writelines(lines)

## Train a custom tokenizer
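The tokenizer trained below is a plain BPE model with a `Whitespace` pre-tokenizer (not a `ByteLevelBPETokenizer`), so characters that never appear in the training data are not guaranteed to be representable.
It is trained on a single text file, `./models/powerball/prompts.txt`, which holds one sample per line.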

In [1]:
import torch
# Ask PyTorch whether CUDA is available; the result is shown as the cell output.
torch.cuda.is_available()

False

In [12]:
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# "<unk>" is reserved as a special token below, so tell the BPE model to use
# it for input it cannot represent instead of leaving the model without an
# unknown token.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

# Split on whitespace/punctuation before BPE so dates and numbers become
# separate pieces (see the sample encoding printed at the end of the cell).
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])
tokenizer.train(files=["./models/powerball/prompts.txt"], trainer=trainer)

# Make sure the target directory exists before saving the tokenizer file.
tokenizer_file = "./models/powerball/tokenizer/powerball.json"
Path(tokenizer_file).parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(tokenizer_file)

# Sanity check: encode one line in the same format as the training data.
output = tokenizer.encode("02/21/2018 : 07 15 31 34 36 08")
print(output.tokens)


['02', '/', '21', '/', '2018', ':', '07', '15', '31', '34', '36', '08']


## Train a Transformer Model

In [1]:
from transformers import RobertaConfig

# Architecture settings for the RoBERTa model built later in the notebook.
# Any option not listed here keeps the RobertaConfig default.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=128,  # NOTE(review): must be at least the trained tokenizer's vocabulary size — confirm
)

In [2]:
from transformers import RobertaTokenizerFast

# Wrap the tokenizer file saved by the tokenizer-training cell in a
# RobertaTokenizerFast; this rebinds the name `tokenizer`.
tokenizer = RobertaTokenizerFast(tokenizer_file="./models/powerball/tokenizer/powerball.json")

In [3]:
from transformers import RobertaForMaskedLM

# Build the masked-language model from `config` alone (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config=config)

In [4]:
# Show the model's parameter count as the cell output.
model.num_parameters()

43614848

In [5]:
from transformers import LineByLineTextDataset

# Build the training dataset from prompts.txt (written by the prompt-building cell),
# tokenized with `tokenizer` and a block_size of 128.
# NOTE(review): LineByLineTextDataset is reportedly deprecated in recent
# transformers releases — confirm against the installed version.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./models/powerball/prompts.txt",
    block_size=128,
)



In [6]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked-language modelling (mlm=True) with
# mlm_probability=0.15, using the same tokenizer as the dataset.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [129]:
from transformers import Trainer, TrainingArguments

# Training run settings. Checkpoints go under ./models/powerball every
# 5000 steps, and at most two of them are kept.
# NOTE(review): the saved output of the first trainer.train() cell reports
# 300 epochs, so it was produced with different settings than these.
training_args = TrainingArguments(
    output_dir="./models/powerball",
    overwrite_output_dir=True,
    per_device_train_batch_size=512,
    num_train_epochs=15000,
    prediction_loss_only=True,
    save_total_limit=2,
    save_steps=5000,
)

# Bundle model, data and collator; this rebinds `trainer`, which earlier
# held the tokenizer's BpeTrainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [8]:
# Start training with the Trainer built in the previous cell.
trainer.train()



  0%|          | 0/1800 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.3497, 'learning_rate': 3.611111111111111e-05, 'epoch': 83.33}
{'loss': 1.9311, 'learning_rate': 2.2222222222222223e-05, 'epoch': 166.67}
{'loss': 1.6631, 'learning_rate': 8.333333333333334e-06, 'epoch': 250.0}
{'train_runtime': 106.9524, 'train_samples_per_second': 4190.648, 'train_steps_per_second': 16.83, 'train_loss': 1.9081471252441407, 'epoch': 300.0}


TrainOutput(global_step=1800, training_loss=1.9081471252441407, metrics={'train_runtime': 106.9524, 'train_samples_per_second': 4190.648, 'train_steps_per_second': 16.83, 'train_loss': 1.9081471252441407, 'epoch': 300.0})

In [131]:
# Continue training from a checkpoint directory instead of starting over.
# NOTE(review): presumably the checkpoint written at step 30000 — confirm it exists before running.
trainer.train(resume_from_checkpoint="./models/powerball/checkpoint-30000")

  0%|          | 0/45000 [00:00<?, ?it/s]

{'loss': 0.0428, 'learning_rate': 4.111111111111111e-05, 'epoch': 2666.67}
{'loss': 0.0398, 'learning_rate': 4.055555555555556e-05, 'epoch': 2833.33}
{'loss': 0.0372, 'learning_rate': 4e-05, 'epoch': 3000.0}
{'loss': 0.037, 'learning_rate': 3.944444444444445e-05, 'epoch': 3166.67}
{'loss': 0.0357, 'learning_rate': 3.888888888888889e-05, 'epoch': 3333.33}
{'loss': 0.0348, 'learning_rate': 3.8333333333333334e-05, 'epoch': 3500.0}
{'loss': 0.0322, 'learning_rate': 3.777777777777778e-05, 'epoch': 3666.67}
{'loss': 0.031, 'learning_rate': 3.722222222222222e-05, 'epoch': 3833.33}
{'loss': 0.03, 'learning_rate': 3.6666666666666666e-05, 'epoch': 4000.0}
{'loss': 0.0311, 'learning_rate': 3.611111111111111e-05, 'epoch': 4166.67}
{'loss': 0.0293, 'learning_rate': 3.555555555555556e-05, 'epoch': 4333.33}
{'loss': 0.0274, 'learning_rate': 3.5e-05, 'epoch': 4500.0}
{'loss': 0.0284, 'learning_rate': 3.444444444444445e-05, 'epoch': 4666.67}
{'loss': 0.0277, 'learning_rate': 3.388888888888889e-05, 'epo

In [37]:
# Save the trainer's current model into ./models/powerball.
trainer.save_model("./models/powerball")

In [85]:
from transformers import pipeline

# Both pipelines read the same checkpoint directory and share `tokenizer`.
checkpoint_dir = "./models/powerball/checkpoint-30000"

# Masked-token prediction, returning the 20 best candidates per mask.
fill_mask = pipeline(
    "fill-mask",
    model=checkpoint_dir,
    tokenizer=tokenizer,
    top_k=20,
)

# NOTE(review): the saved output of this cell shows a warning suggesting
# `is_decoder=True`; the checkpoint was trained as RobertaForMaskedLM, so
# confirm that using it for text generation is intended.
fill_text = pipeline(
    "text-generation",
    model=checkpoint_dir,
    tokenizer=tokenizer
)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [None]:
# Ask the fill-mask pipeline for candidates for the first number of the draw.
fill_mask("07/10/2023 : <mask> 24 34 53 58 13")

In [None]:
# Same query, but with the last number of the draw masked.
fill_mask("07/10/2023 : 02 24 34 53 58 <mask>")

In [None]:
# Generate up to 7 new tokens from a prompt that contains only the date.
fill_text("07/10/2023 : ", max_new_tokens=7)

In [None]:
# Prompt already holds one number; generate up to 6 new tokens.
fill_text("07/10/2023 : 02", max_new_tokens=6)

In [None]:
# Prompt already holds two numbers; generate up to 5 new tokens.
fill_text("07/10/2023 : 02 24", max_new_tokens=5)


In [None]:
# Prompt already holds three numbers; generate up to 4 new tokens.
fill_text("07/10/2023 : 02 24 34", max_new_tokens=4)


In [None]:
# Prompt already holds four numbers; generate up to 3 new tokens.
fill_text("07/10/2023 : 02 24 34 53", max_new_tokens=3)


In [None]:
# Prompt already holds five numbers; generate up to 2 new tokens.
fill_text("07/10/2023 : 02 24 34 53 58", max_new_tokens=2)