## Install Everything You Need

In [None]:
!pip install tokenizers
!pip install transformers

## Load and Preprocess data

#### Helper Functions you might need



In [1]:
import regex as re
def basicPreprocess(text):
  try:
    processed_text = text.lower()
    processed_text = re.sub(r'\W +', ' ', processed_text)
  except Exception as e:
    print("Exception:",e,",on text:", text)
    return None
  return processed_text

### Convert CSV file to JSON

In [None]:
import csv
import json
def convert_csv_to_json(csv_file_path):
    # Read CSV file
    with open(csv_file_path, 'r') as file:
        reader = csv.DictReader(file)
        rows = list(reader)

    # Convert CSV data to JSON
    json_data = json.dumps(rows, indent=4)

    # Save JSON data to a file (optional)
    with open('Lottery_Powerball_Winning_Numbers__Beginning_2010.json', 'w') as json_file:
        json_file.write(json_data)

    return json_data

# Specify the path to your CSV file
csv_file_path = 'Lottery_Powerball_Winning_Numbers__Beginning_2010.csv'

# Convert CSV to JSON
json_data = convert_csv_to_json(csv_file_path)

print("Conversion completed. JSON data:")
print(json_data)


### Convert JSON to Prompt List and llama input JSON

In [None]:

stats_file = "Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = []
test = []
lines = []
with open(stats_file, 'r') as f:
    stats = json.load(f)
    for data in stats:
        powerball.append(
            {
            'instruction':f"what is the powerball drawing on {data['Draw Date']}",
            "output":data['Winning Numbers']
            }
        )
        lines.append(f"###instruction: what is the powerball drawing on {data['Draw Date']}, ###output: {data['Winning Numbers']}\n")
    with open(f'prompts.json', 'w', encoding='utf-8') as f:
        json.dump(powerball, f, ensure_ascii=True, indent=4, allow_nan=True)
        f.close()
    with open(f'prompts.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
        f.close()

## Train a custom tokenizer
I have used a ByteLevelBPETokenizer just to prevent \<unk> tokens entirely.
Furthermore, the function used to train the tokenizer assumes that each sample is stored in a different text file.

In [3]:
import torch
torch.cuda.is_available()

True

In [17]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path

# txt_files_dir = "./models/baseball/tokenizer/raw_3"

# paths = [str(x) for x in Path(txt_files_dir).glob("**/*.txt")]

tokenizer = Tokenizer(BPE())

from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()

from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=[    
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])
tokenizer.train(files=["./models/powerball/prompts.txt"], trainer=trainer)
tokenizer.save("./models/powerball/tokenizer/powerball.json")

output = tokenizer.encode("")
print(output.tokens)


['###', 'INSTRUCTION', ':', 'what', 'is', 'the', 'outcome', 'of', 'pitcher', '477132', 'pitching', 'to', 'batter', '593428', '###', 'INPUT', ':', 'T', 'o', 'p', 'of', 'the', '4', 'in', 'n', 'ing', 'w', 'i', 'th', '1', 'stri', 'k', 'e', 'and', '1', 'ball', 's', 'and', '1', 'out', 's', '###', 'RESPONSE', ':']


## Train a Transformer Model

In [19]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [20]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast(tokenizer_file="./models/powerball/tokenizer/powerball.json")

In [21]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [22]:
model.num_parameters()

83504416

In [9]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./models/powerball/prompts.txt",
    block_size=128,
)



In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./models/powerball",
    overwrite_output_dir=True,
    num_train_epochs=300,
    per_device_train_batch_size=256,
    save_steps=5_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
trainer.train()
trainer.save_model("./models/powerball")

In [None]:
trainer.train(resume_from_checkpoint="./models/powerball")
trainer.save_model("./models/powerball")

In [13]:
trainer.save_model("./models/powerball")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./models/powerball",
    tokenizer="./models/powerball/tokenizer",
    top_k=10,
)

fill_text = pipeline(
    "text-generation",
    model="./models/powerball",
    tokenizer="./models/powerball/tokenizer",
)