## Install Everything You Need

In [None]:
!pip install tokenizers
!pip install transformers

## Load and Preprocess data

#### Helper Functions you might need



In [1]:
import regex as re
def basicPreprocess(text):
  try:
    processed_text = text.lower()
    processed_text = re.sub(r'\W +', ' ', processed_text)
  except Exception as e:
    print("Exception:",e,",on text:", text)
    return None
  return processed_text

### Convert CSV file to JSON

In [89]:
import csv
import json
def convert_csv_to_json(csv_file_path):
    # Read CSV file
    with open(csv_file_path, 'r') as file:
        reader = csv.DictReader(file)
        rows = list(reader)

    # Convert CSV data to JSON
    json_data = json.dumps(rows, indent=4)

    # Save JSON data to a file (optional)
    with open('07-12-08-10.json', 'w') as json_file:
        json_file.write(json_data)

    return json_data

# Specify the path to your CSV file
csv_file_path = './data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010-08-10-2023.csv'

# Convert CSV to JSON
json_data = convert_csv_to_json(csv_file_path)

print("Conversion completed. JSON data:")
print(json_data)


Conversion completed. JSON data:
[
    {
        "Draw Date": "08/09/2023",
        "Winning Numbers": "10 15 21 67 69 03",
        "Multiplier": "2"
    },
    {
        "Draw Date": "08/07/2023",
        "Winning Numbers": "06 13 20 35 54 22",
        "Multiplier": "2"
    },
    {
        "Draw Date": "08/05/2023",
        "Winning Numbers": "18 42 44 62 65 23",
        "Multiplier": "2"
    },
    {
        "Draw Date": "08/02/2023",
        "Winning Numbers": "23 24 33 51 64 05",
        "Multiplier": "2"
    },
    {
        "Draw Date": "07/31/2023",
        "Winning Numbers": "02 11 48 58 65 13",
        "Multiplier": "2"
    },
    {
        "Draw Date": "07/29/2023",
        "Winning Numbers": "10 25 27 34 38 02",
        "Multiplier": "3"
    },
    {
        "Draw Date": "07/26/2023",
        "Winning Numbers": "03 16 40 48 60 14",
        "Multiplier": "2"
    },
    {
        "Draw Date": "07/24/2023",
        "Winning Numbers": "03 04 12 28 49 25",
        "Multiplier": 

### Convert JSON to Prompt List and llama input JSON

In [273]:
from dateutil.parser import parse
from datetime import datetime
import json
stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = {}
lines = []
with open(stats_file, 'r') as f:
    stats = json.load(f)
    for data in stats:
        dt = parse(data['Draw Date'])
        powerball[dt] = data['Winning Numbers']

res = {key: val for key, val in sorted(powerball.items(), key = lambda ele: ele[0])}
print(res)

pre_numbers = None
for dt in powerball:
    if pre_numbers is not None:
        lines.append(f"{pre_numbers} : {powerball[dt]}\n")
    pre_numbers = powerball[dt]
with open(f'./models/powerball2/prompts.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)
    f.close()

{datetime.datetime(2010, 2, 3, 0, 0): '17 22 36 37 52 24', datetime.datetime(2010, 2, 6, 0, 0): '14 22 52 54 59 04', datetime.datetime(2010, 2, 10, 0, 0): '05 08 29 37 38 34', datetime.datetime(2010, 2, 13, 0, 0): '10 14 30 40 51 01', datetime.datetime(2010, 2, 17, 0, 0): '07 08 19 26 36 15', datetime.datetime(2010, 2, 20, 0, 0): '13 27 37 41 54 32', datetime.datetime(2010, 2, 24, 0, 0): '04 17 35 50 57 12', datetime.datetime(2010, 2, 27, 0, 0): '18 47 51 53 58 30', datetime.datetime(2010, 3, 3, 0, 0): '07 09 14 45 49 23', datetime.datetime(2010, 3, 6, 0, 0): '10 29 33 41 59 15', datetime.datetime(2010, 3, 10, 0, 0): '17 21 37 41 50 01', datetime.datetime(2010, 3, 13, 0, 0): '06 16 20 31 36 08', datetime.datetime(2010, 3, 17, 0, 0): '24 26 45 48 55 08', datetime.datetime(2010, 3, 20, 0, 0): '09 36 39 44 45 09', datetime.datetime(2010, 3, 24, 0, 0): '14 20 24 39 49 07', datetime.datetime(2010, 3, 27, 0, 0): '07 21 32 44 52 10', datetime.datetime(2010, 3, 31, 0, 0): '05 13 17 45 54 12', 

In [7]:

stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = []
test = []
lines = []
with open(stats_file, 'r') as f:
    stats = json.load(f)
    for data in stats:
        powerball.append(
            {
            'instruction':f"what are the powerball numbers from {data['Draw Date']}",
            "output":data['Winning Numbers']
            }
        )
        lines.append(f"{data['Draw Date']} : {data['Winning Numbers']}\n")
    with open(f'./models/powerball/prompts.json', 'w', encoding='utf-8') as f:
        json.dump(powerball, f, ensure_ascii=True, indent=4, allow_nan=True)
        f.close()
    with open(f'./models/powerball/prompts.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
        f.close()

## Train a custom tokenizer
I have used a ByteLevelBPETokenizer just to prevent \<unk> tokens entirely.
Furthermore, the function used to train the tokenizer assumes that each sample is stored in a different text file.

In [1]:
import torch
torch.cuda.is_available()

True

In [277]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path

# txt_files_dir = "./models/baseball/tokenizer/raw_3"

# paths = [str(x) for x in Path(txt_files_dir).glob("**/*.txt")]

tokenizer = Tokenizer(BPE())

from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()

from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=[    
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])
tokenizer.train(files=["./models/powerball/prompts.txt"], trainer=trainer)
tokenizer.save("./models/powerball/tokenizer/powerball.json")

output = tokenizer.encode("02 04 54 61 62 14 : 15 45 64 67 68 18")
print(output.tokens)


['02', '04', '54', '61', '62', '14', ':', '15', '45', '64', '67', '68', '18']


## Train a Transformer Model

In [1]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=128,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [2]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast(tokenizer_file="./models/powerball/tokenizer/powerball.json")

In [3]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [4]:
model.num_parameters()

43614848

In [5]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./models/powerball2/prompts.txt",
    block_size=128,
)



In [6]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./models/powerball2",
    overwrite_output_dir=True,
    num_train_epochs=30000,
    per_device_train_batch_size=768,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [9]:
trainer.train()



  0%|          | 0/90000 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.9996, 'learning_rate': 4.972222222222223e-05, 'epoch': 83.33}
{'loss': 2.1674, 'learning_rate': 4.9444444444444446e-05, 'epoch': 166.67}
{'loss': 1.2984, 'learning_rate': 4.9166666666666665e-05, 'epoch': 250.0}
{'loss': 0.6697, 'learning_rate': 4.888888888888889e-05, 'epoch': 333.33}
{'loss': 0.3969, 'learning_rate': 4.8611111111111115e-05, 'epoch': 416.67}
{'loss': 0.2755, 'learning_rate': 4.8333333333333334e-05, 'epoch': 500.0}
{'loss': 0.2156, 'learning_rate': 4.805555555555556e-05, 'epoch': 583.33}
{'loss': 0.1762, 'learning_rate': 4.7777777777777784e-05, 'epoch': 666.67}
{'loss': 0.1528, 'learning_rate': 4.75e-05, 'epoch': 750.0}
{'loss': 0.1308, 'learning_rate': 4.722222222222222e-05, 'epoch': 833.33}
{'loss': 0.1176, 'learning_rate': 4.6944444444444446e-05, 'epoch': 916.67}
{'loss': 0.1039, 'learning_rate': 4.666666666666667e-05, 'epoch': 1000.0}
{'loss': 0.0971, 'learning_rate': 4.638888888888889e-05, 'epoch': 1083.33}
{'loss': 0.0884, 'learning_rate': 4.611111111111

TrainOutput(global_step=90000, training_loss=0.06566297795772552, metrics={'train_runtime': 5734.1458, 'train_samples_per_second': 3905.551, 'train_steps_per_second': 15.695, 'train_loss': 0.06566297795772552, 'epoch': 15000.0})

In [8]:
trainer.train(resume_from_checkpoint="./models/powerball2")



  0%|          | 0/60000 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.008, 'learning_rate': 4.958333333333334e-05, 'epoch': 250.0}
{'loss': 0.0101, 'learning_rate': 4.9166666666666665e-05, 'epoch': 500.0}
{'loss': 0.0097, 'learning_rate': 4.875e-05, 'epoch': 750.0}
{'loss': 0.01, 'learning_rate': 4.8333333333333334e-05, 'epoch': 1000.0}
{'loss': 0.0099, 'learning_rate': 4.791666666666667e-05, 'epoch': 1250.0}
{'loss': 0.009, 'learning_rate': 4.75e-05, 'epoch': 1500.0}
{'loss': 0.009, 'learning_rate': 4.708333333333334e-05, 'epoch': 1750.0}
{'loss': 0.0094, 'learning_rate': 4.666666666666667e-05, 'epoch': 2000.0}
{'loss': 0.0088, 'learning_rate': 4.6250000000000006e-05, 'epoch': 2250.0}
{'loss': 0.0082, 'learning_rate': 4.5833333333333334e-05, 'epoch': 2500.0}
{'loss': 0.0083, 'learning_rate': 4.541666666666667e-05, 'epoch': 2750.0}
{'loss': 0.0087, 'learning_rate': 4.5e-05, 'epoch': 3000.0}
{'loss': 0.0085, 'learning_rate': 4.458333333333334e-05, 'epoch': 3250.0}
{'loss': 0.0085, 'learning_rate': 4.4166666666666665e-05, 'epoch': 3500.0}
{'loss

TrainOutput(global_step=60000, training_loss=0.004855013785759608, metrics={'train_runtime': 13436.8858, 'train_samples_per_second': 3333.362, 'train_steps_per_second': 4.465, 'train_loss': 0.004855013785759608, 'epoch': 30000.0})

In [10]:
trainer.save_model("./models/powerball2/")

In [10]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./models/powerball2/checkpoint-60000/",
    tokenizer=tokenizer,
    top_k=20,
)

fill_text = pipeline(
    "text-generation",
    model="./models/powerball2/checkpoint-60000/",
    tokenizer=tokenizer
)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [11]:
fill_mask("03 09 21 24 29 14 : 13 20 <mask> 33 59 20") #03 09 21 24 29 14 : 13 20 31 33 59 20

[{'score': 1.0,
  'token': 53,
  'token_str': '31',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 20'},
 {'score': 4.1560990204914106e-09,
  'token': 22,
  'token_str': '20',
  'sequence': '03 09 21 24 29 14 : 13 20 20 33 59 20'},
 {'score': 1.878857736059558e-09,
  'token': 54,
  'token_str': '41',
  'sequence': '03 09 21 24 29 14 : 13 20 41 33 59 20'},
 {'score': 7.333026963429745e-10,
  'token': 68,
  'token_str': '34',
  'sequence': '03 09 21 24 29 14 : 13 20 34 33 59 20'},
 {'score': 3.692351702166974e-10,
  'token': 37,
  'token_str': '13',
  'sequence': '03 09 21 24 29 14 : 13 20 13 33 59 20'},
 {'score': 1.3290527012266296e-10,
  'token': 24,
  'token_str': '21',
  'sequence': '03 09 21 24 29 14 : 13 20 21 33 59 20'},
 {'score': 1.1891779228001553e-10,
  'token': 16,
  'token_str': '23',
  'sequence': '03 09 21 24 29 14 : 13 20 23 33 59 20'},
 {'score': 1.185737619202598e-10,
  'token': 58,
  'token_str': '38',
  'sequence': '03 09 21 24 29 14 : 13 20 38 33 59 20'},
 {'scor

In [12]:
fill_mask("03 09 21 24 29 14 : 13 20 31 33 59 <mask>")

[{'score': 1.0,
  'token': 22,
  'token_str': '20',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 20'},
 {'score': 3.03060826434276e-10,
  'token': 19,
  'token_str': '14',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 14'},
 {'score': 2.696455836392886e-10,
  'token': 17,
  'token_str': '11',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 11'},
 {'score': 2.1602752919847035e-10,
  'token': 39,
  'token_str': '25',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 25'},
 {'score': 1.8838196280679398e-10,
  'token': 41,
  'token_str': '09',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 09'},
 {'score': 1.2831595508355775e-10,
  'token': 70,
  'token_str': '50',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 50'},
 {'score': 5.391965346435157e-11,
  'token': 53,
  'token_str': '31',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 31'},
 {'score': 5.096429181450368e-11,
  'token': 37,
  'token_str': '13',
  'sequence': '03 09 21 24 29 14 : 13 20 31 33 59 13'},
 {'score

In [13]:
fill_text("03 09 21 24 29 14 ::", max_new_tokens=6)

[{'generated_text': '03 09 21 24 29 14 :: 13 20 31 33 59 20'}]

In [15]:
fill_text("03 09 21 24 29 14 :: 13 20", max_new_tokens=4)


[{'generated_text': '03 09 21 24 29 14 :: 13 20 31 33 59 20'}]

In [124]:
fill_text("03 09 21 24 29 14 :: 13 20 31", max_new_tokens=4)


[{'generated_text': '03 09 21 24 29 14 :: 13 20 31 31 33 20 20'}]

In [123]:
fill_text("03 09 21 24 29 14 :: 13 20 31 33", max_new_tokens=3)


[{'generated_text': '03 09 21 24 29 14 :: 13 20 31 33 33 20 20'}]

In [17]:
fill_text("03 09 21 24 29 14 :: 13 20 31 33 59", max_new_tokens=1)

[{'generated_text': '03 09 21 24 29 14 :: 13 20 31 33 59 20'}]

In [38]:
import json
f = open("./07-12-08-10.json")
winner = "10 15 21 67 69 03"
winner = winner.split(" ")
numbers = json.load(f)
for drawing in numbers:
    winning_numbers = drawing["Winning Numbers"]
    prompt = f"{winning_numbers} ::"
    winning_picks = []
    for i in range(0, 1000):
        prediction = fill_text(prompt, max_new_tokens=6, do_sample=True, temperature=1.7)[0]["generated_text"]
        prediction = prediction.replace(prompt, "")
        count = 0
        for w in winner:
            if w in prediction:
                count+=1
                if count > 1:
                    if prediction not in winning_picks:
                        winning_picks.append(prediction)
                    print(f"{prediction} {count} percetange {len(winning_picks)/1000}")

 21 21 21 24 69 16 2 percetange 0.001
 05 26 67 54 69 16 2 percetange 0.002
 10 15 15 54 58 01 2 percetange 0.003
 21 59 25 28 15 01 2 percetange 0.004
 10 21 21 51 32 14 2 percetange 0.005
 03 21 21 24 08 14 2 percetange 0.006
 13 14 10 52 52 21 2 percetange 0.007
 07 21 21 41 69 16 2 percetange 0.008
 21 21 39 16 69 23 2 percetange 0.009
 05 10 10 28 15 18 2 percetange 0.01
 34 21 46 25 69 06 2 percetange 0.011
 21 16 27 58 69 34 2 percetange 0.012
 16 16 15 48 : 21 2 percetange 0.013
 07 07 15 30 69 43 2 percetange 0.014
 21 21 56 28 69 11 2 percetange 0.015
 07 15 40 41 69 16 2 percetange 0.016
 12 12 15 16 21 23 2 percetange 0.017
 08 21 21 34 40 10 2 percetange 0.018
 57 25 03 21 01 16 2 percetange 0.019
 34 34 49 58 69 15 2 percetange 0.02
 08 08 10 30 69 01 2 percetange 0.021
 10 04 21 36 27 16 2 percetange 0.022
 21 45 15 44 58 14 2 percetange 0.023
 65 54 67 59 69 05 2 percetange 0.024
 15 11 21 55 27 14 2 percetange 0.025
 16 21 54 24 69 30 2 percetange 0.026
 10 15 21 65 14