## Install Everything You Need

In [None]:
!pip install tokenizers
!pip install transformers

## Load and Preprocess Data

#### Helper Functions you might need



In [1]:
import regex as re
def basicPreprocess(text):
  """Lower-case `text` and collapse punctuation-plus-spaces runs.

  Every non-word character that is immediately followed by one or more
  spaces is replaced, together with those spaces, by a single space.

  Returns the processed string, or None when processing raises (for example
  when `text` is not a string); in that case the exception and the offending
  input are printed.
  """
  try:
    return re.sub(r'\W +', ' ', text.lower())
  except Exception as err:
    # Best-effort helper: report the problem and signal failure with None.
    print("Exception:",err,",on text:", text)
    return None

### Convert CSV file to JSON

In [None]:
import csv
import json
def convert_csv_to_json(csv_file_path, json_file_path='07-12-08-10.json'):
    """Convert a CSV file (with a header row) to a JSON array of objects.

    Args:
        csv_file_path: Path of the CSV file to read. The first row supplies
            the keys of every resulting JSON object.
        json_file_path: Where the JSON text is written. Defaults to
            '07-12-08-10.json', the location this function has always used.

    Returns:
        The JSON document as a string (indented by 4 spaces).
    """
    # newline='' hands raw line endings to the csv module, as its docs
    # require; without it, newlines embedded in quoted fields are altered.
    with open(csv_file_path, 'r', newline='') as file:
        rows = list(csv.DictReader(file))

    json_data = json.dumps(rows, indent=4)

    # Persist the JSON next to returning it so later cells can reload it.
    with open(json_file_path, 'w') as json_file:
        json_file.write(json_data)

    return json_data

# Location of the raw Powerball CSV export that should be converted.
csv_file_path = './data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010-08-10-2023.csv'

# Run the conversion; the JSON text is returned for inspection.
json_data = convert_csv_to_json(csv_file_path)

# Report completion followed by the complete JSON document.
print("Conversion completed. JSON data:", json_data, sep="\n")


### Convert JSON to Prompt List and Llama Input JSON

In [273]:
from dateutil.parser import parse
from datetime import datetime
import json
# Build "previous draw : next draw" training lines from the Powerball history.
stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = {}
lines = []

# Index every draw by its parsed date (the file is only needed while loading).
with open(stats_file, 'r') as f:
    stats = json.load(f)
for data in stats:
    powerball[parse(data['Draw Date'])] = data['Winning Numbers']

# Order the draws chronologically. The pairs below are only meaningful when
# each draw is paired with the one that actually followed it in time.
res = dict(sorted(powerball.items(), key=lambda ele: ele[0]))

# Walk the *sorted* mapping. Previously the unsorted `powerball` dict was
# iterated, so the sort above had no effect and pairs followed file order.
pre_numbers = None
for dt, numbers in res.items():
    if pre_numbers is not None:
        lines.append(f"{pre_numbers} : {numbers}\n")
    pre_numbers = numbers

# The context manager closes the file; no explicit close() is needed.
with open('./models/powerball2/prompts.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

{datetime.datetime(2010, 2, 3, 0, 0): '17 22 36 37 52 24', datetime.datetime(2010, 2, 6, 0, 0): '14 22 52 54 59 04', datetime.datetime(2010, 2, 10, 0, 0): '05 08 29 37 38 34', datetime.datetime(2010, 2, 13, 0, 0): '10 14 30 40 51 01', datetime.datetime(2010, 2, 17, 0, 0): '07 08 19 26 36 15', datetime.datetime(2010, 2, 20, 0, 0): '13 27 37 41 54 32', datetime.datetime(2010, 2, 24, 0, 0): '04 17 35 50 57 12', datetime.datetime(2010, 2, 27, 0, 0): '18 47 51 53 58 30', datetime.datetime(2010, 3, 3, 0, 0): '07 09 14 45 49 23', datetime.datetime(2010, 3, 6, 0, 0): '10 29 33 41 59 15', datetime.datetime(2010, 3, 10, 0, 0): '17 21 37 41 50 01', datetime.datetime(2010, 3, 13, 0, 0): '06 16 20 31 36 08', datetime.datetime(2010, 3, 17, 0, 0): '24 26 45 48 55 08', datetime.datetime(2010, 3, 20, 0, 0): '09 36 39 44 45 09', datetime.datetime(2010, 3, 24, 0, 0): '14 20 24 39 49 07', datetime.datetime(2010, 3, 27, 0, 0): '07 21 32 44 52 10', datetime.datetime(2010, 3, 31, 0, 0): '05 13 17 45 54 12', 

In [7]:

# Export the draw history twice: as instruction/output records (JSON) and as
# plain "date : numbers" lines (text).
stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball, test, lines = [], [], []

with open(stats_file, 'r') as f:
    stats = json.load(f)

for data in stats:
    draw_date, winning = data['Draw Date'], data['Winning Numbers']
    powerball.append({
        'instruction': f"what are the powerball numbers from {draw_date}",
        "output": winning,
    })
    lines.append(f"{draw_date} : {winning}\n")

# Instruction-style records, one JSON object per draw.
with open('./models/powerball/prompts.json', 'w', encoding='utf-8') as f:
    json.dump(powerball, f, ensure_ascii=True, indent=4, allow_nan=True)

# Plain-text lines, one draw per line.
with open('./models/powerball/prompts.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

## Train a custom tokenizer


In [1]:
import torch
# Sanity check before training: the cell output shows whether PyTorch
# reports a usable CUDA device.
torch.cuda.is_available()

True

In [277]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# BPE tokenizer; anything outside the learned vocabulary maps to "<unk>".
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

# Split the input on whitespace/punctuation before BPE is applied.
tokenizer.pre_tokenizer = Whitespace()

# Special tokens are passed to the trainer so they are part of the vocabulary.
trainer = BpeTrainer(special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])

tokenizer.train(files=["./models/powerball/prompts.txt"], trainer=trainer)

# Tokenizer.save does not create missing directories, so make sure the target
# folder exists; otherwise a fresh checkout fails here after training.
tokenizer_file = "./models/powerball/tokenizer/powerball.json"
Path(tokenizer_file).parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(tokenizer_file)

# Smoke test: encode one "previous : next" line and show the resulting tokens.
output = tokenizer.encode("02 04 54 61 62 14 : 15 45 64 67 68 18")
print(output.tokens)


['02', '04', '54', '61', '62', '14', ':', '15', '45', '64', '67', '68', '18']


## Train a Transformer Model

In [1]:
from transformers import RobertaConfig

# Architecture hyper-parameters for the RoBERTa model; every option that is
# not listed keeps the RobertaConfig default.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    # NOTE(review): must be at least as large as the trained tokenizer's
    # vocabulary — confirm against the saved tokenizer file.
    vocab_size=128,
)

In [2]:
from transformers import RobertaTokenizerFast

# Load the tokenizer JSON written by the tokenizer-training cell into a
# transformers fast tokenizer. Note that this rebinds `tokenizer`, which the
# training cell used for the raw `tokenizers.Tokenizer` object.
tokenizer = RobertaTokenizerFast(tokenizer_file="./models/powerball/tokenizer/powerball.json")

In [3]:
from transformers import RobertaForMaskedLM

# Build the masked-language model from `config` alone; no pretrained
# checkpoint is loaded in this cell.
model = RobertaForMaskedLM(config=config)

In [4]:
# Show the model's parameter count as the cell output.
model.num_parameters()

43614848

In [5]:
from transformers import LineByLineTextDataset

# Training data: the prompt file is read line by line and tokenized with
# `tokenizer`.
# NOTE(review): LineByLineTextDataset appears to be deprecated in recent
# transformers releases — confirm against the installed version.
dataset = LineByLineTextDataset(
    file_path="./models/powerball2/prompts.txt",
    block_size=128,
    tokenizer=tokenizer,
)



In [6]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model training (mlm=True); 0.15 is the masking
# probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [12]:
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints go to ./models/powerball2, are written
# every 5000 steps, and at most two are kept.
training_args = TrainingArguments(
    output_dir="./models/powerball2",
    overwrite_output_dir=True,
    save_steps=5000,
    save_total_limit=2,
    num_train_epochs=220000,
    per_device_train_batch_size=1024,
    prediction_loss_only=True,
)

# Requires `model`, `dataset` and `data_collator` from the earlier cells to
# exist in the kernel; run those cells first.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

NameError: name 'data_collator' is not defined

In [None]:
# Start training with the `trainer` configured above (no checkpoint is passed).
trainer.train()

In [57]:
# Alternative to the plain train() call: continue from the saved checkpoint
# directory given here.
trainer.train(resume_from_checkpoint="./models/powerball2/checkpoint-240000")



  0%|          | 0/240000 [00:00<?, ?it/s]

{'loss': 0.0011, 'learning_rate': 3.739583333333334e-05, 'epoch': 30250.0}
{'loss': 0.001, 'learning_rate': 3.729166666666667e-05, 'epoch': 30500.0}
{'loss': 0.0011, 'learning_rate': 3.71875e-05, 'epoch': 30750.0}
{'loss': 0.0013, 'learning_rate': 3.708333333333334e-05, 'epoch': 31000.0}
{'loss': 0.0014, 'learning_rate': 3.697916666666667e-05, 'epoch': 31250.0}
{'loss': 0.0012, 'learning_rate': 3.6875e-05, 'epoch': 31500.0}
{'loss': 0.0016, 'learning_rate': 3.677083333333334e-05, 'epoch': 31750.0}
{'loss': 0.0014, 'learning_rate': 3.6666666666666666e-05, 'epoch': 32000.0}
{'loss': 0.0016, 'learning_rate': 3.65625e-05, 'epoch': 32250.0}
{'loss': 0.0017, 'learning_rate': 3.6458333333333336e-05, 'epoch': 32500.0}
{'loss': 0.0018, 'learning_rate': 3.6354166666666665e-05, 'epoch': 32750.0}
{'loss': 0.0015, 'learning_rate': 3.625e-05, 'epoch': 33000.0}
{'loss': 0.0016, 'learning_rate': 3.6145833333333336e-05, 'epoch': 33250.0}
{'loss': 0.0015, 'learning_rate': 3.604166666666667e-05, 'epoch':

KeyboardInterrupt: 

In [43]:
# Write the trainer's current model to the powerball2 model directory.
trainer.save_model("./models/powerball2/")

In [5]:
from transformers import pipeline

# Both pipelines load their weights from the same checkpoint directory.
checkpoint_dir = "./models/powerball2/checkpoint-240000"

# Masked-token prediction; returns the 20 best candidates per mask.
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=tokenizer, top_k=20)

# NOTE(review): the checkpoint was trained as a masked LM, and the recorded
# output of this cell warns about `is_decoder=True` — confirm that using it
# for text generation is intended.
fill_text = pipeline("text-generation", model=checkpoint_dir, tokenizer=tokenizer)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [14]:
import json
from operator import itemgetter
from collections import OrderedDict
import collections
# Sample many continuations of one fixed prompt and report how often each
# full prediction, each last ("power ball") token and each other token occurs.
sample_cout = 15000
prompt = "20 22 26 28 63 05 ::"  # loop-invariant, so defined once

reg_picks = collections.Counter()   # token (all but the last) -> occurrences
powerball = collections.Counter()   # last token -> occurrences
count = 0                           # number of accepted samples
predictions = []                    # accepted predictions as strings

for _ in range(sample_cout):
    generated = fill_text(prompt, max_new_tokens=6, top_k=50, do_sample=True, temperature=1.0, no_repeat_ngram_size=1)[0]["generated_text"]
    # Keep only the newly generated part and drop separator colons.
    _prediction = generated.replace(prompt, "").strip().replace(":", "").strip()
    prediction = _prediction.split(" ")
    try:
        last_value = int(prediction[-1])
    except ValueError:
        # Empty or non-numeric generation: skip this sample instead of
        # aborting the whole run.
        continue
    if last_value > 29:
        continue
    predictions.append(_prediction)
    count += 1
    powerball[prediction[-1]] += 1
    reg_picks.update(prediction[:-1])

# Full predictions, least frequent first.
elements_count = collections.Counter(predictions)
sortedDict = sorted(elements_count.items(), key=lambda x:x[1])
for key in sortedDict:
    print(f"{key}")

# Share of each last token among the accepted samples.
sorted_x = OrderedDict(sorted(powerball.items(), key=itemgetter(1)))
print(len(sorted_x))
for pick in sorted_x:
    print(f"pick: {pick} % {sorted_x[pick]/count*100}")

# Share of each remaining token; count*5 assumes five such tokens per sample.
sorted_x = OrderedDict(sorted(reg_picks.items(), key=itemgetter(1)))
print(len(sorted_x))
for pick in sorted_x:
    print(f"pick: {pick} {sorted_x[pick]/(count*5)*100} %")

01 40 64 52 66 02: 1
04 12 27 32 44 11: 1
03 57 18 59 39 24: 1
19 07 39 56 64 08: 3
10 27 13 07 33 24: 1
23 30 37 45 56 08: 1
04 12 64 15 47 08: 1
23 19 37 55 56 08: 1
07 23 40 69 27 12: 1
23 10 18 17 39 08: 1
19 16 15 10 51 08: 1
23 12 13 60 66 06: 2
01 32 37 46 52 08: 1
04 12 39 07 44 08: 1
23 21 39 56 64 08: 1
01 35 08 55 56 24: 1
01 41 25 53 07 24: 1
04 12 01 60 15 07: 1
23 12 09 42 64 08: 1
23 10 39 12 19 08: 2
27 12 23 47 67 07: 1
23 16 19 56 24 08: 2
23 10 27 19 13 07: 1
23 34 27 59 57 02: 1
01 12 10 07 04 02: 1
01 18 39 56 64 08: 1
23 04 40 07 51 08: 1
23 19 08 50 56 07: 1
23 35 31 56 43 08: 1
23 49 55 54 58 08: 1
23 35 16 19 49 07: 1
02 12 01 41 43 24: 1
18 12 39 08 64 06: 1
23 16 19 56 60 08: 1
23 10 46 01 67 08: 1
13 65 56 64 67 02: 1
01 12 69 21 39 08: 1
23 30 49 56 58 08: 1
01 32 66 67 12 02: 1
27 18 59 56 69 16: 1
41 12 46 56 01 08: 1
01 07 08 60 52 09: 1
04 12 13 47 42 08: 1
23 12 62 45 19 02: 1
01 12 04 69 64 08: 1
23 10 17 01 32 08: 1
23 40 39 60 55 06: 1
04 27 18 19 6

In [28]:
# NOTE: iterating a Counter yields only its keys, so this produces the
# prediction strings in lexicographic order; the counts are neither included
# nor used for ordering. Use sorted(elements_count.items(), key=...) to sort
# by frequency.
sortedDict = sorted(elements_count)


('01 40 64 52 66 02', 1)
('04 12 27 32 44 11', 1)
('03 57 18 59 39 24', 1)
('10 27 13 07 33 24', 1)
('23 30 37 45 56 08', 1)
('04 12 64 15 47 08', 1)
('23 19 37 55 56 08', 1)
('07 23 40 69 27 12', 1)
('23 10 18 17 39 08', 1)
('19 16 15 10 51 08', 1)
('01 32 37 46 52 08', 1)
('04 12 39 07 44 08', 1)
('23 21 39 56 64 08', 1)
('01 35 08 55 56 24', 1)
('01 41 25 53 07 24', 1)
('04 12 01 60 15 07', 1)
('23 12 09 42 64 08', 1)
('27 12 23 47 67 07', 1)
('23 10 27 19 13 07', 1)
('23 34 27 59 57 02', 1)
('01 12 10 07 04 02', 1)
('01 18 39 56 64 08', 1)
('23 04 40 07 51 08', 1)
('23 19 08 50 56 07', 1)
('23 35 31 56 43 08', 1)
('23 49 55 54 58 08', 1)
('23 35 16 19 49 07', 1)
('02 12 01 41 43 24', 1)
('18 12 39 08 64 06', 1)
('23 16 19 56 60 08', 1)
('23 10 46 01 67 08', 1)
('13 65 56 64 67 02', 1)
('01 12 69 21 39 08', 1)
('23 30 49 56 58 08', 1)
('01 32 66 67 12 02', 1)
('27 18 59 56 69 16', 1)
('41 12 46 56 01 08', 1)
('01 07 08 60 52 09', 1)
('04 12 13 47 42 08', 1)
('23 12 62 45 19 02', 1)


In [None]:
# Probe the fill-mask pipeline with one number of the second draw masked out.
# The trailing comment holds the same line with a value in the masked slot,
# presumably the expected answer.
fill_mask("03 09 21 24 29 14 : 13 20 <mask> 33 59 20") #03 09 21 24 29 14 : 13 20 31 33 59 20

In [None]:
# Same probe, but with the final token of the line masked instead.
fill_mask("03 09 21 24 29 14 : 13 20 31 33 59 <mask>")

In [30]:
import json
from operator import itemgetter
from collections import OrderedDict
import collections
from dateutil.parser import parse


def _sample_prediction(prompt):
    """Generate one continuation of `prompt`.

    Returns a `(text, tokens)` pair, or None when the sample is unusable:
    the last token is not an integer, or it is greater than 29.
    """
    generated = fill_text(prompt, max_new_tokens=6, top_k=50, do_sample=True, temperature=1.0, no_repeat_ngram_size=1)[0]["generated_text"]
    # Keep only the newly generated part and drop separator colons.
    text = generated.replace(prompt, "").strip().replace(":", "").strip()
    tokens = text.split(" ")
    try:
        last_value = int(tokens[-1])
    except ValueError:
        # Empty or non-numeric generation: skip rather than abort the run.
        return None
    if last_value > 29:
        return None
    return text, tokens


def _count_matches(winning, predicted):
    """Count matching numbers: all-but-last against all-but-last, last against last."""
    regular_hits = sum(1 for w in winning[:-1] if w in predicted[:-1])
    power_ball_hit = 1 if winning[-1] == predicted[-1] else 0
    return regular_hits + power_ball_hit


# Load the drawings; the context manager closes the file handle.
with open("./07-12-08-10.json") as f:
    numbers = json.load(f)

# Sort by the parsed date. Sorting the raw "MM/DD/YYYY" strings is only
# chronological within a single year.
numbers.sort(key=lambda drawing: parse(drawing['Draw Date']))

sample_cout = 1000
pre_winners = None  # winning-number string of the previous drawing
for drawing in numbers:
    if pre_winners is None:
        # The first drawing only supplies the prompt for the second one.
        pre_winners = drawing["Winning Numbers"]
        continue

    print(drawing["Draw Date"])
    winning_numbers = drawing["Winning Numbers"].split(" ")
    prompt = f"{pre_winners} ::"
    winning_picks = []
    power_ball_picks = []
    power_ball = collections.Counter()
    reg_picks = collections.Counter()
    win_picks = collections.Counter()
    p_count = 0
    predictions = []
    for _ in range(sample_cout):
        sampled = _sample_prediction(prompt)
        if sampled is None:
            continue
        prediction_, prediction = sampled
        p_count += 1
        predictions.append(prediction_)
        power_ball[prediction[-1]] += 1
        reg_picks.update(prediction[:-1])

        count = 0
        # Exact comparison: a substring test would let "1" match "18".
        if winning_numbers[-1] == prediction[-1]:
            power_ball_picks.append(winning_numbers)
            count += 1
        for w in winning_numbers[:-1]:
            if w in prediction[:-1]:
                win_picks[w] += 1
                count += 1
        if count > 2:
            winning_picks.append(prediction)
            print(f"{winning_numbers} winning_picks {prediction} correct count: {count}")

    # Carry the *string* forward. Assigning the split list here would put a
    # Python list repr into every later prompt.
    pre_winners = drawing["Winning Numbers"]

    if p_count == 0:
        # Nothing usable was generated; skip the statistics instead of
        # dividing by zero.
        print("no valid predictions were generated for this drawing")
        continue

    print("==========================")
    print(f"percetange {len(winning_picks)/p_count*100}")
    print(f"percetange power ball {len(power_ball_picks)/p_count*100}")
    print("==========================")

    # Distinct predictions, least frequent first, each scored with the same
    # position-aware rule used while sampling.
    elements_count = collections.Counter(predictions)
    sortedDict = sorted(elements_count.items(), key=lambda x:x[1])
    for key, value in sortedDict:
        correct = _count_matches(winning_numbers, key.split(" "))
        print(f"{key}: {value} correct = {correct}")

    # Distribution of the predicted last token.
    sorted_x = OrderedDict(sorted(power_ball.items(), key=itemgetter(1)))
    print(len(sorted_x))
    for pick in sorted_x:
        if pick == winning_numbers[-1]:
            print(f"pick: {pick} % {sorted_x[pick]/p_count*100}!!!!!")
        else:
            print(f"pick: {pick} % {sorted_x[pick]/p_count*100} diff {int(winning_numbers[-1]) - int(pick)}")

    print("==========================")
    # Distribution of the remaining tokens; p_count*5 assumes five per sample.
    sorted_x = OrderedDict(sorted(reg_picks.items(), key=itemgetter(1)))
    print(len(sorted_x))
    for pick in sorted_x:
        if pick in winning_numbers[:-1]:
            print(f"pick: {pick} {sorted_x[pick]/(p_count*5)*100} % !!!!")
        else:
            print(f"pick: {pick} {sorted_x[pick]/(p_count*5)*100} %")

07/15/2023
['02', '09', '43', '55', '57', '18'] winning_picks ['02', '58', '09', '50', '55', '25'] correct count: 3
['02', '09', '43', '55', '57', '18'] winning_picks ['01', '16', '32', '09', '43', '18'] correct count: 3
percetange 0.20325203252032523
percetange power ball 2.642276422764228
16 02 53 58 57 12: 1 correct = 2
11 34 47 16 58 12: 1 correct = 0
10 58 36 53 52 16: 1 correct = 0
11 47 53 30 58 12: 1 correct = 0
09 34 13 54 16 12: 1 correct = 1
11 34 22 16 58 12: 1 correct = 0
34 40 58 50 47 12: 1 correct = 0
16 34 09 58 49 12: 1 correct = 1
34 58 47 14 57 12: 1 correct = 1
27 34 47 69 16 12: 1 correct = 0
40 58 41 69 54 12: 1 correct = 0
16 29 36 47 58 12: 1 correct = 0
16 40 11 53 58 18: 1 correct = 1
11 34 58 16 44 12: 1 correct = 0
16 29 22 58 53 17: 1 correct = 0
01 37 13 47 57 12: 1 correct = 1
16 24 30 56 58 07: 1 correct = 0
47 34 55 53 58 16: 1 correct = 1
16 34 38 58 30 12: 1 correct = 0
47 34 69 58 54 12: 1 correct = 0
16 29 48 50 58 17: 1 correct = 0
27 60 29 54 58 

KeyboardInterrupt: 