## Install Everything You Need

In [None]:
!pip install tokenizers
!pip install transformers

### Convert CSV file to JSON

In [None]:
import csv
import json
def convert_csv_to_json(csv_file_path, json_file_path='07-12-08-10.json'):
    """Convert a CSV file with a header row into a JSON array of objects.

    Args:
        csv_file_path: Path of the CSV file to read. The first row supplies
            the keys of every resulting object.
        json_file_path: Where to write the JSON text. Defaults to the file
            name the notebook has always used; pass ``None`` to skip writing.

    Returns:
        The JSON document as a string (a list of row dicts, indented by 4).
    """
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are passed through unchanged.
    with open(csv_file_path, 'r', newline='') as file:
        rows = list(csv.DictReader(file))

    json_data = json.dumps(rows, indent=4)

    # Persisting the result is optional; the string is returned either way.
    if json_file_path is not None:
        with open(json_file_path, 'w') as json_file:
            json_file.write(json_data)

    return json_data

# Location of the raw Powerball results export.
csv_file_path = './data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010-08-10-2023.csv'

# Run the conversion and keep the resulting JSON text.
json_data = convert_csv_to_json(csv_file_path)

# Status line followed by the generated JSON document.
print("Conversion completed. JSON data:", json_data, sep="\n")


### Convert JSON to Prompt List and llama input JSON

In [273]:
from dateutil.parser import parse
from datetime import datetime
import json
stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = {}
lines = []
# Map each draw date (parsed into a datetime) to that draw's winning-number string.
with open(stats_file, 'r') as f:
    stats = json.load(f)
    for data in stats:
        dt = parse(data['Draw Date'])
        powerball[dt] = data['Winning Numbers']

# Order the draws chronologically. Each prompt line pairs one draw with the
# next one, so the pairs must be built from the sorted mapping; iterating the
# unsorted `powerball` dict would pair draws in whatever order the file lists them.
res = dict(sorted(powerball.items(), key=lambda ele: ele[0]))
pre_numbers = None
for dt in res:
    if pre_numbers is not None:
        # "<previous draw> : <this draw>" — one training line per consecutive pair.
        lines.append(f"{pre_numbers} : {res[dt]}\n")
    pre_numbers = res[dt]
# The with-statement closes the file; no explicit close() is needed.
with open('./models/powerball2/prompts.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

{datetime.datetime(2010, 2, 3, 0, 0): '17 22 36 37 52 24', datetime.datetime(2010, 2, 6, 0, 0): '14 22 52 54 59 04', datetime.datetime(2010, 2, 10, 0, 0): '05 08 29 37 38 34', datetime.datetime(2010, 2, 13, 0, 0): '10 14 30 40 51 01', datetime.datetime(2010, 2, 17, 0, 0): '07 08 19 26 36 15', datetime.datetime(2010, 2, 20, 0, 0): '13 27 37 41 54 32', datetime.datetime(2010, 2, 24, 0, 0): '04 17 35 50 57 12', datetime.datetime(2010, 2, 27, 0, 0): '18 47 51 53 58 30', datetime.datetime(2010, 3, 3, 0, 0): '07 09 14 45 49 23', datetime.datetime(2010, 3, 6, 0, 0): '10 29 33 41 59 15', datetime.datetime(2010, 3, 10, 0, 0): '17 21 37 41 50 01', datetime.datetime(2010, 3, 13, 0, 0): '06 16 20 31 36 08', datetime.datetime(2010, 3, 17, 0, 0): '24 26 45 48 55 08', datetime.datetime(2010, 3, 20, 0, 0): '09 36 39 44 45 09', datetime.datetime(2010, 3, 24, 0, 0): '14 20 24 39 49 07', datetime.datetime(2010, 3, 27, 0, 0): '07 21 32 44 52 10', datetime.datetime(2010, 3, 31, 0, 0): '05 13 17 45 54 12', 

In [7]:

stats_file = "./data_sources/Lottery_Powerball_Winning_Numbers__Beginning_2010.json"
powerball = []
test = []
lines = []

# Load every draw record from the JSON source.
with open(stats_file, 'r') as stats_handle:
    stats = json.load(stats_handle)

# Produce, per draw, one instruction/output record and one plain-text line.
for data in stats:
    draw_date = data['Draw Date']
    winning = data['Winning Numbers']
    record = {
        'instruction': f"what are the powerball numbers from {draw_date}",
        "output": winning,
    }
    powerball.append(record)
    lines.append(f"{draw_date} : {winning}\n")

# Instruction-style records as indented JSON.
with open('./models/powerball/prompts.json', 'w', encoding='utf-8') as json_handle:
    json.dump(powerball, json_handle, ensure_ascii=True, indent=4, allow_nan=True)

# The same draws as "date : numbers" text lines.
with open('./models/powerball/prompts.txt', 'w', encoding='utf-8') as text_handle:
    text_handle.writelines(lines)

In [62]:
import json
import random
sample = []
with open("./data_sources/lora_training_data.json", "r") as f:
    data = json.load(f)

# Keep a record only when its random draw falls under the threshold
# (one draw per record), and render it in the ###instruction/###input layout.
sample.extend(
    f"###instruction:{d['instruction']} ###input {d['input']} ###response"
    for d in data
    if random.random()*500000 < 15122
)


## Train a custom tokenizer


In [1]:
import torch
# Ask PyTorch whether CUDA is available; the boolean is displayed as the cell output.
torch.cuda.is_available()

True

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# Tokens handed to the trainer as special tokens, in this order.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
# Text corpus the tokenizer is trained on.
training_files = ["./data_sources/lora_training_data.txt"]

# BPE model with an explicit unknown token and a Whitespace pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# Train on the corpus, then persist the tokenizer definition as a single JSON file.
trainer = BpeTrainer(special_tokens=special_tokens)
tokenizer.train(files=training_files, trainer=trainer)
tokenizer.save("./models/mlb/tokenizer/powerball.json")

# Quick sanity check: show how a sample sentence is split into tokens.
output = tokenizer.encode("what is the outcome of pitcher 461833 pitching to batter 435079")
print(output.tokens)


## Train a Transformer Model

In [1]:
from transformers import RobertaConfig

# Model hyper-parameters gathered in one mapping and passed as keyword arguments.
roberta_settings = {
    "vocab_size": 30000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**roberta_settings)

In [2]:
from transformers import RobertaTokenizerFast

# Load the tokenizer JSON written by the tokenizer-training cell into a
# transformers fast tokenizer. This rebinds the name `tokenizer`.
tokenizer = RobertaTokenizerFast(tokenizer_file="./models/mlb/tokenizer/powerball.json")

In [3]:
from transformers import RobertaForMaskedLM

# Build the masked-language model from `config` only (no checkpoint path is passed here).
model = RobertaForMaskedLM(config=config)

In [4]:
# Display the model's parameter count as the cell output.
model.num_parameters()

66586416

In [5]:
from transformers import LineByLineTextDataset

# Training dataset built from the same text file the tokenizer was trained on,
# encoded with `tokenizer`.
# NOTE(review): presumably one example per line, limited to block_size tokens —
# confirm against the transformers documentation.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./data_sources/lora_training_data.txt",
    block_size=128,
)



In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator configured for masked-language-model training (mlm=True).
# NOTE(review): mlm_probability=0.15 is presumably the share of tokens masked — confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [31]:
from transformers import Trainer, TrainingArguments

# Training configuration. Outputs go under ./models/mlb, the same directory the
# later cells read checkpoints from (e.g. ./models/mlb/checkpoint-252000).
# NOTE(review): num_train_epochs=500000 is effectively "run until stopped"; the
# recorded resume run below ends in a KeyboardInterrupt rather than completing.
training_args = TrainingArguments(
    output_dir="./models/mlb",
    overwrite_output_dir=True,
    num_train_epochs=500000,
    per_device_train_batch_size=128,
    save_steps=5000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Bind the model, the arguments, the MLM collator and the dataset together.
# This rebinds `trainer`, which previously held the BpeTrainer from the
# tokenizer-training cell.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [13]:
# Run training with the Trainer configured earlier in the notebook.
trainer.train()



  0%|          | 0/1024 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.0837, 'learning_rate': 2.55859375e-05, 'epoch': 500.0}
{'loss': 0.0035, 'learning_rate': 1.1718750000000001e-06, 'epoch': 1000.0}
{'train_runtime': 47.5922, 'train_samples_per_second': 21.516, 'train_steps_per_second': 21.516, 'train_loss': 0.5309643920336384, 'epoch': 1024.0}


TrainOutput(global_step=1024, training_loss=0.5309643920336384, metrics={'train_runtime': 47.5922, 'train_samples_per_second': 21.516, 'train_steps_per_second': 21.516, 'train_loss': 0.5309643920336384, 'epoch': 1024.0})

In [44]:
# Continue training, resuming from the checkpoint-252000 directory.
trainer.train(resume_from_checkpoint="./models/mlb/checkpoint-252000")

  0%|          | 0/5000000 [00:00<?, ?it/s]

{'loss': 0.011, 'learning_rate': 3.7762910000000005e-05, 'epoch': 500.0}
{'loss': 0.0, 'learning_rate': 3.775791e-05, 'epoch': 1000.0}
{'loss': 0.0028, 'learning_rate': 3.775291e-05, 'epoch': 1500.0}
{'loss': 0.003, 'learning_rate': 3.7747910000000006e-05, 'epoch': 2000.0}
{'loss': 0.0017, 'learning_rate': 3.774291e-05, 'epoch': 2500.0}
{'loss': 0.0028, 'learning_rate': 3.7737910000000004e-05, 'epoch': 3000.0}
{'loss': 0.0, 'learning_rate': 3.773291e-05, 'epoch': 3500.0}
{'loss': 0.0037, 'learning_rate': 3.772791e-05, 'epoch': 4000.0}
{'loss': 0.0002, 'learning_rate': 3.772291e-05, 'epoch': 4500.0}
{'loss': 0.0021, 'learning_rate': 3.771791e-05, 'epoch': 5000.0}
{'loss': 0.001, 'learning_rate': 3.7712910000000003e-05, 'epoch': 5500.0}
{'loss': 0.0, 'learning_rate': 3.770791e-05, 'epoch': 6000.0}
{'loss': 0.0, 'learning_rate': 3.770291e-05, 'epoch': 6500.0}
{'loss': 0.0, 'learning_rate': 3.7697910000000004e-05, 'epoch': 7000.0}
{'loss': 0.0, 'learning_rate': 3.769291e-05, 'epoch': 7500.

KeyboardInterrupt: 

In [34]:
# Save the trainer's current model into ./models/mlb/.
trainer.save_model("./models/mlb/")

In [48]:
from transformers import pipeline

# Both pipelines load the same saved checkpoint and share the notebook's tokenizer.
checkpoint_dir = "./models/mlb/checkpoint-252000"

# Masked-token prediction, returning the 20 best candidates per mask.
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=tokenizer, top_k=20)

# Free-running text generation from the same checkpoint.
fill_text = pipeline("text-generation", model=checkpoint_dir, tokenizer=tokenizer)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [61]:
# Example of a single hand-written prompt (not used by the loop below):
#   "instruction: what is the outcome of pitcher 460024 pitching to batter 116338 "
# Generate a continuation for every sampled prompt and print the first result.
for s in sample:
    generations = fill_text(s, max_new_tokens=50, top_k=22, temperature=1.5)
    print(generations[0]["generated_text"])

###instruction:what is the outcome of pitcher 282332 pitching to batter 340192 ###response a game 263816, 116539 116539 NYY NYY the instruction of outs 1 inning with with strike and 0 0 and a outs outs 277417,, a - handed pitcher, pitch throws 1 to handed 116539 and
###instruction:what is the outcome of pitcher 502085 pitching to batter 136770 ###response a game 263816, 116539 116539 NYY NYY the instruction instruction a a inning with with strike and 0 0 and a outs outs 277417,, a - handed pitcher, pitch throws 1 to handed 116539 and
###instruction:what is the outcome of pitcher 501955 pitching to batter 134181 ###response a game 263816, 116539 116539 NYY NYY the instruction instruction a a inning with with strike and 0 0 and a outs outs 277417,, a - handed pitcher, pitch throws 1 to handed 116539 and
###instruction:what is the outcome of pitcher 450306 pitching to batter 400083 ###response a game 263816, 116539 116539 NYY NYY the instruction of outs 1 inning with with strike and 0 0 a

KeyboardInterrupt: 

In [None]:
import json
from operator import itemgetter
from collections import OrderedDict
import collections
# Tallies over all accepted generations:
#   reg_picks  - how often each regular (non-final) number was generated
#   powerball  - how often each final number was generated
reg_picks = {}
powerball = {}
sample_cout = 15000
count = 0      # generations accepted (final number parsed and <= 29)
skipped = 0    # generations whose final token could not be parsed as a number
predictions = []
# The prompt is the same for every sample, so define it once outside the loop.
prompt = "20 22 26 28 63 05 ::"
for i in range(0, sample_cout):
    prediction = fill_text(prompt, max_new_tokens=6, top_k=50, do_sample=True, temperature=1.0, no_repeat_ngram_size=1)[0]["generated_text"]
    # Drop the echoed prompt and any ':' separators, keeping only the generated tokens.
    _prediction = prediction.replace(prompt, "").strip().replace(":", "").strip()
    prediction = _prediction.split(" ")
    try:
        power_pick = int(prediction[-1])
    except ValueError:
        # An empty or non-numeric final token would otherwise abort the whole
        # sampling run; skip that generation and keep going.
        skipped += 1
        continue
    if power_pick <= 29:
        predictions.append(_prediction)
        count += 1
        powerball[prediction[-1]] = powerball.get(prediction[-1], 0) + 1
        for p in prediction[:-1]:
            reg_picks[p] = reg_picks.get(p, 0) + 1
if skipped:
    print(f"skipped {skipped} generations whose last token was not a number")

# Distinct generated lines with their frequencies, least frequent first.
elements_count = collections.Counter(predictions)
sortedDict = sorted(elements_count.items(), key=lambda x:x[1])
for key in sortedDict:
    print(f"{key}")

# Share of accepted generations that ended in each final number.
sorted_x = OrderedDict(sorted(powerball.items(), key=itemgetter(1)))
print(len(sorted_x))
for pick in sorted_x:
    print(f"pick: {pick} % {sorted_x[pick]/count*100}")

# Share of regular picks per number; the divisor assumes five regular numbers
# per accepted generation.
sorted_x = OrderedDict(sorted(reg_picks.items(), key=itemgetter(1)))
print(len(sorted_x))
for pick in sorted_x:
    print(f"pick: {pick} {sorted_x[pick]/(count*5)*100} %")

In [25]:
# Generate a continuation for one instruction/input prompt with max_length=128;
# the pipeline result is displayed as the cell output.
fill_text("###instruction: what is the outcome of pitcher 460024 pitching to batter 116338 ### input: bottom of the 9 inning with 2 on and no outs ### output :", max_length=128)

[{'generated_text': '###instruction: what is the outcome of pitcher 460024 pitching to batter 116338 ### input: bottom of the 9 inning with 2 on and no outs ### output : strike and 0 balls and 0 outs, 277417, a right - handed pitcher, pitches pitch number 1 to batter 116539, a right - handed batter 116539 hits with with with with with with with with with with with with with with with with with with with with with with with with with with with with with 277417 throws a 94 miles per hour Sinker and 116539, a right - handed batter, makes contact with a pitch from 277417 resulting in a field out ### instruction :'}]

In [None]:
# sorted() over the Counter iterates its keys, so this yields the distinct
# prediction strings in sorted order and discards their counts.
sortedDict = sorted(elements_count)


In [None]:
# Ask the fill-mask pipeline for candidates for the masked number in the second
# group; the trailing comment shows the same line with 31 in the masked position.
fill_mask("03 09 21 24 29 14 : 13 20 <mask> 33 59 20") #03 09 21 24 29 14 : 13 20 31 33 59 20

In [None]:
# Same kind of query, this time with the final number of the second group masked.
fill_mask("03 09 21 24 29 14 : 13 20 31 33 59 <mask>")

In [49]:
import json
import random
from operator import itemgetter
from collections import OrderedDict
# Imported here so the cell does not depend on imports made by other cells.
import collections
from datetime import datetime

# Payout in dollars keyed by (regular-number matches, final number matched).
# These are the amounts used by this notebook's scoring; combinations that are
# absent from the table pay nothing.
PRIZES = {
    (5, True): 100000000,
    (4, True): 50000,
    (3, True): 100,
    (2, True): 7,
    (1, True): 4,
    (0, True): 4,
    (5, False): 1000000,
    (4, False): 100,
    (3, False): 7,
}

# Read the draws with a context manager so the file handle is always closed.
with open("./07-12-08-10.json") as f:
    numbers = json.load(f)

# 'Draw Date' is an MM/DD/YYYY string (e.g. "07/15/2023"). Sorting the raw
# strings orders by month first, so parse them to sort chronologically.
numbers.sort(key=lambda drawing: datetime.strptime(drawing['Draw Date'], "%m/%d/%Y"))

sample_cout = 10000

# The first draw only seeds the prompt; every later draw is predicted from the
# draw that precedes it.
draws = iter(numbers)
first_drawing = next(draws, None)
pre_winners = first_drawing["Winning Numbers"] if first_drawing is not None else ""

for drawing in draws:
    print(drawing["Draw Date"])
    winning_numbers = drawing["Winning Numbers"].split(" ")
    prompt = f"{pre_winners} ::"
    winning_picks = []       # generations that won any prize
    power_ball_picks = []    # one entry per generation that hit the final number
    power_ball = {}          # generated final number -> frequency
    reg_picks = {}           # generated regular number -> frequency
    win_picks = {}           # winning regular number -> times it was generated
    predictions = []         # every generated line (each counts as a $2 ticket)
    won = 0
    for _ in range(sample_cout):
        prediction = fill_text(prompt, max_new_tokens=6, top_k=50, do_sample=True, temperature=random.random()*2.0, no_repeat_ngram_size=1)[0]["generated_text"]
        # Drop the echoed prompt and any ':' separators, keeping only the generated tokens.
        prediction_ = prediction.replace(prompt, "").strip().replace(":", "").strip()
        prediction = prediction_.split(" ")
        predictions.append(prediction_)

        try:
            power_pick = int(prediction[-1])
        except ValueError:
            # Empty or non-numeric generation: it still counts as a ticket
            # bought, but it cannot be scored.
            continue
        if power_pick <= 29:
            power_ball[prediction[-1]] = power_ball.get(prediction[-1], 0) + 1

        # Exact match on the final number (a substring test would let "04"
        # match inside a longer token).
        did_win_power_ball = prediction[-1] == winning_numbers[-1]
        if did_win_power_ball:
            power_ball_picks.append(winning_numbers)

        for p in prediction[:-1]:
            reg_picks[p] = reg_picks.get(p, 0) + 1

        white_matches = 0
        for w in winning_numbers[:-1]:
            if w in prediction[:-1]:
                win_picks[w] = win_picks.get(w, 0) + 1
                white_matches += 1

        prize = PRIZES.get((white_matches, did_win_power_ball))
        if prize is not None:
            winning_picks.append(prediction)
            won += prize
            print(f"won so far {won} cost {len(predictions)*2} profit/loss {won - len(predictions)*2}")

    # Carry the raw number string forward: the next prompt must look like the
    # first one, not like the repr of the split list.
    pre_winners = drawing["Winning Numbers"]
    print("==========================")
    print(f"percetange {len(winning_picks)/len(predictions)*100}")
    print(f"percetange power ball {len(power_ball_picks)/(len(predictions))*100}")
    print("==========================")
    elements_count = collections.Counter(predictions)
    sortedDict = sorted(elements_count.items(), key=lambda x:x[1], reverse=True)
    for key, value in sortedDict:
        if value > 1:
            print(f"{key}: {value}")
    # Cost is $2 per generated ticket, i.e. based on the number of predictions.
    print(f"won={won} cost {len(predictions)*2}")
    sorted_x = OrderedDict(sorted(power_ball.items(), key=itemgetter(1), reverse=True))
    for pick in sorted_x:
        if pick == winning_numbers[-1]:
            print(f"pick: {pick} % {sorted_x[pick]/len(predictions)*100}!!!!!")
        else:
            print(f"pick: {pick} % {sorted_x[pick]/len(predictions)*100} diff {int(winning_numbers[-1]) - int(pick)}")

    print("==========================")
    sorted_x = OrderedDict(sorted(reg_picks.items(), key=itemgetter(1), reverse=True))
    print(len(sorted_x))
    for pick in sorted_x:
        if pick in winning_numbers[:-1]:
            print(f"pick: {pick} {sorted_x[pick]/(len(predictions)*5)*100} % !!!!")
        else:
            print(f"pick: {pick} {sorted_x[pick]/(len(predictions)*5)*100} %")

07/15/2023
won so far 4 cost 6 profit/loss -2
won so far 8 cost 18 profit/loss -10
won so far 12 cost 34 profit/loss -22
won so far 16 cost 188 profit/loss -172
won so far 20 cost 206 profit/loss -186
won so far 24 cost 234 profit/loss -210
won so far 28 cost 294 profit/loss -266
won so far 32 cost 412 profit/loss -380
won so far 36 cost 484 profit/loss -448
won so far 40 cost 518 profit/loss -478
won so far 44 cost 756 profit/loss -712
won so far 48 cost 878 profit/loss -830
won so far 52 cost 932 profit/loss -880
won so far 56 cost 1014 profit/loss -958
won so far 60 cost 1022 profit/loss -962
won so far 64 cost 1054 profit/loss -990
won so far 68 cost 1076 profit/loss -1008
won so far 72 cost 1078 profit/loss -1006
won so far 76 cost 1084 profit/loss -1008
won so far 80 cost 1168 profit/loss -1088
won so far 87 cost 1176 profit/loss -1089
won so far 91 cost 1196 profit/loss -1105
won so far 95 cost 1246 profit/loss -1151
won so far 99 cost 1272 profit/loss -1173
won so far 103 cost 

KeyboardInterrupt: 