In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import json

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer
from huggingface_hub import interpreter_login

In [3]:
# Read the records stored under the JSON file's top-level "data" field.
# split="all" requests a single combined Dataset instead of a dict of splits.
# Parsing the file can take a few minutes the first time it is loaded.
raw_datasets = load_dataset(
    "json",
    data_files="./twenty_years_of_baseball_structed_train_lines.json",
    field="data",
    split="all",
)

Found cached dataset json (C:/Users/danm/.cache/huggingface/datasets/json/default-7955246062e9a510/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [4]:
# Preview just the first two rows of the "train" text column. Evaluating the
# whole column would render every training line into the cell output.
raw_datasets[:2]["train"]

['question: {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 430895, "name": "maicer izturis"}, "p_throws": "R", "stand": "L", "inning_topbot": "Top", "inning": 1, "outs_when_up": 0, "on_1b": "", "on_2b": "", "on_3b": "", "home_score": 0, "away_score": 0}}? \n output: {"result": {"event": "field_out", "type": "X", "zone": "", "des": "Maicer Izturis grounds out, second baseman Chris Getz to first baseman Kila Ka\'aihue.", "at_bat_number": 1, "pitch_number": 4, "pitch_name": [NaN, NaN, NaN, NaN], "hit_location": 4, "launch_speed": "", "launch_speed_angle": "", "runs_scored": 0, "at_bat": ["ball", "ball", "called_strike", "hit_into_play"], "pitch_type": [NaN, NaN, NaN, NaN], "release_speed": ["", "", "", ""]}}\n',
 'question: {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 435062, "name": "howie kendrick"}, "p_throws": "R", "stand": "R", "inning_topbot": "Top", "inning": 1, "outs_when_up": 1, "on_1b": "", "on_2b": "", "on_3b":

In [39]:
# Scratch cell: a bare string literal, so evaluating it only echoes the sample
# input JSON. The same text is assigned to `example` in the tokenizer
# comparison cell further down.
"{\"pitcher\": {\"id\": 460024, \"name\": \"luke hochevar\"}, \"batter\": {\"id\": 488721, \"name\": \"peter bourjos\"}, \"p_throws\": \"R\", \"stand\": \"R\", \"inning_topbot\": \"Top\", \"inning\": 2, \"outs_when_up\": 2, \"on_1b\": \"\", \"on_2b\": \"\", \"on_3b\": \"\", \"home_score\": 0, \"away_score\": 0}"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def get_training_corpus(dataset=None, column="train", batch_size=1000):
    """Lazily yield the training text in batches for tokenizer training.

    Rows are sliced first and the text column is selected from that slice, so
    each step reads only ``batch_size`` rows instead of pulling the entire
    column once per batch.

    Args:
        dataset: Object that supports ``len()`` and row slicing, where a row
            slice returns a mapping from column name to a list of values.
            Defaults to the notebook-level ``raw_datasets``.
        column: Name of the text column to read. Defaults to ``"train"``.
        batch_size: Number of rows per yielded batch. Defaults to ``1000``.

    Returns:
        A single-use generator that yields one list of column values per
        batch. Build a new generator for every pass over the data.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if dataset is None:
        # Fall back to the dataset loaded earlier in the notebook.
        dataset = raw_datasets
    return (
        dataset[start : start + batch_size][column]
        for start in range(0, len(dataset), batch_size)
    )


# get_training_corpus() returns a generator, which can be consumed only once:
# re-run this cell before every tokenizer training run, otherwise the trainer
# receives an already-exhausted (empty) corpus.
training_corpus = get_training_corpus()

In [6]:
# Load the existing phi-2 tokenizer from the local models directory; it is the
# template the new tokenizer is trained from.
# NOTE(review): trust_remote_code=True allows custom code shipped with the
# checkpoint to run — confirm the tokenizer actually needs it.
old_tokenizer = AutoTokenizer.from_pretrained(
    "../models/phi-2",
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Train a new tokenizer from `old_tokenizer` with a 55,000-entry vocabulary.
# A fresh corpus generator is created on every run. The notebook-level
# `training_corpus` generator is exhausted after a single pass, so re-running
# this cell with it would train on an empty corpus.
# NOTE(review): a newly trained vocabulary will not line up with phi-2's
# pretrained embeddings — confirm the model's embeddings are resized/retrained.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=55000,
)

In [65]:
# Sample model input used to compare the two tokenizers.
example = "{\"pitcher\": {\"id\": 460024, \"name\": \"luke hochevar\"}, \"batter\": {\"id\": 488721, \"name\": \"peter bourjos\"}, \"p_throws\": \"R\", \"stand\": \"R\", \"inning_topbot\": \"Top\", \"inning\": 2, \"outs_when_up\": 2, \"on_1b\": \"\", \"on_2b\": \"\", \"on_3b\": \"\", \"home_score\": 0, \"away_score\": 0}"
# Token ids from the retrained tokenizer; printed in a later cell.
tokens = tokenizer.encode(example)
# Print the token count for the retrained tokenizer first, then for the
# original phi-2 tokenizer.
for candidate in (tokenizer, old_tokenizer):
    print(len(candidate.tokenize(example)))

['{', '"', 'p', 'i', 't', 'c', 'h', 'e', 'r', '"', ':', 'Ġ', '{', '"', 'i', 'd', '"', ':', 'Ġ', '4', '6', '0', '0', '2', '4', ',', 'Ġ', '"', 'n', 'a', 'm', 'e', '"', ':', 'Ġ', '"', 'l', 'u', 'k', 'e', 'Ġ', 'h', 'o', 'c', 'h', 'e', 'v', 'a', 'r', '"', '}', ',', 'Ġ', '"', 'b', 'a', 't', 't', 'e', 'r', '"', ':', 'Ġ', '{', '"', 'i', 'd', '"', ':', 'Ġ', '4', '8', '8', '7', '2', '1', ',', 'Ġ', '"', 'n', 'a', 'm', 'e', '"', ':', 'Ġ', '"', 'p', 'e', 't', 'e', 'r', 'Ġ', 'b', 'o', 'u', 'r', 'j', 'o', 's', '"', '}', ',', 'Ġ', '"', 'p', '_', 't', 'h', 'r', 'o', 'w', 's', '"', ':', 'Ġ', '"', 'R', '"', ',', 'Ġ', '"', 's', 't', 'a', 'n', 'd', '"', ':', 'Ġ', '"', 'R', '"', ',', 'Ġ', '"', 'i', 'n', 'n', 'i', 'n', 'g', '_', 't', 'o', 'p', 'b', 'o', 't', '"', ':', 'Ġ', '"', 'T', 'o', 'p', '"', ',', 'Ġ', '"', 'i', 'n', 'n', 'i', 'n', 'g', '"', ':', 'Ġ', '2', ',', 'Ġ', '"', 'o', 'u', 't', 's', '_', 'w', 'h', 'e', 'n', '_', 'u', 'p', '"', ':', 'Ġ', '2', ',', 'Ġ', '"', 'o', 'n', '_', '1', 'b', '"', ':', 'Ġ',

In [62]:
# Write the retrained tokenizer's files to disk; the returned tuple of written
# paths is displayed as the cell output.
tokenizer.save_pretrained(
    save_directory="../models/phi-2-mlb/tokenizer/",
)

('../models/phi-2-mlb/tokenizer/tokenizer_config.json',
 '../models/phi-2-mlb/tokenizer/special_tokens_map.json',
 '../models/phi-2-mlb/tokenizer/vocab.json',
 '../models/phi-2-mlb/tokenizer/merges.txt',
 '../models/phi-2-mlb/tokenizer/added_tokens.json',
 '../models/phi-2-mlb/tokenizer/tokenizer.json')

In [63]:
# Show the ids that `tokenizer.encode(example)` produced with the retrained
# tokenizer (`tokens` is assigned in the tokenizer comparison cell).
print(tokens)

[91, 2, 80, 73, 84, 67, 72, 69, 82, 2, 26, 221, 91, 2, 73, 68, 2, 26, 221, 20, 22, 16, 16, 18, 20, 12, 221, 2, 78, 65, 77, 69, 2, 26, 221, 2, 76, 85, 75, 69, 221, 72, 79, 67, 72, 69, 86, 65, 82, 2, 93, 12, 221, 2, 66, 65, 84, 84, 69, 82, 2, 26, 221, 91, 2, 73, 68, 2, 26, 221, 20, 24, 24, 23, 18, 17, 12, 221, 2, 78, 65, 77, 69, 2, 26, 221, 2, 80, 69, 84, 69, 82, 221, 66, 79, 85, 82, 74, 79, 83, 2, 93, 12, 221, 2, 80, 63, 84, 72, 82, 79, 87, 83, 2, 26, 221, 2, 50, 2, 12, 221, 2, 83, 84, 65, 78, 68, 2, 26, 221, 2, 50, 2, 12, 221, 2, 73, 78, 78, 73, 78, 71, 63, 84, 79, 80, 66, 79, 84, 2, 26, 221, 2, 52, 79, 80, 2, 12, 221, 2, 73, 78, 78, 73, 78, 71, 2, 26, 221, 18, 12, 221, 2, 79, 85, 84, 83, 63, 87, 72, 69, 78, 63, 85, 80, 2, 26, 221, 18, 12, 221, 2, 79, 78, 63, 17, 66, 2, 26, 221, 2, 2, 12, 221, 2, 79, 78, 63, 18, 66, 2, 26, 221, 2, 2, 12, 221, 2, 79, 78, 63, 19, 66, 2, 26, 221, 2, 2, 12, 221, 2, 72, 79, 77, 69, 63, 83, 67, 79, 82, 69, 2, 26, 221, 16, 12, 221, 2, 65, 87, 65, 89, 63, 83, 