In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import json

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer
from huggingface_hub import interpreter_login

In [4]:
# Parse the "data" field of the local JSON dump into a Hugging Face dataset.
# Loading is slow (it can take a few minutes), so expect to wait here.
# NOTE(review): with split='all' the result appears to be a single Dataset
# whose text column is named "train" (see the next cell's output) — confirm.
raw_datasets = load_dataset(
    "json",
    data_files="./2011_2023_event_des.json",
    field='data',
    split='all',
)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Preview only the first few rows of the "train" text column; displaying the
# whole column would dump every example into the notebook output.
raw_datasets[:5]["train"]

['Instruct: what is the outcome of {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 430895, "name": "maicer izturis"}}}? \n Output: {"result": {"event": "field_out", "des": "Maicer Izturis grounds out, second baseman Chris Getz to first baseman Kila Ka\'aihue."}}\n',
 'Instruct: what is the outcome of {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 435062, "name": "howie kendrick"}}}? \n Output: {"result": {"event": "double", "des": "Howie Kendrick doubles (1) on a line drive to left fielder Alex Gordon."}}\n',
 'Instruct: what is the outcome of {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 110029, "name": "bobby abreu"}}}? \n Output: {"result": {"event": "strikeout", "des": "Bobby Abreu called out on strikes."}}\n',
 'Instruct: what is the outcome of {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 116338, "name": "torii hunter"}}}? \n Output: {"result": {"ev

In [6]:
def get_training_corpus(dataset=None, column="train", batch_size=1000):
    """Return a generator yielding the text column in fixed-size batches.

    Args:
        dataset: Object supporting ``len()`` and row slicing that returns a
            mapping from column name to a list of values (as a Hugging Face
            ``Dataset`` does). Defaults to the notebook-level ``raw_datasets``.
        column: Name of the column holding the text examples.
        batch_size: Maximum number of examples per yielded batch.

    Returns:
        A single-use generator; each item is a list of at most ``batch_size``
        examples, in dataset order. Call this function again to get a fresh
        generator once one has been consumed.
    """
    if dataset is None:
        dataset = raw_datasets
    # Slice the rows first and pick the column afterwards. Indexing the column
    # first (dataset[column][i:j]) can materialise the entire column on every
    # batch, which makes the iteration quadratic in the dataset size.
    return (
        dataset[start : start + batch_size][column]
        for start in range(0, len(dataset), batch_size)
    )


# Build the batch generator. It is a one-shot iterator: once something has
# consumed it, call get_training_corpus() again to obtain a fresh one.
training_corpus = get_training_corpus()

In [7]:
# Load the existing phi-2 tokenizer from the local model directory; the new
# tokenizer below is trained starting from it.
# NOTE: trust_remote_code=True allows Python code shipped with the model
# directory to run, so only point this at a trusted checkout.
old_tokenizer = AutoTokenizer.from_pretrained(
    "../models/phi-2",
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Train a new tokenizer from the phi-2 one on the corpus, with a target
# vocabulary size of 55000.
# A fresh generator is requested here instead of reusing `training_corpus`:
# generators are single-use, so re-running this cell with an already-consumed
# one would hand the trainer an empty corpus.
# NOTE(review): the character-level tokens printed by the comparison cell
# look consistent with such an empty-corpus run — confirm after re-training.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=55000,
)

In [65]:
# Sample JSON-formatted at-bat description used to compare the two tokenizers.
# Adjacent string literals are concatenated into one string.
example = (
    '{"pitcher": {"id": 460024, "name": "luke hochevar"}, '
    '"batter": {"id": 488721, "name": "peter bourjos"}, '
    '"p_throws": "R", "stand": "R", "inning_topbot": "Top", '
    '"inning": 2, "outs_when_up": 2, '
    '"on_1b": "", "on_2b": "", "on_3b": "", '
    '"home_score": 0, "away_score": 0}'
)

# Tokenize the same text with the newly trained and the original tokenizer.
tokens = tokenizer.tokenize(example)
old_tokens = old_tokenizer.tokenize(example)

# Show, one per line: new token count, new tokens, original tokens.
print(len(tokens), tokens, old_tokens, sep="\n")

['{', '"', 'p', 'i', 't', 'c', 'h', 'e', 'r', '"', ':', 'Ġ', '{', '"', 'i', 'd', '"', ':', 'Ġ', '4', '6', '0', '0', '2', '4', ',', 'Ġ', '"', 'n', 'a', 'm', 'e', '"', ':', 'Ġ', '"', 'l', 'u', 'k', 'e', 'Ġ', 'h', 'o', 'c', 'h', 'e', 'v', 'a', 'r', '"', '}', ',', 'Ġ', '"', 'b', 'a', 't', 't', 'e', 'r', '"', ':', 'Ġ', '{', '"', 'i', 'd', '"', ':', 'Ġ', '4', '8', '8', '7', '2', '1', ',', 'Ġ', '"', 'n', 'a', 'm', 'e', '"', ':', 'Ġ', '"', 'p', 'e', 't', 'e', 'r', 'Ġ', 'b', 'o', 'u', 'r', 'j', 'o', 's', '"', '}', ',', 'Ġ', '"', 'p', '_', 't', 'h', 'r', 'o', 'w', 's', '"', ':', 'Ġ', '"', 'R', '"', ',', 'Ġ', '"', 's', 't', 'a', 'n', 'd', '"', ':', 'Ġ', '"', 'R', '"', ',', 'Ġ', '"', 'i', 'n', 'n', 'i', 'n', 'g', '_', 't', 'o', 'p', 'b', 'o', 't', '"', ':', 'Ġ', '"', 'T', 'o', 'p', '"', ',', 'Ġ', '"', 'i', 'n', 'n', 'i', 'n', 'g', '"', ':', 'Ġ', '2', ',', 'Ġ', '"', 'o', 'u', 't', 's', '_', 'w', 'h', 'e', 'n', '_', 'u', 'p', '"', ':', 'Ġ', '2', ',', 'Ġ', '"', 'o', 'n', '_', '1', 'b', '"', ':', 'Ġ',

In [None]:
# Persist the newly trained tokenizer's files under the phi-2-mlb model folder.
tokenizer.save_pretrained(save_directory="../models/phi-2-mlb/tokenizer/")