In [1]:
%pip install transformers peft datasets huggingface_hub tqdm 

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting peft
  Downloading peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl.metadata (20 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m961.1 kB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting torch>=1.13.0 (from peft)
  Downloading torch-2.2.0-cp310-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.27.0-py3-none-any.whl.metadata (18 

In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import json

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig
from transformers import (
    AutoTokenizer,
)
from tqdm.notebook import tqdm

from huggingface_hub import interpreter_login

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Loading is slow (the recorded run shows ~2M examples), so expect to wait a few
# minutes. `field` selects the "train" key inside the JSON document; `split="all"`
# asks for every generated split in a single Dataset.
raw_datasets = load_dataset(
    "json",
    data_files="../data/2011_2023_phi-2_struct_encoded.json",
    field="train",
    split="all",
)

Generating train split: 2011347 examples [00:38, 52595.71 examples/s]


In [None]:
# Preview only the first few rows: echoing the full "text" column would dump
# roughly two million strings into the cell output.
raw_datasets[:5]["text"]

In [11]:
# Relative path of the base phi-2 checkpoint; later cells pass it to the
# `from_pretrained` calls that load the tokenizer and the model.
model_type = "../models/phi-2"

In [12]:
def get_training_corpus(batch_size=1000):
    """Lazily produce the ``"text"`` column of ``raw_datasets`` in batches.

    Rows are sliced first and the column is selected second
    (``raw_datasets[a:b]["text"]``), so each step reads only ``batch_size``
    rows. Selecting the column first (``raw_datasets["text"][a:b]``) makes
    `datasets` releases that return a plain list for a column lookup (such as
    the 2.17.0 release installed at the top of this notebook) re-read the
    entire column for every single batch.

    Args:
        batch_size: Number of text entries per batch. Defaults to 1000.

    Returns:
        A single-use generator yielding lists of text entries; the last list
        may be shorter than ``batch_size``.
    """
    return (
        raw_datasets[start : start + batch_size]["text"]
        for start in range(0, len(raw_datasets), batch_size)
    )


# get_training_corpus() returns a generator, which can be iterated only once:
# re-run this cell before re-running any cell that consumes `training_corpus`.
training_corpus = get_training_corpus()

In [13]:
# Load the base tokenizer from the local checkpoint directory.
old_tokenizer = AutoTokenizer.from_pretrained(
    model_type,
    trust_remote_code=True,
)

# Train a new tokenizer from the base one on the batched corpus, with a target
# vocabulary size of 51000. Iterating `training_corpus` consumes it.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=51000,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
# Tokenize one prompt/response sample with both tokenizers and print, for each,
# the token count followed by the tokens (new tokenizer first, base second).
example = '''Instruct: what is the outcome of {"input": {"pitcher": {"id": 460024, "name": "luke hochevar"}, "batter": {"id": 430895, "name": "maicer izturis"}}}? \n Output: {"result": {"event": "field_out", "des": "Maicer Izturis grounds out softly, second baseman Chris Getz to first baseman Kila Ka\'aihue."}}\n'''
tokens, old_tokens = (tk.tokenize(example) for tk in (tokenizer, old_tokenizer))
print(len(tokens), tokens, len(old_tokens), old_tokens, sep="\n")

77
['Instruct', ':', 'Ġwhat', 'Ġis', 'Ġthe', 'Ġoutcome', 'Ġof', 'Ġ{"', 'input', '":', 'Ġ{"', 'pitcher', '":', 'Ġ{"', 'id', '":', 'Ġ460024', ',', 'Ġ"', 'name', '":', 'Ġ"', 'luke', 'Ġhochevar', '"},', 'Ġ"', 'batter', '":', 'Ġ{"', 'id', '":', 'Ġ430895', ',', 'Ġ"', 'name', '":', 'Ġ"', 'maicer', 'Ġizturis', '"}}}?', 'ĠĊ', 'ĠOutput', ':', 'Ġ{"', 'result', '":', 'Ġ{"', 'event', '":', 'Ġ"', 'field', '_', 'out', '",', 'Ġ"', 'des', '":', 'Ġ"', 'Maicer', 'ĠIzturis', 'Ġgrounds', 'Ġout', 'Ġsoftly', ',', 'Ġsecond', 'Ġbaseman', 'ĠChris', 'ĠGetz', 'Ġto', 'Ġfirst', 'Ġbaseman', 'ĠKila', 'ĠKa', "'", 'aihue', '."}}', 'Ċ']
90
['Instruc', 't', ':', 'Ġwhat', 'Ġis', 'Ġthe', 'Ġoutcome', 'Ġof', 'Ġ{"', 'input', '":', 'Ġ{"', 'pitcher', '":', 'Ġ{"', 'id', '":', 'Ġ46', '00', '24', ',', 'Ġ"', 'name', '":', 'Ġ"', 'luke', 'Ġh', 'ochevar', '"},', 'Ġ"', 'batter', '":', 'Ġ{"', 'id', '":', 'Ġ430', '895', ',', 'Ġ"', 'name', '":', 'Ġ"', 'maicer', 'Ġi', 'zturis', '"}}}?', 'Ġ', 'Ċ', 'ĠOutput', ':', 'Ġ{"', 'result', '":', 'Ġ{"

In [20]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

# Merge the vocabulary of the newly trained tokenizer into the base tokenizer,
# then grow the model's embeddings to match the merged tokenizer.
# NOTE: add_tokens mutates `old_tokenizer` in place, so re-running this cell
# prints the already-merged size on the "Before" line.
print("Before adding mlb:", len(old_tokenizer))

# get_vocab() builds a fresh dict on each call, so snapshot it once.
mlb_vocab = tokenizer.get_vocab()
tokens_in_mlb_not_in_phio_2 = set(mlb_vocab).difference(old_tokenizer.get_vocab())
# Set iteration order for strings varies between kernel sessions, and new token
# ids follow the order the tokens are passed in. Sorting by the id each token
# has in the new tokenizer keeps the merged ids reproducible.
old_tokenizer.add_tokens(sorted(tokens_in_mlb_not_in_phio_2, key=mlb_vocab.__getitem__))

print("After adding mlb:", len(old_tokenizer))

# Load with the language-modelling head attached: AutoModel returns only the
# backbone, so the output head would be neither resized nor written to disk.
model = AutoModelForCausalLM.from_pretrained(model_type)
# Size the embeddings for the *merged* tokenizer that is saved below. Using
# len(tokenizer) (the newly trained tokenizer) would leave ids issued by the
# merged tokenizer outside the embedding matrix.
model.resize_token_embeddings(len(old_tokenizer))

old_tokenizer.save_pretrained("../models/phi-2-mlb/tokenizer_merged")

Before adding mlb: 60359
After adding mlb: 60359


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('../models/phi-2-mlb/tokenizer_merged\\tokenizer_config.json',
 '../models/phi-2-mlb/tokenizer_merged\\special_tokens_map.json',
 '../models/phi-2-mlb/tokenizer_merged\\vocab.json',
 '../models/phi-2-mlb/tokenizer_merged\\merges.txt',
 '../models/phi-2-mlb/tokenizer_merged\\added_tokens.json',
 '../models/phi-2-mlb/tokenizer_merged\\tokenizer.json')

In [21]:
# Persist the resized model next to the merged tokenizer: safetensors format,
# with checkpoint shards capped at 4GB each.
model.save_pretrained(
    "../models/phi-2-mlb",
    max_shard_size='4GB',
    safe_serialization=True,
)