# Use Word Segmentation to create a Custom Tokenizer

Here, instead of defining a custom tokenizer, we pre-tokenize the text using a segmenter's predictions and upload the result as a new dataset, creating a tokenizer that matches the resulting vocabulary.
E.g. if the text previously consisted of `l o o k t h e r e` and the segmenter split it into `look th ere` then that would be added to the new dataset, with `look`, `th` and `ere` added to the vocabulary.

We use the GPT-2 BabyLM phoneme model to create segmentations.

In [6]:
import pickle
import sys
sys.path.append('../../')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from src.preprocessing import DataPreprocessingParams, DataPreprocessor
from src.config import DatasetParams
from src.utils import setup
from src.evaluation.segmentation import GPT2Segmenter

# Model identifier passed to `from_pretrained` for both the tokenizer and the causal LM.
MODEL_NAME = 'phonemetransformers/GPT2-85M-CHAR-PHON-SPACELESS'
# Dataset identifier handed to `DatasetParams(name=...)` when the corpus is loaded.
DATASET_NAME = 'phonemetransformers/BabyLM-phonemized'

## Load Model and Tokenizer

In [4]:
# The tokenizer and the causal LM are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Which dataset/subconfig to load and which column holds the phonemized text.
dataset_params = DatasetParams(
    subconfig='strict',
    name=DATASET_NAME,
    is_phonemes=True,
    text_column='phonemized_utterance',
)

# Preprocessing options.
# NOTE(review): presumably word boundaries are stripped from the model input
# (`remove_word_boundaries=True`) while their positions are still recorded
# (`get_word_boundaries=True`) -- confirm against src.preprocessing.
data_processing_params = DataPreprocessingParams(
    max_input_length=64,
    join_utts='static',
    remove_word_boundaries=True,
)
data_preprocessor = DataPreprocessor(
    data_processing_params,
    tokenizer=tokenizer,
    get_word_boundaries=True,
)

# Load the corpus, keep only the 'train' split, and run the preprocessor over
# it in batches, dropping the "text" column from the result.
dataset = setup.load_dataset(dataset_params)['train'].map(
    data_preprocessor,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/11317151 [00:00<?, ? examples/s]

In [None]:
# Build the segmenter from the model, tokenizer and preprocessed training data.
# NOTE(review): `max_sequence_length=256` and `stride=30` are unexplained here
# and differ from the preprocessor's `max_input_length=64` -- confirm intended.
segmenter = GPT2Segmenter(
    model,
    tokenizer,
    dataset,
    max_sequence_length=256,
    stride=30,
    subsample=False,
)

In [None]:
# Persist the segmenter to disk so it can be reloaded without rebuilding it.
# `pickle` is imported in the notebook's top import cell rather than here, so a
# fresh-kernel "Run All" surfaces every dependency up front.
# NOTE(review): the segmenter was constructed from the model, tokenizer and
# dataset, so this file may be large -- confirm what GPT2Segmenter retains.
# Only unpickle this file from a trusted location: `pickle.load` can execute
# arbitrary code.
SEGMENTER_PICKLE_PATH = 'gpt2_segmenter.pkl'
with open(SEGMENTER_PICKLE_PATH, 'wb') as f:
    pickle.dump(segmenter, f)