In [4]:
from transformers import AutoTokenizer

def analyze_tokens(text, model_name="qing-yao/babylm-balanced_seed-42_1e-3"):
    """
    Tokenize ``text`` with a pretrained tokenizer, print a summary and return the details.

    Args:
        text (str): Input text to tokenize.
        model_name (str): Identifier handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        dict: Four entries, in this order: ``'original_text'`` (the input),
        ``'tokens'``, ``'token_ids'`` and ``'attention_mask'``. The last three
        are taken from the first (index 0) row of the encoding.

    Side effects:
        Prints the original text, the space-joined tokens and the token IDs.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tensors are requested from the tokenizer; only row 0 is inspected below.
    encoded = hf_tokenizer(text, return_tensors="pt")
    first_row_ids = encoded['input_ids'][0]

    # Map every ID back to its token string so the split is human-readable.
    pieces = hf_tokenizer.convert_ids_to_tokens(first_row_ids)
    id_list = first_row_ids.tolist()

    summary = {
        'original_text': text,
        'tokens': pieces,
        'token_ids': id_list,
        'attention_mask': encoded['attention_mask'][0].tolist(),
    }

    # Human-readable report
    print(f"\nOriginal text: {text}")
    print("\nTokens:", " ".join(pieces))
    print("\nToken IDs:", id_list)

    return summary

# Example usage: the same sentence with and without a space before the closing quote
for text in ("I love \"machines \".", "I love \"machines\"."):
    results = analyze_tokens(text)


Original text: I love "machines ".

Tokens: <s> i love " machines ".

Token IDs: [1, 12, 203, 21, 3760, 192]

Original text: I love "machines".

Tokens: <s> i love " machines ".

Token IDs: [1, 12, 203, 21, 3760, 192]


In [1]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    """Build a ``Tokenizer`` for ``nlp`` with adjusted infix (token-internal split) rules.

    Starting from ``nlp.Defaults.infixes``:
      * the generic operator rule (``+ - * ^`` between a digit and a digit or ``-``)
        is removed and re-added without the plain ``-`` case, plus a rule for a
        ``-`` that follows a digit and precedes another ``-``;
      * every pattern containing the hyphen alternation ``-|–|—|--|---|——|~``
        is dropped (the "hyphen between letters" rule).

    Prefix search, suffix search and ``token_match`` are reused from
    ``nlp.tokenizer``; the exception rules come from ``nlp.Defaults``.

    Args:
        nlp: A loaded spaCy pipeline.

    Returns:
        A new ``Tokenizer`` sharing ``nlp.vocab``.

    Raises:
        ValueError: If the generic operator pattern is not among the default infixes.

    NOTE(review): no ``url_match`` is forwarded to the new tokenizer - confirm
    that this is intended.
    """
    generic_numeric_op = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    hyphen_alternation = "-|–|—|--|---|——|~"

    patterns = list(nlp.Defaults.infixes)
    patterns.remove(generic_numeric_op)  # raises ValueError when the default rule is absent
    patterns += [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]

    kept_patterns = []
    for pattern in patterns:
        if hyphen_alternation in pattern:
            continue  # skip the rule that splits on hyphens between letters
        kept_patterns.append(pattern)
    infix_re = compile_infix_regex(kept_patterns)

    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# spaCy setup (running on the GPU is actually faster)
# Request GPU execution before the pipeline is loaded; the value returned by
# prefer_gpu() is printed so the reader can see what was reported.
gpu = spacy.prefer_gpu()
print(gpu)
# Load the "en_core_web_trf" pipeline, then replace its tokenizer with the one
# built by custom_tokenizer() (adjusted infix rules).
nlp = spacy.load("en_core_web_trf")
nlp.tokenizer = custom_tokenizer(nlp)

def get_children_flatten(token, depth=0, dep=False, return_tokens=False, include_self=False):
    """Recursively collect the descendants of ``token`` as a flat list.

    Args:
        token: Node exposing ``text`` and ``children`` (plus ``dep_``, ``tag_``
            and ``i`` when ``dep`` is true), e.g. a spaCy token.
        depth: Depth value stored with the entries produced by this call; the
            recursion passes ``depth + 1`` for each child's own children.
        dep: If false, entries are lower-cased text strings. If true, entries
            are tuples ``(text_lower, dep_, tag_, depth, i)``.
        return_tokens: Only used when ``dep`` is true; appends the node object
            itself as a sixth tuple element.
        include_self: Also emit an entry for ``token`` itself, placed first.
            It is not forwarded to the recursive calls.

    Returns:
        list: Entries in pre-order (each child followed by its own descendants).
        Direct children are recorded with the same ``depth`` as the call that
        visits them, so ``token`` and its direct children share one depth value.
    """

    def describe(node):
        # Single place that builds an entry, for the starting token and for children.
        lowered = node.text.lower()
        if not dep:
            return lowered
        record = (lowered, node.dep_, node.tag_, depth, node.i)
        return record + (node,) if return_tokens else record

    collected = [describe(token)] if include_self else []
    for child in token.children:
        collected.append(describe(child))
        collected += get_children_flatten(child, depth + 1, dep, return_tokens)
    return collected

def get_phrasal_children(child):
    """Return the phrase spanned by ``child`` and the index of its first token.

    The entries for ``child`` and all of its descendants (from
    ``get_children_flatten`` with ``dep=True, include_self=True``) are ordered
    by token index and their lower-cased texts joined with single spaces.
    The pieces ``'s`` and ```s`` are attached to the preceding word without a
    space.

    Returns:
        tuple: ``(text, i)`` where ``i`` is the smallest token index in the phrase.
    """
    ordered = sorted(
        get_children_flatten(child, dep=True, include_self=True),
        key=lambda entry: entry[4],
    )

    pieces = []
    for entry in ordered:
        word = entry[0]
        if word in ("'s", "`s"):
            pieces.append(word)
        else:
            pieces.append(" " + word)
    phrase = "".join(pieces).strip()

    first_index = int(ordered[0][4])
    return phrase, first_index

# Sample sentence; note the space before the closing quotation mark.
sentence = "Film critic Andrew Osmond wrote that the epilogue hurt the film's integrity for \"giving cartoon powers of survival that the film had rejected until then to Fritz \"."
# Run the pipeline on the sentence and print each token's dependency label
# (token.dep_), one per line.
for token in nlp(sentence):
    print(token.dep_)

True


  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):


compound
compound
compound
nsubj
ROOT
mark
det
nsubj
ccomp
det
poss
case
dobj
prep
punct
pcomp
compound
dobj
prep
pobj
dobj
det
nsubj
aux
relcl
prep
pcomp
dative
pobj
punct
punct
