In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import open_clip
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import numpy as np
import random
import pytorch_lightning as pl
import argparse
import time

In [3]:
tokenizer = open_clip.get_tokenizer('ViT-B-16-SigLIP-512')


In [5]:
print(tokenizer("What is the capital of Canada?"))
print(tokenizer("I love my wife Ritsu"))

tensor([[  347,   269,   260,  1914,   267, 12706,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]])
tensor([[ 262,  302,  491,  320, 2041,  262,  331, 7962,  432,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]])


In [5]:
tokenizer.tokenizer.save_pretrained('myExportTest')

('myExportTest\\tokenizer_config.json',
 'myExportTest\\special_tokens_map.json',
 'myExportTest\\tokenizer.json')

In [6]:
type(tokenizer.tokenizer)

transformers.models.t5.tokenization_t5_fast.T5TokenizerFast

In [13]:
# Test encoding without padding/truncation
result = tokenizer.tokenizer.encode("What is the capital of Canada?", add_special_tokens=True)
print("With special tokens:", result)
print("Length:", len(result))

# Test encoding with padding disabled
result_no_pad = tokenizer.tokenizer.encode("What is the capital of Canada?", add_special_tokens=True, padding=False)
print("No padding:", result_no_pad)
print("Length:", len(result_no_pad))


result_no_pad = tokenizer.tokenizer.encode("What is the capital of Canada?", add_special_tokens=False)
print(f"add_special_tokens = False: {result_no_pad}")
print("Length:", len(result_no_pad))

# Test what tokenizer() does (this is what you used)
result_call = tokenizer("What is the capital of Canada?")
print("tokenizer() call:", result_call)
print("tokenizer() ids:", result_call.tolist() if hasattr(result_call, 'tolist') else result_call)


With special tokens: [496, 269, 260, 1914, 267, 1717, 308, 1]
Length: 8
No padding: [496, 269, 260, 1914, 267, 1717, 308, 1]
Length: 8
add_special_tokens = False: [496, 269, 260, 1914, 267, 1717, 308]
Length: 7
tokenizer() call: tensor([[  347,   269,   260,  1914,   267, 12706,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]])
tokenizer() ids: [[347, 269, 260, 1914, 267, 12706, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [9]:
# Check what the actual tokenizer.encode() returns without any wrapper
from transformers import AutoTokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained('myExportTest')
result_direct = loaded_tokenizer.encode("What is the capital of Canada?", add_special_tokens=True)
print("Direct encode:", result_direct)
print("First 10 tokens:", result_direct[:10])

# Compare with open_clip's tokenizer
result_clip = tokenizer("What is the capital of Canada?")
print("\nOpenCLIP tokenizer result:", result_clip)
if hasattr(result_clip, 'tolist'):
    print("OpenCLIP ids (first 10):", result_clip.tolist()[:10])
elif isinstance(result_clip, dict):
    print("OpenCLIP ids (first 10):", result_clip['input_ids'][:10] if 'input_ids' in result_clip else result_clip)


Direct encode: [496, 269, 260, 1914, 267, 1717, 308, 1]
First 10 tokens: [496, 269, 260, 1914, 267, 1717, 308, 1]

OpenCLIP tokenizer result: tensor([[  347,   269,   260,  1914,   267, 12706,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]])
OpenCLIP ids (first 10): [[347, 269, 260, 1914, 267, 12706, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [10]:
# Check what tokens 347, 496, 12706, 1717, and 308 represent
loaded_tokenizer = AutoTokenizer.from_pretrained('myExportTest')
print("Token 347:", loaded_tokenizer.convert_ids_to_tokens([347]))
print("Token 496:", loaded_tokenizer.convert_ids_to_tokens([496]))
print("Token 12706:", loaded_tokenizer.convert_ids_to_tokens([12706]))
print("Token 1717:", loaded_tokenizer.convert_ids_to_tokens([1717]))
print("Token 308:", loaded_tokenizer.convert_ids_to_tokens([308]))

# Also check the full encoding to see the difference
text = "What is the capital of Canada?"
result = loaded_tokenizer.encode(text, add_special_tokens=True)
tokens = loaded_tokenizer.convert_ids_to_tokens(result)
print(f"\nFull encoding of '{text}':")
print("Token IDs:", result[:10])
print("Tokens:", tokens[:10])


Token 347: ['▁what']
Token 496: ['▁What']
Token 12706: ['▁canada']
Token 1717: ['▁Canada']
Token 308: ['?']

Full encoding of 'What is the capital of Canada?':
Token IDs: [496, 269, 260, 1914, 267, 1717, 308, 1]
Tokens: ['▁What', '▁is', '▁the', '▁capital', '▁of', '▁Canada', '?', '</s>']


In [14]:
# Test to confirm: open_clip tokenizer lowercases the input
text1 = "What is the capital of Canada?"
text2 = "what is the capital of canada?"

# Direct tokenizer encode (no preprocessing)
result1_direct = tokenizer.tokenizer.encode(text1, add_special_tokens=True)
result2_direct = tokenizer.tokenizer.encode(text2, add_special_tokens=True)
print("Direct encode('What...'):", result1_direct)
print("Direct encode('what...'):", result2_direct)
print("Are they different?", result1_direct != result2_direct)

# open_clip wrapper (with preprocessing)
result1_wrapper = tokenizer(text1)
result2_wrapper = tokenizer(text2)
print("\nWrapper('What...'):", result1_wrapper.tolist()[0][:8] if hasattr(result1_wrapper, 'tolist') else result1_wrapper)
print("Wrapper('what...'):", result2_wrapper.tolist()[0][:8] if hasattr(result2_wrapper, 'tolist') else result2_wrapper)
print("Are they the same?", (result1_wrapper.tolist()[0][:8] == result2_wrapper.tolist()[0][:8] if hasattr(result1_wrapper, 'tolist') else result1_wrapper == result2_wrapper))


Direct encode('What...'): [496, 269, 260, 1914, 267, 1717, 308, 1]
Direct encode('what...'): [347, 269, 260, 1914, 267, 12706, 308, 1]
Are they different? True

Wrapper('What...'): [347, 269, 260, 1914, 267, 12706, 1, 1]
Wrapper('what...'): [347, 269, 260, 1914, 267, 12706, 1, 1]
Are they the same? True
