In [40]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(len(tokenizer))  # Output: 30522

30522


In [42]:
import torch
from transformers import BertTokenizer, T5Tokenizer
import tiktoken

def tokenize_bert(text):
    """Tokenize text using BERT tokenizer"""
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokens = tokenizer.encode(text, add_special_tokens=True)
    return tokens

def tokenize_t5(text):
    """Tokenize text using T5 tokenizer"""
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    tokens = tokenizer.encode(text, add_special_tokens=True)
    return tokens

def tokenize_fasttext(text):
    """Tokenize text using FastText"""
    # FastText uses simple space-based tokenization
    return text.split()

def tokenize_openai(text, model="gpt-3.5-turbo"):
    """Tokenize text using OpenAI's tiktoken"""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return tokens

# Example usage
text = "Natural language processing is fascinating!"

print("BERT tokenization:")
bert_tokens = tokenize_bert(text)
print("Tokens:", bert_tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("Words:", tokenizer.convert_ids_to_tokens(bert_tokens))

# print("\nT5 tokenization:")
# t5_tokens = tokenize_t5(text)
# print("Tokens:", t5_tokens)
# tokenizer = T5Tokenizer.from_pretrained('t5-base')
# print("Words:", tokenizer.convert_ids_to_tokens(t5_tokens))

print("\nFastText tokenization:")
fasttext_tokens = tokenize_fasttext(text)
print("Tokens (words):", fasttext_tokens)

print("\nOpenAI tokenization:") 
openai_tokens = tokenize_openai(text)
print("Tokens:", openai_tokens)
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
print("Words:", encoding.decode_tokens_bytes(openai_tokens))


BERT tokenization:
Tokens: [101, 6240, 1846, 6165, 1110, 19601, 106, 102]
Words: ['[CLS]', 'meat', '後', 'rates', 'ɑ', 'facilitated', '[unused101]', '[SEP]']

FastText tokenization:
Tokens (words): ['Natural', 'language', 'processing', 'is', 'fascinating!']

OpenAI tokenization:
Tokens: [55381, 4221, 8863, 374, 27387, 0]
Words: [b'Natural', b' language', b' processing', b' is', b' fascinating', b'!']


In [44]:
def example_tokenization(text):
    print('text length (bytes):', len(text))

    words = text.split()
    avg_word_length = sum(len(word) for word in words) / len(words)
    print(f"\nAverage word length: {avg_word_length:.2f} characters ) add one for space\n")

    print("BERT tokenization:")
    bert_tokens = tokenize_bert(text)
    print("Tokens:", bert_tokens)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    words_list_bert = tokenizer.convert_ids_to_tokens(bert_tokens)
    print("Words:", words_list_bert)
    print(f'{len(bert_tokens)} tokens, about {2*len(bert_tokens)} bytes')

    print("\nOpenAI tokenization:") 
    openai_tokens = tokenize_openai(text)
    print("Tokens:", openai_tokens)
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    words_list_openai = encoding.decode_tokens_bytes(openai_tokens)
    print("Words:", words_list_openai)
    print(f'{len(openai_tokens)} tokens, about {3*len(openai_tokens)} bytes')


    return words_list_bert, words_list_openai
    


In [12]:
# Example usage
text = "take me to the best restorant in town"
example_tokenization(text)

text length (bytes): 37 

BERT tokenization:
Tokens: [101, 2202, 2033, 2000, 1996, 2190, 2717, 18842, 2102, 1999, 2237, 102]
Words: ['[CLS]', 'take', 'me', 'to', 'the', 'best', 'rest', '##oran', '##t', 'in', 'town', '[SEP]']
12 tokens, about 24 bytes

FastText tokenization:
Tokens (words): ['take', 'me', 'to', 'the', 'best', 'restorant', 'in', 'town']
8

OpenAI tokenization:
Tokens: [23609, 757, 311, 279, 1888, 2800, 269, 519, 304, 6424]
Words: [b'take', b' me', b' to', b' the', b' best', b' rest', b'or', b'ant', b' in', b' town']
10


In [13]:
# Example usage
text = "Supercalifrajelisticexpialidoshes"
example_tokenization(text)

text length (bytes): 33 

BERT tokenization:
Tokens: [101, 3565, 9289, 10128, 14220, 29282, 4588, 10288, 19312, 21273, 17369, 2229, 102]
Words: ['[CLS]', 'super', '##cal', '##if', '##raj', '##elis', '##tic', '##ex', '##pia', '##lid', '##osh', '##es', '[SEP]']
13 tokens, about 26 bytes

FastText tokenization:
Tokens (words): ['Supercalifrajelisticexpialidoshes']
1

OpenAI tokenization:
Tokens: [10254, 3035, 278, 333, 56486, 301, 4633, 4683, 532, 307, 9451, 288]
Words: [b'Sup', b'erc', b'al', b'if', b'raj', b'el', b'istic', b'exp', b'ial', b'id', b'osh', b'es']
12


In [14]:
# Example usage
text = "These models do not use a fixed word vocabulary, but rather a token vocabulary (based on subwords using BPE or WordPiece)."
example_tokenization(text)

text length (bytes): 122 

BERT tokenization:
Tokens: [101, 2122, 4275, 2079, 2025, 2224, 1037, 4964, 2773, 16188, 1010, 2021, 2738, 1037, 19204, 16188, 1006, 2241, 2006, 4942, 22104, 2478, 17531, 2063, 2030, 2773, 11198, 1007, 1012, 102]
Words: ['[CLS]', 'these', 'models', 'do', 'not', 'use', 'a', 'fixed', 'word', 'vocabulary', ',', 'but', 'rather', 'a', 'token', 'vocabulary', '(', 'based', 'on', 'sub', '##words', 'using', 'bp', '##e', 'or', 'word', '##piece', ')', '.', '[SEP]']
30 tokens, about 60 bytes

FastText tokenization:
Tokens (words): ['These', 'models', 'do', 'not', 'use', 'a', 'fixed', 'word', 'vocabulary,', 'but', 'rather', 'a', 'token', 'vocabulary', '(based', 'on', 'subwords', 'using', 'BPE', 'or', 'WordPiece).']
21

OpenAI tokenization:
Tokens: [9673, 4211, 656, 539, 1005, 264, 8521, 3492, 36018, 11, 719, 4856, 264, 4037, 36018, 320, 31039, 389, 1207, 5880, 1701, 426, 1777, 477, 9506, 32309, 570]
Words: [b'These', b' models', b' do', b' not', b' use', b' a', b' fixed', 

In [15]:

text = "A Forecasting competition for the Advanced Machine Learning course, which is part of the M.Sc. degree at Reichman University"
example_tokenization(text)

text length (bytes): 128 

BERT tokenization:
Tokens: [101, 1037, 19939, 2075, 2971, 2005, 1996, 3935, 3698, 4083, 2607, 1010, 2029, 2003, 2112, 1997, 1996, 1049, 1012, 8040, 1012, 3014, 2012, 14365, 2386, 2118, 102]
Words: ['[CLS]', 'a', 'forecast', '##ing', 'competition', 'for', 'the', 'advanced', 'machine', 'learning', 'course', ',', 'which', 'is', 'part', 'of', 'the', 'm', '.', 'sc', '.', 'degree', 'at', 'reich', '##man', 'university', '[SEP]']
27 tokens, about 54 bytes

FastText tokenization:
Tokens (words): ['A', 'Forecasting', 'competition', 'for', 'the', 'Advanced', 'Machine', 'Learning', 'course,', 'which', 'is', 'part', 'of', 'the', 'M.Sc.', 'degree', 'at', 'Reichman', 'University']
19

OpenAI tokenization:
Tokens: [262, 362, 56775, 287, 10937, 369, 279, 21844, 13257, 21579, 3388, 11, 902, 374, 961, 315, 279, 386, 18832, 13, 8547, 520, 51659, 1543, 3907]
Words: [b'   ', b' A', b' Forecast', b'ing', b' competition', b' for', b' the', b' Advanced', b' Machine', b' Learning', b'

In [16]:
# Calculate average word length
text = "A Forecasting competition for the Advanced Machine Learning course, which is part of the M.Sc. degree at Reichman University"
words = text.split()
avg_word_length = sum(len(word) for word in words) / len(words)
print(f"\nAverage word length: {avg_word_length:.2f} characters")



Average word length: 5.58 characters


In [45]:
text = """
Dec 25, 2024: Nike has launched its first ever collaboration with an Indian fashion label, unveiling a range of patterned sportswear inspired by the country’s ancient tie-dying techniques.
Created alongside Delhi-based brand NorBlack NorWhite, the colorful footwear and apparel collection “invites women into sport” while celebrating “Indian culture and craftsmanship,” according to a Nike press release.
The brands’ new campaign, unveiled this week, features Indian cricketers Jemimah Rodrigues and Shafali Verma, alongside wrestler Anshu Malik and sprinter Priya Mohan. Accompanying images, shot by celebrated fashion photographer Bharat Sikka, show the female athletes posing in the historic city of Jaipur — including on its iconic stepwells.
The move signals Nike’s renewed ambitions for a market in which it has experienced mixed fortunes since entering, via a licensing deal, in 1995. After established a wholly owned subsidiary nine years later, the company gambled heavily on the country’s most popular sport, cricket, beating rivals Adidas and Reebok to a 1.97-billion-rupee (then $44 million) deal to outfit the Indian national team in 2005.
But Nike has since struggled to make commercial inroads, with local media reporting in 2019 that it had slashed the number of stores in India to 150, down from a peak of 350. The brand’s website directory currently lists just 93 stores in India, compared to more than 2,600 in mainland China, a market of comparable population.
The sportwear giant also appears to be pushing its women’s apparel business, which has traditionally lagged behind its menswear. Several recent campaigns have spotlighted female athletes, with this year’s Super Bowl ad featuring WNBA star Caitlin Clark and Olympic sprinter Sha’Carri Richardson.
"""
bert_words, openai_words = example_tokenization(text)

text length (bytes): 1778

Average word length: 5.63 characters ) add one for space

BERT tokenization:
Tokens: [101, 13063, 1512, 117, 17881, 1527, 131, 20100, 1144, 2536, 1157, 1148, 1518, 5294, 1114, 1126, 1890, 4633, 3107, 117, 8362, 2707, 10689, 170, 2079, 1104, 4844, 1174, 2865, 14719, 3768, 1118, 1103, 1583, 787, 188, 2890, 5069, 118, 5694, 4884, 119, 25423, 3338, 6175, 118, 1359, 4097, 16162, 2064, 1742, 2158, 16162, 2924, 17481, 1162, 117, 1103, 15302, 2555, 14719, 1105, 12647, 24971, 2436, 789, 20384, 1535, 1154, 4799, 790, 1229, 14118, 789, 1890, 2754, 1105, 22009, 15924, 117, 790, 2452, 1106, 170, 20100, 3181, 1836, 119, 1109, 10915, 787, 1207, 2322, 117, 11770, 1142, 1989, 117, 1956, 1890, 9469, 1116, 27901, 3080, 26363, 11945, 17305, 10589, 1105, 156, 2328, 8057, 2646, 159, 1200, 1918, 117, 3338, 11113, 1760, 16138, 15147, 1105, 24360, 153, 16383, 20165, 119, 138, 14566, 8223, 18266, 1158, 4351, 117, 2046, 1118, 5719, 4633, 8152, 139, 10131, 1204, 14159, 19610, 117, 1437,

In [30]:
# Convert bytes to strings in openai_words list
openai_words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in openai_words]
openai_words_string = ''.join(openai_words)
openai_words_string

'\nDec 25, 2024: Nike has launched its first ever collaboration with an Indian fashion label, unveiling a range of patterned sportswear inspired by the country’s ancient tie-dying techniques.\nCreated alongside Delhi-based brand NorBlack NorWhite, the colorful footwear and apparel collection “invites women into sport” while celebrating “Indian culture and craftsmanship,” according to a Nike press release.\nThe brands’ new campaign, unveiled this week, features Indian cricketers Jemimah Rodrigues and Shafali Verma, alongside wrestler Anshu Malik and sprinter Priya Mohan. Accompanying images, shot by celebrated fashion photographer Bharat Sikka, show the female athletes posing in the historic city of Jaipur — including on its iconic stepwells.\nThe move signals Nike’s renewed ambitions for a market in which it has experienced mixed fortunes since entering, via a licensing deal, in 1995. After established a wholly owned subsidiary nine years later, the company gambled heavily on the count

In [32]:
text == openai_words_string


True

In [50]:
# Join BERT words, handling special '##' prefix
bert_text = ""
for i, word in enumerate(bert_words):
    if word == '[CLS]':
        continue
    if word == '[SEP]':
        continue
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    if word.startswith('##'):
        bert_text += word[2:]  # Remove '##' and append directly
    else:
        if i > 0:  # Add space before word unless it's the first word
            bert_text += " "
        bert_text += word

print(f"\nOriginal text equals BERT reconstructed text: {text == bert_text}")







Original text equals BERT reconstructed text: False


In [56]:
stripped_text = text.replace(' ', '').replace('\n', '')
stripped_bert_text = bert_text.replace(' ', '').replace('\n', '')
print(stripped_text == stripped_bert_text)
print(stripped_text)
print(stripped_bert_text)






True
Dec25,2024:NikehaslauncheditsfirstevercollaborationwithanIndianfashionlabel,unveilingarangeofpatternedsportswearinspiredbythecountry’sancienttie-dyingtechniques.CreatedalongsideDelhi-basedbrandNorBlackNorWhite,thecolorfulfootwearandapparelcollection“inviteswomenintosport”whilecelebrating“Indiancultureandcraftsmanship,”accordingtoaNikepressrelease.Thebrands’newcampaign,unveiledthisweek,featuresIndiancricketersJemimahRodriguesandShafaliVerma,alongsidewrestlerAnshuMalikandsprinterPriyaMohan.Accompanyingimages,shotbycelebratedfashionphotographerBharatSikka,showthefemaleathletesposinginthehistoriccityofJaipur—includingonitsiconicstepwells.ThemovesignalsNike’srenewedambitionsforamarketinwhichithasexperiencedmixedfortunessinceentering,viaalicensingdeal,in1995.Afterestablishedawhollyownedsubsidiarynineyearslater,thecompanygambledheavilyonthecountry’smostpopularsport,cricket,beatingrivalsAdidasandReeboktoa1.97-billion-rupee(then$44million)dealtooutfittheIndiannationalteamin2005.ButNikehass

In [63]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
vocab_items = list(tokenizer.vocab.items())
first_200 = vocab_items[13063 : 13063 + 200]

# Print each token and its ID
for token, token_id in first_200:
    print(f"{token_id}: {token}")


13063: Dec
13064: ##aneous
13065: chambers
13066: Color
13067: Gus
13068: ##site
13069: Alternative
13070: ##world
13071: Exeter
13072: Omaha
13073: celebrities
13074: striker
13075: 210
13076: dwarf
13077: meals
13078: Oriental
13079: Pearson
13080: financing
13081: revenues
13082: underwater
13083: Steele
13084: screw
13085: Feeling
13086: Mt
13087: acids
13088: badge
13089: swore
13090: theaters
13091: Moving
13092: admired
13093: lung
13094: knot
13095: penalties
13096: 116
13097: fork
13098: ##cribed
13099: Afghan
13100: outskirts
13101: Cambodia
13102: oval
13103: wool
13104: fossils
13105: Ned
13106: Countess
13107: Darkness
13108: delicious
13109: ##nica
13110: Evelyn
13111: Recordings
13112: guidelines
13113: ##CP
13114: Sandra
13115: meantime
13116: Antarctica
13117: modeling
13118: granddaughter
13119: ##rial
13120: Roma
13121: Seventh
13122: Sunshine
13123: Gabe
13124: ##nton
13125: Shop
13126: Turks
13127: prolific
13128: soup
13129: parody
13130: ##nta
13131: Judith
13132

In [65]:
# Load BERT model
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-cased')

# Get the token ID for the word "word"
word_token = tokenizer.encode("word", add_special_tokens=False)[0]
print(f"\nToken ID for 'word': {word_token}")

# Get the embedding for this token from the model's embedding layer
word_embedding = model.embeddings.word_embeddings(torch.tensor([word_token]))
print(f"\nEmbedding vector for 'word' (shape {word_embedding.shape}):")
print(word_embedding)



ImportError: Traceback (most recent call last):
  File "c:\Users\Me\anaconda3\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

lamplezip
wordpiece

positive vs Negative