In [19]:
import pandas as pd

## Character-level tokenization

In [41]:
text = "Tokenizing text is a core task of NLP."
tokenized_text = list(text)
print(tokenized_text)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', '.', 'L', 'N', 'P', 'T', 'a', 'a', 'c', 'e', 'e', 'e', 'f', 'g', 'i', 'i', 'i', 'k', 'k', 'n', 'n', 'o', 'o', 'o', 'r', 's', 's', 't', 't', 't', 'x', 'z']


## Numericalization 
- each character needs to be converted to an integer


In [9]:
# Based on index postion, use enumerate
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}
print(token2idx)
             

{' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'T': 5, 'a': 6, 'c': 7, 'e': 8, 'f': 9, 'g': 10, 'i': 11, 'k': 12, 'n': 13, 'o': 14, 'r': 15, 's': 16, 't': 17, 'x': 18, 'z': 19}


In [95]:
# Based on Unicode, use ord
token2uni = {ch: ord(ch) for ch in set(tokenized_text)}
print(token2uni)

{'c': 99, 'e': 101, 'r': 114, 'g': 103, 'i': 105, 'k': 107, 'n': 110, 'P': 80, 'T': 84, 's': 115, 'a': 97, 'N': 78, 'x': 120, '.': 46, 'L': 76, ' ': 32, 'f': 102, 'o': 111, 'z': 122, 't': 116}


## Transform the tokenized text to a list of integers

### Based on index postion

In [173]:
# Based on index postion
idx_id = []

for ch in text:
    idx_id.append(token2idx[ch])

print(idx_id)
print(len(idx_id))
print(max(idx_id))


[5, 14, 12, 8, 13, 11, 19, 11, 13, 10, 0, 17, 8, 18, 17, 0, 11, 16, 0, 6, 0, 7, 14, 15, 8, 0, 17, 6, 16, 12, 0, 14, 9, 0, 3, 2, 4, 1]
38
19


### Based on Unicode assignment

In [193]:
# Sorted list of Unicode characters
sorted_token2uni_list = sorted(list(token2uni))
print(token2uni_list)
# Sorted list of Unicode values
sorted_uni_values = sorted(set(ord(ch) for ch in tokenized_text))
print(sorted_uni_values)

[' ', '.', 'L', 'N', 'P', 'T', 'a', 'c', 'e', 'f', 'g', 'i', 'k', 'n', 'o', 'r', 's', 't', 'x', 'z']
[32, 46, 76, 78, 80, 84, 97, 99, 101, 102, 103, 105, 107, 110, 111, 114, 115, 116, 120, 122]


In [175]:
# based on unicode
uni_id = []

for ch in text:
    value = token2uni.get(ch)
    uni_id.append(value)

print(uni_id)
print(len(uni_id))
print(max(uni_id))
 

[84, 111, 107, 101, 110, 105, 122, 105, 110, 103, 32, 116, 101, 120, 116, 32, 105, 115, 32, 97, 32, 99, 111, 114, 101, 32, 116, 97, 115, 107, 32, 111, 102, 32, 78, 76, 80, 46]
38
122


## Convert list of integers into 2D tensors with One Hot Encoding

- Each row will be a one-hot vector of mostly 0s with a single 1 representing a character.
- Each character from the original string is represented by a unique row vector.

In [143]:
pip install torch torchvision

Collecting torch
  Downloading torch-2.7.0-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.7.0-cp312-none-macosx_11_0_arm64.whl (68.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m0m
[?25hDownloading torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: sympy, torch, torchvision
  Attempting uninstall: s

In [153]:
import torch
import torch.nn.functional as F


In [159]:
idx_id = torch.tensor(idx_id)
idx_encoded = F.one_hot(idx_id, num_classes=len(token2idx))
idx_encoded.shape

torch.Size([38, 20])

In [215]:
print(f"Token: {tokenized_text[11]}")
print(f"Tensor index: {idx_id[11]}")
print(f"One-hot: {idx_encoded[11]}")

Token: T
Tensor index: 17
One-hot: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])


In [203]:
# Map the unicode assignment to 0-based indexing assignment
uni2idx = {val: idx for idx, val in enumerate(sorted_uni_values)}
print(uni2idx)
# Convert uni_id to remapped index (use sorted_uni_values from prvious block)
uni_idx_id = [uni2idx[val] for val in uni_id]
print(uni_idx_id)

{32: 0, 46: 1, 76: 2, 78: 3, 80: 4, 84: 5, 97: 6, 99: 7, 101: 8, 102: 9, 103: 10, 105: 11, 107: 12, 110: 13, 111: 14, 114: 15, 115: 16, 116: 17, 120: 18, 122: 19}
[5, 14, 12, 8, 13, 11, 19, 11, 13, 10, 0, 17, 8, 18, 17, 0, 11, 16, 0, 6, 0, 7, 14, 15, 8, 0, 17, 6, 16, 12, 0, 14, 9, 0, 3, 2, 4, 1]


In [205]:
uni_idx_id = torch.tensor(uni_idx_id)
uni_encoded = F.one_hot(uni_idx_id, num_classes=len(uni2idx))
uni_encoded.shape

torch.Size([38, 20])

In [213]:
print(f"Token: {tokenized_text[11]}")
print(f"Tensor index: {uni_id[11]}")
print(f"One-hot: {uni_encoded[11]}")

Token: T
Tensor index: 116
One-hot: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])


In [211]:
print(f"One-hot: {uni_encoded[1]}")

One-hot: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])


In [234]:
print(uni_encoded[5:10])
print('------------------------------------------------------------------------')
print(idx_encoded[5:10])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
------------------------------------------------------------------------
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


Reference:
https://medium.com/@abdallahashraf90x/tokenization-in-nlp-all-you-need-to-know-45c00cfa2df7