In [1]:
import torch
import re

### Read the word list and convert it into a `unique_words` Python list

In [2]:
# Folder (and file-name prefix) of the document whose word list we load
doc_name = "the_return_of_sherlock_holmes"

file_path = f"{doc_name}/{doc_name}_words.txt"

# Read the whole file: iterating a text file yields its lines one by one,
# each still carrying its trailing newline
with open(file_path, 'r', encoding='utf-8') as file:
    lines_list = list(file)

# Strip surrounding whitespace (including the newline) from every line
unique_words = list(map(str.strip, lines_list))


### Let's concatenate all the unique words into a single `list_as_text` string. (You could also work directly with the Python list.)

In [3]:
# Join every entry of unique_words into one space-separated string
list_as_text = ' '.join(unique_words)

In [4]:
# Preview only the beginning: the full string holds the entire joined word
# list and would flood the cell output
list_as_text[:200]



### Let's define our special tokens:

In [5]:
# Reserved vocabulary entries used by the tokenizer
pad_token = "[PAD]"      # padding token
unknown_token = "[UNK]"  # stand-in for tokens missing from the vocabulary
start_token = "[S]"      # prepended to every encoded sequence
end_token = "[EOS]"      # appended to every encoded sequence

### Our tokenizer needs a function that splits the input text into tokens, so let's build that `tokenize` function:

In [6]:
def tokenize(text):
    """
    Split text into lowercase word and punctuation tokens.

    The text is lowercased first. A token is either a run of word
    characters and apostrophes, or a single one of the marks . , ! ?
    Every other character is dropped.

    Parameters:
    - text: The string to tokenize.

    Returns:
    - A list of token strings in order of appearance.
    """
    lowered = text.lower()
    token_pattern = r"[\w']+|[.,!?]"
    return re.findall(token_pattern, lowered)

In [8]:
# Quick check: the sentence is lowercased and the final period becomes its own token
text = "This is a text of our tokenizer."

tokenized_text = tokenize(text)

print(tokenized_text)

['this', 'is', 'a', 'text', 'of', 'our', 'tokenizer', '.']


### Let's now create our unique tokens list using `list_as_text` and the `tokenize` function. We also add our special tokens here.

In [9]:
# Run the joined word list through tokenize (which also lowercases it)
tokens = tokenize(list_as_text)

In [10]:
# Remove duplicate tokens; a set has no guaranteed order, so this list is not yet ordered
unique_tokens = list(set(tokens))

In [11]:

# Alphabetical order gives every token the same index on every run
unique_tokens = sorted(unique_tokens)

In [12]:
# Put the special tokens at the END of the vocabulary, so they get the highest indices.
# Filtering them out first means re-running this cell never duplicates them.
unique_tokens = [
    tok
    for tok in unique_tokens
    if tok not in (start_token, end_token, unknown_token, pad_token)
]
unique_tokens.extend([start_token, end_token, unknown_token, pad_token])

In [13]:
# Show the start of the vocabulary (regular tokens in sorted order)
print("First 10 tokens:", unique_tokens[:10])

# Show the end of the vocabulary (the four special tokens come last)
print("Last 10 tokens:", unique_tokens[-10:])


First 10 tokens: ['a', 'ab', 'aback', 'abandon', 'abandoned', 'abandoning', 'abbey', 'abbeys', 'abduc', 'abduction']
Last 10 tokens: ['zealous', 'zenith', 'zest', 'zied', 'zled', 'zoo', '[S]', '[EOS]', '[UNK]', '[PAD]']


### Next, let's assign an index to each token and build the lookup tables that map a token to its ID and an ID back to its token

In [14]:
# Token -> integer id; the id is the token's position in unique_tokens
word2idx = {tok: position for position, tok in enumerate(unique_tokens)}

In [15]:
# Example: look up the id of a known word.
# Stored as `rose_id` rather than `id` so the built-in id() is not shadowed
# for the rest of the notebook session.
rose_id = word2idx["rose"]
print(rose_id)

6763


In [16]:

# Inverse lookup table: integer id -> token
idx2word = {position: tok for tok, position in word2idx.items()}

In [17]:
# Example: map an id back to its word.
# The id is looked up instead of hard-coded so this cell keeps working
# (and keeps printing "rose") if the vocabulary file changes.
word = idx2word[word2idx["rose"]]
print(word)

rose


### While working with our tokenizer we will need the IDs of the special tokens, so let's define them here:

In [18]:
# Look up and keep the integer id of each special token
# (order on both sides: start, end, unknown, pad)
start_token_id, end_token_id, unknown_token_id, pad_token_id = (
    word2idx[special] for special in (start_token, end_token, unknown_token, pad_token)
)

In [19]:
# Show each special token next to the id it was assigned
print(f"start_token:   {start_token}   - Id: {start_token_id}")
print(f"end_token:     {end_token} - Id: {end_token_id}")
print(f"unknown_token: {unknown_token} - Id: {unknown_token_id}")
print(f"pad_token:     {pad_token} - Id: {pad_token_id}")

start_token:   [S]   - Id: 9088
end_token:     [EOS] - Id: 9089
unknown_token: [UNK] - Id: 9090
pad_token:     [PAD] - Id: 9091


### Now the ENCODE function

In [20]:
def encode(text, max_length=None, truncation=False, return_tensors=False):
    """
    Encode the text into a sequence of token IDs, with optional truncation.

    The text is tokenized, tokens missing from the vocabulary are replaced by
    the unknown-token ID, and the result is wrapped in the start and end
    token IDs.

    Parameters:
    - text: The text to encode.
    - max_length: The maximum length of the token sequence after encoding,
      start and end tokens included. Only used when truncation is True.
      Values below 1 are treated as 1.
    - truncation: Whether to truncate the sequence to max_length.
    - return_tensors: If True, return a torch tensor with a leading batch
      dimension of size 1 instead of a list.

    Returns:
    - A list of token IDs representing the encoded text, or a tensor of
      shape (1, sequence_length) when return_tensors is True.
    """
    unk_id = word2idx[unknown_token]
    eos_id = word2idx[end_token]

    encoded_tokens = [word2idx.get(token, unk_id) for token in tokenize(text)]

    # Prepend the start token ID and append the end token ID
    encoded_tokens = [word2idx[start_token]] + encoded_tokens + [eos_id]

    # Truncate only when the sequence is actually too long. Slicing and
    # re-appending the end token unconditionally would give shorter
    # sequences a second end token.
    if truncation and max_length is not None and len(encoded_tokens) > max_length:
        # Keep room for the end token; clamp so a max_length below 1 cannot
        # turn into a negative slice bound.
        keep = max(max_length - 1, 0)
        encoded_tokens = encoded_tokens[:keep] + [eos_id]

    # Convert to tensor if return_tensors is True
    if return_tensors:
        encoded_tokens = torch.tensor([encoded_tokens])  # Adding a batch dimension

    return encoded_tokens

In [21]:
# Encode a sample sentence. In the recorded output, "roses" and "." both come
# back as the unknown-token id because neither is in the vocabulary.
text = "Red roses mean love."
encoded_tokens = encode(text)
encoded_tokens

[9088, 6473, 9090, 4930, 4741, 9090, 9089]

### The DECODE function

In [22]:
def decode(indices, skip_special_tokens=False):
    """
    Convert a sequence of token IDs back into a space-separated string.

    Parameters:
    - indices: Token IDs as a list (or other iterable) of ints, or a torch
      tensor. A single sequence wrapped in a batch dimension, such as the
      (1, n) tensor returned by encode(..., return_tensors=True), is
      unwrapped automatically.
    - skip_special_tokens: If True, start, end, unknown and pad token IDs are
      left out of the result.

    Returns:
    - The decoded tokens joined with single spaces.

    Raises:
    - KeyError: if an ID is not present in idx2word.
    """
    # Ensure indices is a list of integers, not a tensor
    if isinstance(indices, torch.Tensor):
        indices = indices.tolist()

    # A 0-d tensor converts to a bare number rather than a list
    if isinstance(indices, int):
        indices = [indices]
    else:
        indices = list(indices)

    # Unwrap a batch that holds exactly one sequence, so the output of
    # encode(..., return_tensors=True) can be decoded directly
    if len(indices) == 1 and isinstance(indices[0], (list, tuple)):
        indices = list(indices[0])

    # Define a set of all special token ids you want to skip
    special_token_ids = set()
    if skip_special_tokens:
        special_token_ids.update([
            start_token_id,
            end_token_id,
            unknown_token_id,
            pad_token_id,
        ])

    # Look up every id that is not being skipped
    tokens = [idx2word[idx] for idx in indices if idx not in special_token_ids]

    # Join the tokens into a single string with spaces
    return ' '.join(tokens)

In [23]:
# Decode the ids from the encode example, keeping the special tokens in the text
decoded_text = decode(encoded_tokens)
decoded_text

'[S] red [UNK] mean love [UNK] [EOS]'

In [25]:
# Decode again, this time dropping start, end, unknown and pad tokens from the text
decoded_text = decode(encoded_tokens,skip_special_tokens=True)
decoded_text

'red mean love'

### To make our tokenizer more compatible with the one from Hugging Face, let's add these additional functions:

In [26]:
def convert_ids_to_tokens(token_ids):
    """
    Map token IDs back to their token strings.

    Parameters:
    - token_ids: An iterable of integer token IDs.

    Returns:
    - A list of token strings, one per input ID. An ID that is not in
      idx2word is rendered as the unknown token.
    """
    resolved = []
    for current_id in token_ids:
        resolved.append(idx2word.get(current_id, unknown_token))
    return resolved

In [27]:
def convert_tokens_to_ids(tokens):
    """
    Map token strings to their integer IDs.

    Parameters:
    - tokens: An iterable of token strings.

    Returns:
    - A list of integer IDs, one per input token. A token that is not in
      word2idx gets the unknown-token ID.
    """
    resolved = []
    for current_token in tokens:
        resolved.append(word2idx.get(current_token, unknown_token_id))
    return resolved

# Putting everything together:

In [28]:
class SimpleTokenizer:
    """
    Word-level tokenizer whose vocabulary is built from a text at construction.

    The vocabulary consists of the unique tokens of the given text in sorted
    order, followed by four special tokens: start '[S]', end '[EOS]',
    unknown '[UNK]' and pad '[PAD]'. A token's ID is its position in that
    list, so the special tokens hold the four highest IDs.
    """

    def __init__(self, text):
        """
        Build the vocabulary and the token <-> ID lookup tables.

        Parameters:
        - text: The text whose tokens make up the vocabulary.
        """
        # Define special tokens
        self.start_token = '[S]'
        self.end_token = '[EOS]'
        self.unknown_token = '[UNK]'
        self.pad_token = '[PAD]'
        special_tokens = [self.start_token, self.end_token, self.unknown_token, self.pad_token]

        # Unique tokens, sorted so every run assigns the same indices
        vocabulary = sorted(set(self.tokenize(text)))

        # Special tokens go at the END of the vocabulary; filter first so
        # they can never appear twice
        vocabulary = [token for token in vocabulary if token not in special_tokens] + special_tokens

        # Assign an index to each unique token
        self.word2idx = {token: idx for idx, token in enumerate(vocabulary)}
        self.idx2word = {idx: token for token, idx in self.word2idx.items()}

        # Keep the IDs of the special tokens at hand
        self.start_token_id = self.word2idx[self.start_token]
        self.end_token_id = self.word2idx[self.end_token]
        self.unknown_token_id = self.word2idx[self.unknown_token]
        self.pad_token_id = self.word2idx[self.pad_token]

    def tokenize(self, text):
        """
        Split text into lowercase word and punctuation tokens.

        A token is either a run of word characters and apostrophes, or a
        single one of the marks . , ! ? Every other character is dropped.
        """
        return re.findall(r"[\w']+|[.,!?]", text.lower())

    def encode(self, text, max_length=None, truncation=False, return_tensors=False):
        """
        Encode the text into a sequence of token IDs, with optional truncation.

        Parameters:
        - text: The text to encode.
        - max_length: The maximum length of the token sequence after encoding,
          start and end tokens included. Only used when truncation is True.
          Values below 1 are treated as 1.
        - truncation: Whether to truncate the sequence to max_length.
        - return_tensors: If True, return a torch tensor with a leading batch
          dimension of size 1 instead of a list.

        Returns:
        - A list of token IDs representing the encoded text, or a tensor of
          shape (1, sequence_length) when return_tensors is True.
        """
        tokens = self.tokenize(text)
        encoded_tokens = [self.word2idx.get(token, self.unknown_token_id) for token in tokens]

        # Prepend the start token ID and append the end token ID
        encoded_tokens = [self.start_token_id] + encoded_tokens + [self.end_token_id]

        # Truncate only when the sequence is actually too long. Slicing and
        # re-appending the end token unconditionally would give shorter
        # sequences a second end token.
        if truncation and max_length is not None and len(encoded_tokens) > max_length:
            # Keep room for the end token; clamp so a max_length below 1
            # cannot turn into a negative slice bound.
            keep = max(max_length - 1, 0)
            encoded_tokens = encoded_tokens[:keep] + [self.end_token_id]

        # Convert to tensor if return_tensors is True
        if return_tensors:
            encoded_tokens = torch.tensor([encoded_tokens])  # Adding a batch dimension

        return encoded_tokens

    def decode(self, indices, skip_special_tokens=False):
        """
        Convert a sequence of token IDs back into a space-separated string.

        Parameters:
        - indices: Token IDs as a list (or other iterable) of ints, or a torch
          tensor. A single sequence wrapped in a batch dimension, such as the
          (1, n) tensor returned by encode(..., return_tensors=True), is
          unwrapped automatically.
        - skip_special_tokens: If True, start, end, unknown and pad token IDs
          are left out of the result.

        Returns:
        - The decoded tokens joined with single spaces.

        Raises:
        - KeyError: if an ID is not present in the vocabulary.
        """
        # Ensure indices is a list of integers, not a tensor
        if isinstance(indices, torch.Tensor):
            indices = indices.tolist()

        # A 0-d tensor converts to a bare number rather than a list
        if isinstance(indices, int):
            indices = [indices]
        else:
            indices = list(indices)

        # Unwrap a batch that holds exactly one sequence
        if len(indices) == 1 and isinstance(indices[0], (list, tuple)):
            indices = list(indices[0])

        # Define a set of all special token ids you want to skip
        special_token_ids = set()
        if skip_special_tokens:
            special_token_ids.update([
                self.start_token_id,
                self.end_token_id,
                self.unknown_token_id,
                self.pad_token_id,
            ])

        # Look up every id that is not being skipped
        tokens = [self.idx2word[idx] for idx in indices if idx not in special_token_ids]

        # Join the tokens into a single string with spaces
        return ' '.join(tokens)

    def convert_ids_to_tokens(self, token_ids):
        """
        Convert a list of token IDs to their corresponding tokens.

        Parameters:
        - token_ids: A list of integers representing token IDs.

        Returns:
        - tokens: A list of string tokens corresponding to the input IDs.
          IDs that are not in the vocabulary become the unknown token.
        """
        return [self.idx2word.get(token_id, self.unknown_token) for token_id in token_ids]

    def convert_tokens_to_ids(self, tokens):
        """
        Convert a list of tokens to their corresponding token IDs.

        Parameters:
        - tokens: A list of string tokens.

        Returns:
        - token_ids: A list of integers representing the token IDs. Tokens
          that are not in the vocabulary get the unknown-token ID.
        """
        return [self.word2idx.get(token, self.unknown_token_id) for token in tokens]

## Testing our tokenizer

In [29]:

# Usage example: build the tokenizer from the joined word list, then round-trip a sentence.
# In the recorded output, "color", "?" and "." all come back as [UNK]
# because none of them is in the vocabulary.
tokenizer = SimpleTokenizer(list_as_text)
encoded = tokenizer.encode("color ? A rose is red.")
decoded = tokenizer.decode(encoded)

print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")
print(f"PAD token ID: {tokenizer.pad_token_id}")


Encoded: [9088, 9090, 9090, 0, 6763, 4269, 6473, 9090, 9089]
Decoded: [S] [UNK] [UNK] a rose is red [UNK] [EOS]
PAD token ID: 9091
