In [1]:
from typing import List

def tokenize_on_whitespace(text_list: List[str]) -> List[List[str]]:
    """
    Tokenize a list of strings, splitting each string into tokens based on whitespace characters.

    Args:
    text_list (List[str]): A list of strings to be tokenized.

    Returns:
    List[List[str]]: A list of lists, where each sublist contains tokens from the corresponding input string.
    """
    return [text.split() for text in text_list]

# Example usage:
input_texts = ["Whitespace Tokenization", "Splitting text into tokens based on whitespace characters"]
tokenized_texts = tokenize_on_whitespace(input_texts)

print("Tokenized Texts:")
for i, tokens in enumerate(tokenized_texts):
    print(f"Input {i+1}: {input_texts[i]}")
    print(f"Tokens: {tokens}")


Tokenized Texts:
Input 1: Whitespace Tokenization
Tokens: ['Whitespace', 'Tokenization']
Input 2: Splitting text into tokens based on whitespace characters
Tokens: ['Splitting', 'text', 'into', 'tokens', 'based', 'on', 'whitespace', 'characters']


In [2]:
import re
from typing import List, Pattern

# Define a pattern for matching punctuation
PUNCTUATION_PATTERN: Pattern = re.compile(r'[^\w\s]')

def tokenize_on_punctuation(text_list: List[str]) -> List[List[str]]:
    """
    Tokenize a list of strings, splitting each string into tokens based on punctuation marks.

    Args:
        text_list (List[str]): A list of strings to be tokenized.

    Returns:
        List[List[str]]: A list of lists, where each sublist contains tokens from the corresponding input string.
    """
    # Split each string by matching punctuation
    return [re.split(PUNCTUATION_PATTERN, text) for text in text_list]

# Example usage:
input_texts = [
    "Punctuation Tokenization",
    "Splitting text into tokens based on punctuation marks"
]
tokenized_texts = tokenize_on_punctuation(input_texts)

print("Tokenized Texts:")
for i, tokens in enumerate(tokenized_texts):
    print(f"Input {i+1}: {input_texts[i]}")
    print(f"Tokens: {tokens}")


Tokenized Texts:
Input 1: Punctuation Tokenization
Tokens: ['Punctuation Tokenization']
Input 2: Splitting text into tokens based on punctuation marks
Tokens: ['Splitting text into tokens based on punctuation marks']


In [3]:
import re
from typing import List, Pattern

def regex_tokenizer(text: str, pattern: Pattern[str]) -> List[str]:
    """
    Tokenizes the given text using a regular expression pattern.

    :param text: The text to be tokenized.
    :param pattern: The compiled regular expression pattern used for tokenization.
    :return: A list of tokens.
    """
    # Tokenize the text using the provided pattern
    tokens = re.findall(pattern, text)

    return tokens

# Example usage
if __name__ == "__main__":
    # Define the input text
    user_input_text = "The quick brown fox jumps over the lazy dog. It's amazing!"

    # Define a regular expression pattern for tokenization
    # This example pattern splits on whitespace and punctuation
    regex_pattern = r'\b\w+\b'

    # Compile the regular expression pattern for efficiency
    compiled_pattern = re.compile(regex_pattern)

    # Tokenize the user input text using the compiled pattern
    tokens = regex_tokenizer(user_input_text, compiled_pattern)

    # Output the result
    print("Tokens:", tokens)

Tokens: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'It', 's', 'amazing']


In [4]:
import re
from typing import List


def regex_tokenize(text: str, pattern: str) -> List[str]:
    """
    Tokenize the given text using a regular expression pattern.

    Args:
        text (str): The input text to be tokenized.
        pattern (str): The regular expression pattern used for tokenization.

    Returns:
        List[str]: A list of tokens extracted from the text.
    """
    tokens = re.findall(pattern, text)
    return tokens


def main():
    # Example usage
    text = "Hello, world! This is a sample text. It contains 123 numbers and punctuation."
    pattern = r"\w+|[^\w\s]"

    tokens = regex_tokenize(text, pattern)
    print("Original text:")
    print(text)
    print("\nTokens:")
    print(tokens)


if __name__ == "__main__":
    main()

Original text:
Hello, world! This is a sample text. It contains 123 numbers and punctuation.

Tokens:
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'sample', 'text', '.', 'It', 'contains', '123', 'numbers', 'and', 'punctuation', '.']


In [7]:
from collections import defaultdict
from typing import List, Tuple


def get_byte_pairs(tokens: List[str]) -> List[Tuple[str, str]]:
    """
    Get all byte pairs from the given list of tokens.

    Args:
        tokens (List[str]): A list of tokens.

    Returns:
        List[Tuple[str, str]]: A list of byte pairs.
    """
    byte_pairs = []
    for token in tokens:
        chars = list(token)
        for i in range(len(chars) - 1):
            byte_pairs.append((chars[i], chars[i + 1]))
    return byte_pairs


def merge_byte_pairs(tokens: List[str], byte_pair: Tuple[str, str]) -> List[str]:
    """
    Merge the given byte pair in the list of tokens.

    Args:
        tokens (List[str]): A list of tokens.
        byte_pair (Tuple[str, str]): The byte pair to merge.

    Returns:
        List[str]: A list of tokens with the byte pair merged.
    """
    merged_tokens = []
    for token in tokens:
        merged_token = token.replace(byte_pair[0] + byte_pair[1], byte_pair[0] + "_" + byte_pair[1])
        merged_tokens.append(merged_token)
    return merged_tokens


def bpe_tokenize(text: str, num_merges: int) -> List[str]:
    """
    Tokenize the given text using Byte Pair Encoding (BPE).

    Args:
        text (str): The input text to be tokenized.
        num_merges (int): The number of merge operations to perform.

    Returns:
        List[str]: A list of tokens after applying BPE.
    """
    tokens = list(text)
    for _ in range(num_merges):
        byte_pairs = get_byte_pairs(tokens)
        pair_frequencies = defaultdict(int)
        for pair in byte_pairs:
            pair_frequencies[pair] += 1
        if not pair_frequencies:
            break
        most_frequent_pair = max(pair_frequencies, key=pair_frequencies.get)
        tokens = merge_byte_pairs(tokens, most_frequent_pair)
    return tokens


def main():
    # Example usage
    text = "Hello, world! This is a sample text."
    num_merges = 5

    tokens = bpe_tokenize(text, num_merges)
    print("Original text:")
    print(text)
    print("\nTokens after BPE:")
    print(tokens)


if __name__ == "__main__":
    main()

Original text:
Hello, world! This is a sample text.

Tokens after BPE:
['H', 'e', 'l', 'l', 'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', ' ', 'T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'a', 'm', 'p', 'l', 'e', ' ', 't', 'e', 'x', 't', '.']


In [8]:
from typing import List, Dict


class WordPieceTokenizer:
    def __init__(self, vocab: Dict[str, int], unk_token: str = "[UNK]", max_input_chars_per_word: int = 100):
        """
        Initialize the WordPieceTokenizer.

        Args:
            vocab (Dict[str, int]): The vocabulary dictionary mapping subwords to their indices.
            unk_token (str): The token to use for unknown subwords. Default is "[UNK]".
            max_input_chars_per_word (int): The maximum number of characters to consider for each word. Default is 100.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize the given text using WordPiece Tokenization.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of subword tokens.
        """
        output_tokens = []
        for token in text.split():
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def main():
    # Example usage
    vocab = {
        "[UNK]": 0,
        "[CLS]": 1,
        "[SEP]": 2,
        "want": 3,
        "##want": 4,
        "##ed": 5,
        "wa": 6,
        "un": 7,
        "runn": 8,
        "##ing": 9,
        ",": 10,
        "low": 11,
        "lowest": 12,
    }

    text = "unwanted, running"
    tokenizer = WordPieceTokenizer(vocab)
    tokens = tokenizer.tokenize(text)
    print("Original text:")
    print(text)
    print("\nTokens after WordPiece Tokenization:")
    print(tokens)


if __name__ == "__main__":
    main()

Original text:
unwanted, running

Tokens after WordPiece Tokenization:
['[UNK]', 'runn', '##ing']


In [9]:
import re
from typing import List


class TreebankWordTokenizer:
    def __init__(self):
        """
        Initialize the TreebankWordTokenizer.
        """
        self.contractions = {
            "won't": "will not",
            "can't": "cannot",
            "n't": " not",
            "'s": " 's",
            "'ve": " have",
            "'re": " are",
            "'d": " would",
            "'ll": " will",
            "'m": " am"
        }
        self.contractions_re = re.compile(r"(\w+)('t|'s|'ve|'re|'d|'ll|'m)")

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize the given text using Treebank Word Tokenization.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of tokens.
        """
        # Handle contractions
        text = self.contractions_re.sub(self._replace_contractions, text)

        # Handle special cases
        text = re.sub(r"([.,:;?!])", r" \1 ", text)  # Add spaces around punctuation
        text = re.sub(r"[^a-zA-Z0-9.,:;?!]+", " ", text)  # Remove non-alphanumeric characters
        text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
        text = text.strip()

        # Tokenize the text
        tokens = text.split()
        return tokens

    def _replace_contractions(self, match):
        """
        Replace contractions in the matched text.

        Args:
            match: The matched object containing the contraction.

        Returns:
            str: The expanded form of the contraction.
        """
        contraction = match.group(0)
        if contraction in self.contractions:
            return self.contractions[contraction]
        else:
            return contraction


def main():
    # Example usage
    text = "I won't be able to attend the meeting. It's scheduled for 5 p.m. on Monday."
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    print("Original text:")
    print(text)
    print("\nTokens after Treebank Word Tokenization:")
    print(tokens)


if __name__ == "__main__":
    main()

Original text:
I won't be able to attend the meeting. It's scheduled for 5 p.m. on Monday.

Tokens after Treebank Word Tokenization:
['I', 'will', 'not', 'be', 'able', 'to', 'attend', 'the', 'meeting', '.', 'It', 's', 'scheduled', 'for', '5', 'p', '.', 'm', '.', 'on', 'Monday', '.']


In [10]:
from typing import List, Dict
from collections import defaultdict
import math


class UnigramLanguageModelTokenizer:
    def __init__(self, vocab_size: int, unk_token: str = "<UNK>"):
        """
        Initialize the UnigramLanguageModelTokenizer.

        Args:
            vocab_size (int): The desired vocabulary size.
            unk_token (str): The token to use for unknown subwords. Default is "<UNK>".
        """
        self.vocab_size = vocab_size
        self.unk_token = unk_token
        self.vocab: Dict[str, int] = {}
        self.subword_counts: Dict[str, int] = defaultdict(int)

    def train(self, corpus: List[str]):
        """
        Train the Unigram Language Model on the given corpus.

        Args:
            corpus (List[str]): The corpus of text to train on.
        """
        # Count the frequency of each subword in the corpus
        for text in corpus:
            for i in range(len(text)):
                for j in range(i + 1, len(text) + 1):
                    subword = text[i:j]
                    self.subword_counts[subword] += 1

        # Build the vocabulary based on the most frequent subwords
        sorted_subwords = sorted(self.subword_counts.items(), key=lambda x: x[1], reverse=True)
        self.vocab = {subword: i for i, (subword, _) in enumerate(sorted_subwords[:self.vocab_size])}
        self.vocab[self.unk_token] = self.vocab_size

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize the given text using the trained Unigram Language Model.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of subword tokens.
        """
        tokens = []
        n = len(text)
        i = 0
        while i < n:
            max_score = -math.inf
            best_subword = None
            for j in range(i + 1, n + 1):
                subword = text[i:j]
                if subword in self.vocab:
                    score = self.subword_counts[subword]
                    if score > max_score:
                        max_score = score
                        best_subword = subword
            if best_subword is None:
                best_subword = self.unk_token
            tokens.append(best_subword)
            i += len(best_subword)
        return tokens


def main():
    # Example usage
    corpus = [
        "The quick brown fox jumps over the lazy dog.",
        "The quick brown fox jumps over the lazy dog.",
        "The quick brown fox jumps over the lazy dog."
    ]
    vocab_size = 10

    tokenizer = UnigramLanguageModelTokenizer(vocab_size)
    tokenizer.train(corpus)

    text = "The quick brown fox jumps over the lazy dog."
    tokens = tokenizer.tokenize(text)

    print("Original text:")
    print(text)
    print("\nTokens after Unigram Language Model Tokenization:")
    print(tokens)


if __name__ == "__main__":
    main()

Original text:
The quick brown fox jumps over the lazy dog.

Tokens after Unigram Language Model Tokenization:
['T', 'h', 'e', ' ', '<UNK>', ' ', '<UNK>', ' ', '<UNK>', 'u', '<UNK>', '<UNK>', 'h', 'e', ' ', '<UNK>', '<UNK>']


In [None]:
import sentencepiece as spm
from typing import List

def train_sentencepiece_model(texts: List[str], model_prefix: str, vocab_size: int = 32000) -> None:
    """
    Trains a SentencePiece model with the given list of strings.

    :param texts: The list of text strings to train the model on.
    :param model_prefix: The prefix for the output model files.
    :param vocab_size: The size of the vocabulary. Default is 32,000.
    """
    # Prepare the text file required for SentencePiece training
    with open('text.txt', 'w', encoding='utf-8') as f:
        for text in texts:
            f.write(f"{text}\n")
    
    # Train the SentencePiece model
    spm.SentencePieceTrainer.train(f'--input=text.txt --model_prefix={model_prefix} --vocab_size={vocab_size} --character_coverage=1.0')
    print(f"Model trained and saved with prefix '{model_prefix}'.")

def tokenize_with_sentencepiece(model_path: str, text: str) -> List[str]:
    """
    Tokenizes a text string using a trained SentencePiece model.

    :param model_path: The path to the trained SentencePiece model.
    :param text: The text to be tokenized.
    :return: A list of subword tokens.
    """
    # Load the trained SentencePiece model
    sp = spm.SentencePieceProcessor(model_file=model_path)
    
    # Tokenize the text
    tokens = sp.encode(text, out_type=str)
    
    return tokens

# Example usage
if __name__ == "__main__":
    # List of sentences to train the SentencePiece model
    user_input_texts = [
        "This is a test sentence for the SentencePiece model.",
        "SentencePiece is an unsupervised text tokenizer and detokenizer.",
        "It provides open-source pre-built and extensible models for various languages."
    ]
    
    # Train the model and save with the provided prefix
    model_prefix = 'spm_model'
    train_sentencepiece_model(user_input_texts, model_prefix)
    
    # Load the trained model and tokenize a new sentence
    model_path = f'{model_prefix}.model'
    new_text = "SentencePiece handles multiple languages without pre-tokenization."
    tokens = tokenize_with_sentencepiece(model_path, new_text)
    
    # Output the result
    print("Tokens:", tokens)

In [None]:
import sentencepiece as spm
from typing import List


class SentencePieceTokenizer:
    def __init__(self, model_path: str):
        """
        Initialize the SentencePieceTokenizer.

        Args:
            model_path (str): The path to the pre-trained SentencePiece model.
        """
        self.model_path = model_path
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize the given text using the SentencePiece model.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of subword tokens.
        """
        tokens = self.sp.EncodeAsPieces(text)
        return tokens

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenize the given list of subword tokens back into text.

        Args:
            tokens (List[str]): A list of subword tokens.

        Returns:
            str: The detokenized text.
        """
        text = self.sp.DecodePieces(tokens)
        return text


def train_sentencepiece_model(corpus_path: str, model_prefix: str, vocab_size: int):
    """
    Train a SentencePiece model on the given corpus.

    Args:
        corpus_path (str): The path to the corpus file.
        model_prefix (str): The prefix for the output model files.
        vocab_size (int): The desired vocabulary size.
    """
    spm.SentencePieceTrainer.Train(
        f"--input={corpus_path} --model_prefix={model_prefix} --vocab_size={vocab_size}"
    )


def main():
    # Train a SentencePiece model (replace with your own corpus and paths)
    corpus_path = "path/to/corpus.txt"
    model_prefix = "path/to/model"
    vocab_size = 10000
    train_sentencepiece_model(corpus_path, model_prefix, vocab_size)

    # Example usage
    model_path = "path/to/model.model"
    tokenizer = SentencePieceTokenizer(model_path)

    text = "This is a sample text in English. これは日本語のサンプルテキストです。"
    tokens = tokenizer.tokenize(text)
    print("Original text:")
    print(text)
    print("\nTokens after SentencePiece Tokenization:")
    print(tokens)

    detokenized_text = tokenizer.detokenize(tokens)
    print("\nDetokenized text:")
    print(detokenized_text)


if __name__ == "__main__":
    main()

In [None]:
import morfessor
from typing import List, Tuple

def train_morfessor_model(data: List[str]) -> morfessor.MorfessorIO:
    """
    Trains a Morfessor model with the given list of words.

    :param data: The list of words to train the model on.
    :return: A trained Morfessor model.
    """
    model = morfessor.BaselineModel()
    # Prepare data for training (list of tuples (word, count))
    training_data = [(word, 1) for word in data]
    model.load_data(training_data)
    model.train_batch()

    return model

def segment_words_with_morfessor(model: morfessor.MorfessorIO, words: List[str]) -> List[List[str]]:
    """
    Segments a list of words using a trained Morfessor model.

    :param model: The trained Morfessor model.
    :param words: The list of words to segment.
    :return: A list of segmented word lists.
    """
    segmented_words = [model.viterbi_segment(word)[0] for word in words]
    return segmented_words

# Example usage
if __name__ == "__main__":
    # List of words to train the Morfessor model
    user_input_words = [
        "morphological", "segmentation", "unsupervised", "learning", "linguistics",
        "morpheme", "analysis", "computational", "recognition", "algorithm"
    ]
    
    # Train the Morfessor model
    trained_model = train_morfessor_model(user_input_words)
    
    # Segment new words using the trained model
    new_words = ["morphology", "linguist", "algorithmic", "computationally"]
    segmented_output = segment_words_with_morfessor(trained_model, new_words)
    
    # Output results
    for word, segmented in zip(new_words, segmented_output):
        print(f"Original Word: {word}, Segmented: {segmented}")

In [None]:
import morfessor
from typing import List


class MorfessorTokenizer:
    def __init__(self, model_path: str):
        """
        Initialize the MorfessorTokenizer.

        Args:
            model_path (str): The path to the trained Morfessor model.
        """
        self.model_path = model_path
        self.model = morfessor.BaselineModel()
        self.model.load_model(model_path)

    def tokenize(self, word: str) -> List[str]:
        """
        Tokenize the given word into morphemes using the Morfessor model.

        Args:
            word (str): The input word to be tokenized.

        Returns:
            List[str]: A list of morphemes.
        """
        morphemes = self.model.viterbi_segment(word)[0]
        return morphemes

    def tokenize_sentence(self, sentence: str) -> List[List[str]]:
        """
        Tokenize the given sentence into morphemes using the Morfessor model.

        Args:
            sentence (str): The input sentence to be tokenized.

        Returns:
            List[List[str]]: A list of lists, where each inner list contains the morphemes of a word.
        """
        words = sentence.split()
        tokenized_sentence = [self.tokenize(word) for word in words]
        return tokenized_sentence


def train_morfessor_model(corpus_path: str, model_path: str):
    """
    Train a Morfessor model on the given corpus.

    Args:
        corpus_path (str): The path to the corpus file.
        model_path (str): The path to save the trained model.
    """
    corpus = morfessor.MorfessorIO().read_corpus_file(corpus_path)
    model = morfessor.BaselineModel()
    model.train(corpus)
    model.save_model(model_path)


def main():
    # Train a Morfessor model (replace with your own corpus and paths)
    corpus_path = "path/to/corpus.txt"
    model_path = "path/to/model.bin"
    train_morfessor_model(corpus_path, model_path)

    # Example usage
    tokenizer = MorfessorTokenizer(model_path)

    word = "unbelievable"
    morphemes = tokenizer.tokenize(word)
    print(f"Morphemes for '{word}':")
    print(morphemes)

    sentence = "The quick brown fox jumps over the lazy dog."
    tokenized_sentence = tokenizer.tokenize_sentence(sentence)
    print("\nTokenized sentence:")
    print(tokenized_sentence)


if __name__ == "__main__":
    main()