In [1]:
from tokenizer import get_tokenizer_from_saved_model, parse_saved_model, tokenize
from utils import download_thhub_model, get_path_without_extension, unpack_tar

In [2]:
# Configuration: where the model comes from and where its archive is stored.
# TF Hub URL of the multilingual universal-sentence-encoder (version 3 per the URL).
thhub_model_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# Path of the downloaded .tar archive; later cells pass it to
# `download_thhub_model`, `unpack_tar` and `get_path_without_extension`.
save_model_path = ".cache/universal-sentence-encoder-multilingual_3.tar"

In [3]:
# Fetch the model archive to `save_model_path`, then unpack that archive.
# NOTE(review): both helpers live in the local `utils` module; whether they
# skip work that was already done on a previous run is not visible here.
download_thhub_model(save_model_path=save_model_path, thhub_model_url=thhub_model_url)
unpack_tar(path=save_model_path)

In [4]:
# Build the tokenizer from the saved model.
# presumably the archive was unpacked to the archive path minus its extension — confirm in `utils`
model_dir = get_path_without_extension(save_model_path)
tokenizer = get_tokenizer_from_saved_model(parse_saved_model(model_dir))

### Use the tokenizer

In [5]:
# Sample inputs: three sentences of increasing length per language
# (English, Italian, Japanese), used by the tokenization cells below.
english_sentences = [
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
]
italian_sentences = [
    "cane",
    "I cuccioli sono carini.",
    "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.",
]
japanese_sentences = [
    "犬",
    "子犬はいいです",
    "私は犬と一緒にビーチを散歩するのが好きです",
]

In [6]:
# Tokenize every English sample and print it next to its token list.
for sentence in english_sentences:
    tokenized_sentence = tokenize(tokenizer=tokenizer, sentence=sentence)
    print(f"{sentence} -> {tokenized_sentence}")

dog -> ['▁dog']
Puppies are nice. -> ['▁Pupp', 'ies', '▁are', '▁nice', '.']
I enjoy taking long walks along the beach with my dog. -> ['▁I', '▁enjoy', '▁taking', '▁long', '▁walk', 's', '▁along', '▁the', '▁beach', '▁with', '▁my', '▁dog', '.']


In [7]:
# Tokenize every Italian sample and print it next to its token list.
for sentence in italian_sentences:
    tokenized_sentence = tokenize(tokenizer=tokenizer, sentence=sentence)
    print(f"{sentence} -> {tokenized_sentence}")

cane -> ['▁cane']
I cuccioli sono carini. -> ['▁I', '▁cu', 'ccioli', '▁sono', '▁car', 'ini', '.']
Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane. -> ['▁Mi', '▁piace', '▁fare', '▁lunghe', '▁passeggiat', 'e', '▁lungo', '▁la', '▁spiaggia', '▁con', '▁il', '▁mio', '▁cane', '.']


In [8]:
# Tokenize every Japanese sample and print it next to its token list.
for sentence in japanese_sentences:
    tokenized_sentence = tokenize(tokenizer=tokenizer, sentence=sentence)
    print(f"{sentence} -> {tokenized_sentence}")

犬 -> ['▁', '犬']
子犬はいいです -> ['▁', '子', '犬', 'は', 'いい', 'です']
私は犬と一緒にビーチを散歩するのが好きです -> ['▁私', 'は', '犬', 'と一緒に', 'ビーチ', 'を', '散', '歩', 'するのが', '好き', 'です']
