In [8]:
# Install TensorFlow Text quietly (-q), then upgrade protobuf.
!pip install -q tensorflow-text
!pip install --upgrade protobuf #If `import tensorflow` raises an error, upgrade this package

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
Collecting protobuf
  Downloading protobuf-3.13.0-cp36-cp36m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 891 kB/s eta 0:00:01
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.12.2
    Uninstalling protobuf-3.12.2:
      Successfully uninstalled protobuf-3.12.2
Successfully installed protobuf-3.13.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [9]:
import tensorflow as tf
import tensorflow_text as text

In [10]:
# Build a batch of two UTF-16-BE encoded byte strings, then transcode the
# whole batch to UTF-8 in a single op.
docs = tf.constant([
    sentence.encode('UTF-16-BE')
    for sentence in (u'Everything not saved will be lost.', u'Sad☹')
])
utf8_docs = tf.strings.unicode_transcode(
    docs,
    input_encoding='UTF-16-BE',
    output_encoding='UTF-8',
)

In [11]:
# Whitespace tokenization: punctuation stays attached to its word
# (e.g. b'lost.') and the emoji stays attached to b'Sad'.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize([
    'everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8'),
])
print(tokens.to_list())

Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.
[[b'everything', b'not', b'saved', b'will', b'be', b'lost.'], [b'Sad\xe2\x98\xb9']]


In [12]:
# Same two inputs as the whitespace example, tokenized by Unicode script.
tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize([
    'everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8'),
])
print(tokens.to_list())
# The result: the text is cut wherever the Unicode script changes.

[[b'everything', b'not', b'saved', b'will', b'be', b'lost', b'.'], [b'Sad', b'\xe2\x98\xb9']]


In [13]:
# Brute-force approach: split the string directly into individual characters.
tokens = tf.strings.unicode_split(
    [u"仅今年前It".encode('UTF-8')],
    input_encoding='UTF-8',
)
print(tokens.to_list())

[[b'\xe4\xbb\x85', b'\xe4\xbb\x8a', b'\xe5\xb9\xb4', b'\xe5\x89\x8d', b'I', b't']]


In [14]:
# Tokenize and also retrieve, for every token, the offset where it starts
# and the offset where it ends (limit) within its source string.
tokenizer = text.UnicodeScriptTokenizer()
tokens, offset_starts, offset_limits = tokenizer.tokenize_with_offsets([
    'everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8'),
])
# One line per result, in the order: tokens, starts, limits.
print(
    tokens.to_list(),
    offset_starts.to_list(),
    offset_limits.to_list(),
    sep='\n',
)

[[b'everything', b'not', b'saved', b'will', b'be', b'lost', b'.'], [b'Sad', b'\xe2\x98\xb9']]
[[0, 11, 15, 21, 26, 29, 33], [0, 3]]
[[10, 14, 20, 25, 28, 33, 34], [3, 6]]


In [15]:
# In other words, the tokenizer also works in graph mode: here it is applied
# inside a tf.data pipeline via Dataset.map.
docs = tf.data.Dataset.from_tensor_slices([
    ['Never tell me the odds.'],
    ["It's a trap!"],
])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda doc: tokenizer.tokenize(doc))

# Pull the two tokenized elements out of the dataset one at a time.
iterator = iter(tokenized_docs)
print(next(iterator).to_list())
print(next(iterator).to_list())

[[b'Never', b'tell', b'me', b'the', b'odds.']]
[[b"It's", b'a', b'trap!']]


In [16]:
# Tokenize on whitespace, then compute one boolean feature per token for
# each of four word-shape patterns.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize([
    'Everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8'),
])

f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)            # capitalized?
f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)              # all letters uppercase?
f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)  # contains punctuation/symbol?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)          # is a number?

# One line per feature, in the order f1..f4.
print(*(feature.to_list() for feature in (f1, f2, f3, f4)), sep='\n')

[[True, False, False, False, False, False], [True]]
[[False, False, False, False, False, False], [False]]
[[False, False, False, False, False, True], [True]]
[[False, False, False, False, False, False], [False]]


In [17]:
# Whitespace-tokenize the two sample strings.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize([
    'Everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8'),
])

# Bi-grams (n = 2): each pair of adjacent tokens is joined into one string.
# The second input has only one token, so it produces no bi-grams.
bigrams = text.ngrams(
    tokens,
    width=2,
    reduction_type=text.Reduction.STRING_JOIN,
)

print(bigrams.to_list())

[[b'Everything not', b'not saved', b'saved will', b'will be', b'be lost.'], []]


In [18]:
# Whitespace-tokenize the two sample strings.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# N-grams, in this case tri-grams (n = 3): every run of three adjacent tokens
# is joined into a single string. The result is bound to `trigrams` (not
# `bigrams`) so the name matches the width and the real bi-grams computed
# earlier are not overwritten. The second input has only one token, so it
# produces no tri-grams.
trigrams = text.ngrams(tokens, 3, reduction_type=text.Reduction.STRING_JOIN)

print(trigrams.to_list())

[[b'Everything not saved', b'not saved will', b'saved will be', b'will be lost.'], []]


In [25]:
# n-grams also work on numeric data: with Reduction.SUM each bi-gram is the
# sum of two adjacent values, e.g. [1, 2, 3] -> [1 + 2, 2 + 3] = [3, 5].
# NOTE(review): `tokenizer` is created here but not used by this cell.
tokenizer = text.WhitespaceTokenizer()
tokens = [[1,2,3],[2,3,4]]

# Ngrams, in this case bi-gram (n = 2)
bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.SUM)

print(bigrams)


tf.Tensor(
[[3 5]
 [5 7]], shape=(2, 2), dtype=int32)
