In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Two-sentence toy corpus used to build the first vocabulary.
sentences = ["I love my dog", "I love my cat"]

In [3]:
sentences

['I love my dog', 'I love my cat']

In [4]:
# Create the tokenizer. num_words=100 is far above the handful of distinct
# words in this toy corpus, so the cap does not exclude any word here.
tokenizer = Tokenizer(num_words=100)

In [5]:
# Learn the vocabulary from the corpus; the tokenizer's word_index reflects
# these sentences afterwards.
tokenizer.fit_on_texts(sentences)

In [6]:
# word -> integer index mapping. As the displayed result shows, words are
# lowercased ("I" becomes "i") and indices start at 1.
word_index = tokenizer.word_index

In [7]:
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}

In [8]:
# Sequences: encode each sentence as a list of word indices

In [9]:
# Extended corpus: introduces new words ("you", "do", "think", "is",
# "amazing") and one longer, seven-word sentence.
sentences = [
    "I love my dog", "I love my cat",
    "you love my dog", "Do you think my dog is amazing",
]

In [10]:
sentences

['I love my dog',
 'I love my cat',
 'you love my dog',
 'Do you think my dog is amazing']

In [11]:
# Start from a fresh tokenizer so the vocabulary is rebuilt from scratch for
# the extended corpus.
tokenizer = Tokenizer(num_words=100)

In [12]:
# Fit on the four-sentence corpus defined above.
tokenizer.fit_on_texts(sentences)

In [13]:
# Refresh the mapping. In the displayed result the most repeated words
# ("my", "love", "dog") receive the lowest indices.
word_index = tokenizer.word_index

In [14]:
word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'do': 7,
 'think': 8,
 'is': 9,
 'amazing': 10}

In [15]:
# Encode every sentence as a list of word indices (one inner list per
# sentence, same word order as the text).
sequences = tokenizer.texts_to_sequences(sentences)

In [16]:
sequences

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

In [17]:
# Held-out sentences containing words ("really", "brother") that are not in
# the fitted vocabulary.
test_data = ["i really love my dog", "my dog love my brother"]

In [18]:
# Encode the held-out sentences. Without an OOV token, unknown words are
# silently dropped: "really" and "brother" have no counterpart in the result
# shown below, so the first encoded sentence is shorter than its text.
test_seq = tokenizer.texts_to_sequences(test_data)

In [19]:
test_seq

[[4, 2, 1, 3], [1, 3, 2, 1]]

In [20]:
# Out-of-vocabulary (OOV) words: reserve a placeholder token for words the tokenizer has not seen

In [21]:
# Recreate the tokenizer with an out-of-vocabulary placeholder so that words
# not seen during fitting are encoded as '<oov>' instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

In [22]:
# Refit so the new tokenizer learns the corpus. In the word_index displayed
# below, '<oov>' takes index 1 and every real word moves up by one.
tokenizer.fit_on_texts(sentences)

In [23]:
# Mapping for the OOV-aware tokenizer; it now contains an '<oov>' entry.
word_index = tokenizer.word_index

In [24]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [25]:
test_data

['i really love my dog', 'my dog love my brother']

In [26]:
# Re-encode the fitted sentences with the shifted indices of the OOV-aware
# tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)

In [27]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [28]:
test_data

['i really love my dog', 'my dog love my brother']

In [29]:
# Unknown words now map to index 1 ('<oov>'), so each encoded sentence keeps
# the same number of tokens as its text (compare with the earlier result
# where they were dropped).
test_seq = tokenizer.texts_to_sequences(test_data)

In [30]:
test_seq

[[5, 1, 3, 2, 4], [2, 4, 3, 2, 1]]

In [31]:
import numpy as np

In [32]:
# The encoded sentences have different lengths (4, 4, 4 and 7 tokens), so they
# cannot form a rectangular array. Building a ragged array without an explicit
# dtype triggers a VisibleDeprecationWarning on NumPy 1.19-1.23 (the stray text
# under this cell appears to be the tail of that warning) and raises ValueError
# on NumPy >= 1.24. dtype=object requests the same 1-D array of Python lists
# explicitly and works on every version.
sequences = np.array(sequences, dtype=object)

  """Entry point for launching an IPython kernel.


In [33]:
sequences

array([list([5, 3, 2, 4]), list([5, 3, 2, 7]), list([6, 3, 2, 4]),
       list([8, 6, 9, 2, 4, 10, 11])], dtype=object)

In [34]:
# Padding: bring sequences of different lengths to a common length

In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [38]:
# Corpus for the padding demo; the sentences differ in length
# (4, 5, 4 and 7 words).
sentences = [
    "I love my dog", "I love my beautiful cat",
    "you love my dog", "Do you think my dog is amazing",
]

In [39]:
sentences

['I love my dog',
 'I love my beautiful cat',
 'you love my dog',
 'Do you think my dog is amazing']

In [40]:
# Fresh OOV-aware tokenizer for the padding corpus.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

In [41]:
# Build the vocabulary for the padding corpus (now includes "beautiful").
tokenizer.fit_on_texts(sentences)

In [42]:
# word -> index mapping for the padding corpus.
word_index = tokenizer.word_index

In [43]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'beautiful': 7,
 'cat': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'amazing': 12}

In [44]:
# Encode the sentences; the resulting lists have unequal lengths, which is
# what the padding steps below address.
sequences = tokenizer.texts_to_sequences(sentences)

In [45]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7, 8], [6, 3, 2, 4], [9, 6, 10, 2, 4, 11, 12]]

In [46]:
# With default arguments, shorter sequences are filled with zeros at the
# front until they match the longest sequence (7 tokens here), giving a
# rectangular int32 array.
padded = pad_sequences(sequences)

In [47]:
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  5,  3,  2,  7,  8],
       [ 0,  0,  0,  6,  3,  2,  4],
       [ 9,  6, 10,  2,  4, 11, 12]], dtype=int32)

In [48]:
# padding='post' moves the zero fill to the end of each sequence, so the
# tokens stay left-aligned.
padded = pad_sequences(sequences, padding='post')

In [49]:
padded

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  8,  0,  0],
       [ 6,  3,  2,  4,  0,  0,  0],
       [ 9,  6, 10,  2,  4, 11, 12]], dtype=int32)

In [50]:
# maxlen=5 fixes the output width. Sequences longer than that are truncated;
# by default the cut is made at the front, so the 7-token sentence keeps only
# its last five tokens (see the final row of the result).
padded = pad_sequences(sequences, padding='post', maxlen=5)

In [51]:
padded

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  8],
       [ 6,  3,  2,  4,  0],
       [10,  2,  4, 11, 12]], dtype=int32)

In [52]:
# truncating='post' cuts over-long sequences at the end instead, so the
# 7-token sentence keeps its first five tokens.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

In [53]:
padded

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  8],
       [ 6,  3,  2,  4,  0],
       [ 9,  6, 10,  2,  4]], dtype=int32)