# Natural Language Processing with Tensorflow

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

## Tokenization
Representing words with numbers

In [13]:
sentences = [
    "I love my cat",
    "I am my dog",
    "You love my dog!,"
    "Don't you forget about me!",
    "Sometimes giving up is the strong thing",
]

To handle words not seen in training samples, use the OOV (Out Of Vocabulary) token 

In [16]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'i': 3,
 'love': 4,
 'dog': 5,
 'you': 6,
 'cat': 7,
 'am': 8,
 "don't": 9,
 'forget': 10,
 'about': 11,
 'me': 12,
 'sometimes': 13,
 'giving': 14,
 'up': 15,
 'is': 16,
 'the': 17,
 'strong': 18,
 'thing': 19}

## Sequencing
Representing sentences as a sequence of tokens

In [15]:
text_seq = tokenizer.texts_to_sequences(sentences)
text_seq

[[2, 3, 1, 6],
 [2, 7, 1, 4],
 [5, 3, 1, 4, 8, 5, 9, 10, 11],
 [12, 13, 14, 15, 16, 17, 18]]

### Padding
Alternate to padding is using a Ragged Tensor

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
padded = pad_sequences(text_seq)

In [19]:
padded

array([[ 0,  0,  0,  0,  0,  2,  3,  1,  6],
       [ 0,  0,  0,  0,  0,  2,  7,  1,  4],
       [ 5,  3,  1,  4,  8,  5,  9, 10, 11],
       [ 0,  0, 12, 13, 14, 15, 16, 17, 18]])