In [10]:
import json

#import tensorflow as tf
#from   tensorflow import keras
from   tensorflow.keras.preprocessing.sequence import pad_sequences
from   tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Toy corpus used to demonstrate tokenization, OOV handling and padding.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

In [3]:
# Fit a tokenizer on the toy corpus. num_words=100 is the cap on how many of the
# most frequent words are used when encoding (see the Keras Tokenizer docs); the
# corpus has only 10 distinct words, so the cap has no effect here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
# word_index maps each word to an integer id. As the output shows, words are
# lower-cased, punctuation is stripped ('dog!' -> 'dog'), ids start at 1, and
# more frequent words appear to receive smaller ids.
word_index = tokenizer.word_index
word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'do': 7,
 'think': 8,
 'is': 9,
 'amazing': 10}

In [4]:
# Encode every sentence as a list of word ids (one list per sentence),
# then print each sentence next to its encoding.
sequences = tokenizer.texts_to_sequences(sentences)
for sent, seq in zip(sentences, sequences):
    print(f'{sent}: {seq}')

I love my dog: [4, 2, 1, 3]
I love my cat: [4, 2, 1, 6]
You love my dog!: [5, 2, 1, 3]
Do you think my dog is amazing?: [7, 5, 8, 1, 3, 9, 10]


### Out-of-Vocabulary Token `<OOV>` (a.k.a. `<UNK>`)

In [5]:
# Rebuild the tokenizer with an explicit out-of-vocabulary token. Compared with
# the encodings printed by the previous tokenizer, every word id is shifted up
# by one (e.g. 'my' goes from 1 to 2), which frees id 1 for '<OOV>'.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
# Rebinds the notebook-level word_index to the new mapping (not displayed in this cell).
word_index = tokenizer.word_index
# Re-encode the same corpus with the new ids and print sentence/encoding pairs.
sequences = tokenizer.texts_to_sequences(sentences)
for sent, seq in zip(sentences, sequences):
    print(f'{sent}: {seq}')

I love my dog: [5, 3, 2, 4]
I love my cat: [5, 3, 2, 7]
You love my dog!: [6, 3, 2, 4]
Do you think my dog is amazing?: [8, 6, 9, 2, 4, 10, 11]


In [6]:
# Encode sentences containing words the tokenizer was not fitted on
# ('really', 'loves', 'manatee'). In the printed result each of those words is
# encoded as 1, the '<OOV>' id, so every sequence keeps one id per word.
test_data = ['I really love my dog', 'my dog loves my manatee']
test_seq = tokenizer.texts_to_sequences(test_data)
for sent, seq in zip(test_data, test_seq):
    print(f'{sent}: {seq}')

I really love my dog: [5, 1, 3, 2, 4]
my dog loves my manatee: [2, 4, 1, 2, 1]


### Padding

In [7]:
# Pad the ragged list of sequences into a rectangular integer array.
# With the defaults used here, zeros are prepended and the width equals the
# longest sequence (7 ids), as the displayed array shows.
padded = pad_sequences(sequences) 
# Other useful params (see the Keras pad_sequences docs): padding='post' pads at
# the end instead, maxlen=7 fixes the width, truncating='post' cuts overly long
# sequences from the end.
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  0,  0,  6,  3,  2,  4],
       [ 8,  6,  9,  2,  4, 10, 11]], dtype=int32)

# Sarcastic Headlines Corpus

In [15]:
# Dataset source:
# https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
#
# Load the corpus from the local copy. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding (e.g. cp1252 on
# Windows), which can fail or mis-decode non-ASCII headlines; JSON text is UTF-8.
with open('../../../data/sarcasm.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [16]:
# Peek at the first record to see its fields: 'article_link', 'headline' and
# 'is_sarcastic' (0 for this example).
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [17]:
# Split the records into three parallel lists, preserving the order of `data`:
# headline text, sarcasm label, and source article URL.
sentences = [record['headline'] for record in data]
labels = [record['is_sarcastic'] for record in data]
urls = [record['article_link'] for record in data]

In [18]:
# Fit a fresh tokenizer on the headline corpus. No num_words argument is passed
# this time; '<OOV>' is still configured as the token for unseen words.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

In [25]:
# Report the vocabulary size, then preview the first ten word -> id entries.
word_index = tokenizer.word_index
vocab_size = len(word_index)
print(vocab_size)

first_entries = list(word_index.items())[:10]
for k, v in first_entries:
    print(f'{k}: {v}')

29657
<OOV>: 1
to: 2
of: 3
the: 4
in: 5
for: 6
a: 7
on: 8
and: 9
with: 10


In [26]:
# Encode every headline as word ids, then pad with trailing zeros
# (padding='post') so all rows share one length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Inspect one example (index 2): the raw headline, its padded encoding,
# and the shape of the full padded array.
print(sentences[2], padded[2], padded.shape, sep='\n')

mom starting to fear son's web series closest thing she will have to grandchild
[  145   838     2   907  1749  2093   582  4719   221   143    39    46
     2 10736     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)
