# Week 1: https://www.coursera.org/learn/natural-language-processing-tensorflow/home/week/1

In [1]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [129]:
# Toy corpus for the tokenizer demo. Capitalization is mixed on purpose
# ('Dog' vs 'dog'); the word index printed further down shows only
# lowercase keys, i.e. both spellings share a single entry.
sentences = [
    'I love my dog',
    'I love my cat',
    'Do you know my cat',
    'Dog is amazing',
    'He loves my dog',
    'Freek is my name'
]

## Tokenizer

In [131]:
# The out-of-vocabulary placeholder must be a string that can never appear as
# a real word in the corpus, otherwise it would collide with that word.
tokenizer = Tokenizer(num_words=20, oov_token="<OOV>")

In [132]:
# Build the vocabulary from the toy sentences.
tokenizer.fit_on_texts(sentences)

In [133]:
# Mapping of each vocabulary word to its integer index.
word_index = tokenizer.word_index

In [134]:
# Index 1 belongs to the '<OOV>' placeholder; real words start at 2.
print(word_index)

{'<OOV>': 1, 'my': 2, 'dog': 3, 'i': 4, 'love': 5, 'cat': 6, 'is': 7, 'do': 8, 'you': 9, 'know': 10, 'amazing': 11, 'he': 12, 'loves': 13, 'freek': 14, 'name': 15}


## Apply the tokenizer to the training data

In [136]:
# Encode every sentence as a list of the integer indices of its words.
sequences = tokenizer.texts_to_sequences(sentences)

In [137]:
# One list per sentence; the lists are ragged (3 to 5 indices each).
sequences

[[4, 5, 2, 3],
 [4, 5, 2, 6],
 [8, 9, 10, 2, 6],
 [3, 7, 11],
 [12, 13, 2, 3],
 [14, 7, 2, 15]]

## Padding

In [139]:
# Turn the ragged list of sequences into a rectangular matrix with 3 columns.
padded = pad_sequences(
    sequences,
    maxlen=3,
    padding='post',     # shorter sequences get zeros appended at the end
    truncating='post',  # longer sequences lose their trailing tokens
)


In [140]:
# Every row now holds exactly 3 indices; sentences with more words lost their tail.
padded

array([[ 4,  5,  2],
       [ 4,  5,  2],
       [ 8,  9, 10],
       [ 3,  7, 11],
       [12, 13,  2],
       [14,  7,  2]])

## Apply the tokenizer to the test data

In [146]:
# Unseen sentences: most of their words ('hello', 'world', 'owns', 'me',
# 'some', 'money') are absent from the vocabulary fitted on the toy corpus.
test_data =[
    'Hello my world',
    'He owns me some money'
]

In [147]:
# Encode with the tokenizer fitted on the toy corpus (it is not re-fitted here).
test_seq = tokenizer.texts_to_sequences(test_data)

In [150]:
# No padding/truncating side is given, so the defaults apply; the output below
# shows both act at the front: short rows gain leading zeros and long rows
# keep only their last 4 tokens.
# NOTE: this rebinds `padded`, replacing the training matrix built earlier.
padded = pad_sequences(test_seq, maxlen=4)

In [151]:
# Index 1 is '<OOV>'. The three-word first sentence is left-padded with a 0;
# the five-word second sentence is cut down to its last four tokens.
padded

array([[0, 1, 2, 1],
       [1, 1, 1, 1]])

## Process public data

In [38]:
import json
# import urllib
# import requests

In [183]:
# !wget --no-check-certificate \
#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
#     -O /tmp/sarcasm.json

In [114]:
# Load the sarcasm dataset, parsing one JSON object per line of the file.
# The context manager closes the file handle as soon as parsing finishes, the
# explicit encoding avoids depending on the platform default, and blank lines
# (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open('data/sarcasm.json', 'r', encoding='utf-8') as sarcasm_file:
    sarcasm_data = [json.loads(line) for line in sarcasm_file if line.strip()]

In [159]:
# Split the records into three parallel lists that share the same index.
# (Original author's note: 7082 records.)
# NOTE: `sentences` is rebound here and no longer refers to the toy corpus.
sentences = [record['headline'] for record in sarcasm_data]
labels = [record['is_sarcastic'] for record in sarcasm_data]
urls = [record['article_link'] for record in sarcasm_data]

## Tokenizer

In [161]:
# Separate tokenizer for the headlines; unlike the toy tokenizer above,
# no num_words limit is passed.
tokenizer2 = Tokenizer(oov_token="<OOV>")

In [162]:
# Build the vocabulary from the sarcasm headlines.
tokenizer2.fit_on_texts(sentences)

In [163]:
# NOTE: rebinds `word_index`; it now refers to the headline vocabulary.
word_index = tokenizer2.word_index

In [172]:
# Encode every headline as a list of word indices.
# NOTE: rebinds `sequences`, replacing the toy-corpus sequences.
sequences = tokenizer2.texts_to_sequences(sentences) # sequence from text

In [176]:
# Zeros are appended to headlines shorter than 8 tokens. `truncating` is not
# passed, so the default applies; judging by the test-data output earlier it
# cuts from the front, so longer headlines presumably keep only their last
# 8 tokens. TODO confirm this asymmetry with padding='post' is intended.
# NOTE: this rebinds `padded` once more.
padded = pad_sequences(sequences, padding='post', maxlen=8)