In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np


seed_ = 20200218
np.random.seed(seed_)
tf.random.set_seed(seed_)

## Tensorflow Dataset

https://www.tensorflow.org/datasets

In [2]:
dataset, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [3]:
dataset

{'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [4]:
train_sentences = []
train_labels = []

for sentence, label in dataset['train']:
    train_sentences.append(str(sentence.numpy(), encoding='utf-8'))
    train_labels.append(label.numpy())
    
test_sentences = []
test_labels = []

for sentence, label in dataset['test']:
    test_sentences.append(str(sentence.numpy(), encoding='utf-8'))
    test_labels.append(label.numpy())

In [5]:
df_train = pd.DataFrame({
    'sentence': train_sentences,
    'label': train_labels
})

df_test = pd.DataFrame({
    'sentence': test_sentences,
    'label': test_labels
})

In [6]:
df_train.shape, df_test.shape

((25000, 2), (25000, 2))

In [7]:
df_train.head()

Unnamed: 0,sentence,label
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1


In [8]:
df_test.head()

Unnamed: 0,sentence,label
0,There are films that make careers. For George ...,1
1,"A blackly comic tale of a down-trodden priest,...",1
2,"Scary Movie 1-4, Epic Movie, Date Movie, Meet ...",0
3,Poor Shirley MacLaine tries hard to lend some ...,0
4,As a former Erasmus student I enjoyed this fil...,1


## Preprocess Data

In [9]:
# config

# vocab size
num_words = 10000

embedding_dims = 16
max_length = 120
pad_trunc = 'post'
oov_token = '<OOV>'

In [10]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, 
                                                  oov_token=oov_token)

tokenizer.fit_on_texts(df_train['sentence'])

word_index = tokenizer.word_index

In [11]:
for k in list(word_index.keys())[:10]:
    print(f"{k}: {word_index[k]}")

<OOV>: 1
the: 2
and: 3
a: 4
of: 5
to: 6
is: 7
br: 8
in: 9
it: 10


In [12]:
train_sequences = tokenizer.texts_to_sequences(df_train['sentence'])
padded_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, truncating=pad_trunc, maxlen=max_length)

In [13]:
test_sequences = tokenizer.texts_to_sequences(df_test['sentence'])
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, truncating=pad_trunc, maxlen=max_length)

## Embedding

#### Embedding i.e. vector ของ word

การทำงานของ Embedding ไม่อยู่ในคอร์สนี้

### embedding คือ...
- word มีหลายคำที่ความหมายเดียวกัน ใกล้เคียงกัน
- dull = boring, fun = exiting
- word ที่ความหมายใกล้เคียงกันจะมี vector ใกล้เคียงกัน

### ในที่นี้ IMDB review
- review (-) คำว่า dull-boring ปรากฏเยอะ แสดงว่ามันมี sentiment ใกล้เคียงกัน
    - vector ใกล้เคียงกัน

## Model

### Model 1
- model แบบง่าย NN
- หลังจาก embedding จะได้ 2D array [sentence_length, embedding_dims]
- ต้องการใส่ไปใน NN ปกติ ต้อง Flatten ก่อน

In [14]:
model_1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words, embedding_dims, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [15]:
model_1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


### model 1

- params ใน dense เยอะ
- params ที่ต้อง learn ใน embedding เกิดจาก dim*(n_words*n_words)

In [16]:
model_2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words, embedding_dims, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [17]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


## Optimizer

In [18]:
optimizer = tf.keras.optimizers.Adam()

In [19]:
model_2.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
model_2.fit(padded_train_sequences, df_train['label'], batch_size=64, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x17e79b1d1c0>

In [21]:
model_2.evaluate(padded_train_sequences, df_train['label'])



[0.053350143134593964, 0.9907199740409851]

In [22]:
model_2.evaluate(padded_test_sequences, df_test['label'])



[0.8712406158447266, 0.7870000004768372]