In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# จะใช้ Tensorflow API มา Encoding word กัน

- แทน word ด้วยตัวเลขอะไรก็ได้

In [2]:
# Load the raw lyric file; every element of `lines` keeps its trailing newline.
with open("./data/ashes.txt", mode="r", encoding="utf-8") as lyrics_file:
    lines = list(lyrics_file)

In [3]:
# Clean the texts (my favorite song lyric).
# Each raw line is stripped exactly once, and lines that end up empty are
# dropped. Using strip() instead of strip('\n') also removes whitespace-only
# lines and stray leading/trailing spaces, which would otherwise survive the
# filter and become junk entries in `texts`.

texts = [line for line in (raw.strip() for raw in lines) if line]

In [4]:
# Inspect the cleaned lyric lines (one string per non-empty line of the file).
texts

['I could hear the signs calling out from the bottom of the fire',
 'I am like a torch flickering in the wind as the saying goes',
 'Lost all my precious',
 'Rage ate me up',
 'Endless forlornness has made me numb',
 "I'd rather rise from here",
 'Or should I hold on to my past?',
 'They’ve burnt to ashes',
 'Faded to grey',
 'Returned to the earth',
 "Yes it's meant to be",
 'Uncertain flame of hope I found',
 'Will you lead me back on the right track?']

## TensorFlow Tokenizer

มีหลายวิธีที่จะ encoding โฟกัสที่อันนี้ก่อน

In [5]:
# config

# Vocabulary size limit handed to Tokenizer(num_words=...) in the cells below.
num_words = 100

### Tokenizer

- กำหนดจำนวนคำมากที่สุดที่จะเก็บ (โดยความถี่ที่ปรากฏ)
- Tokenizer แปลงข้อความเป็นตัวพิมพ์เล็ก (lower case) และตัดเครื่องหมายวรรคตอน (punctuation) ออกให้โดยอัตโนมัติ

In [6]:
# Create a word-level tokenizer capped at `num_words` and learn the
# vocabulary from the cleaned lyric lines.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)

# Mapping of word -> integer index built by fit_on_texts (indices start at 1).
word_index = tokenizer.word_index

#### ค่า `num_words` จะถูกนำไปใช้ในขั้น `texts_to_sequences` (ส่วน `word_index` ยังเก็บคำทั้งหมดไว้)

In [7]:
# Print the first 10 (word, index) pairs of the fitted vocabulary.
for word, idx in list(word_index.items())[:10]:
    print(f"{word}: {idx}")

the: 1
to: 2
i: 3
me: 4
from: 5
of: 6
my: 7
on: 8
could: 9
hear: 10


## แปลงข้อความให้กลายเป็นตัวเลข

sentence -> sequence

In [8]:
# Turn every lyric line into a list of integer word indices (sentence -> sequence).
sequences = tokenizer.texts_to_sequences(texts)
# Bare expression so the notebook displays the result.
sequences

[[3, 9, 10, 1, 11, 12, 13, 5, 1, 14, 6, 1, 15],
 [3, 16, 17, 18, 19, 20, 21, 1, 22, 23, 1, 24, 25],
 [26, 27, 7, 28],
 [29, 30, 4, 31],
 [32, 33, 34, 35, 4, 36],
 [37, 38, 39, 5, 40],
 [41, 42, 3, 43, 8, 2, 7, 44],
 [45, 46, 2, 47],
 [48, 2, 49],
 [50, 2, 1, 51],
 [52, 53, 54, 2, 55],
 [56, 57, 6, 58, 3, 59],
 [60, 61, 62, 4, 63, 8, 1, 64, 65]]

## ปัญหาถ้ามีคำที่ไม่รู้จัก

คำที่ไม่อยู่ใน word index จะถูกข้ามไปโดยไม่ถูก encode ดังนั้น sequence ที่ได้จะมีเฉพาะคำที่อยู่ใน word index เท่านั้น

In [9]:
# Encode a sentence the tokenizer was not fitted on: words missing from the
# vocabulary are silently dropped, so only "the" and "made" are encoded here.
tokenizer.texts_to_sequences(["The peaceful times have made us blind"])

[[1, 35]]

## `oov_token`, Out of Vocab ใน Tokenizer

ใช้เพื่อแก้ปัญหาที่ประโยคใหม่มีคำซึ่งไม่อยู่ใน word index ที่สร้างไว้

In [10]:
# Rebuild the tokenizer, this time reserving an "<OOV>" placeholder for words
# that are not part of the fitted vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=num_words,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Show the first 10 vocabulary entries; "<OOV>" now occupies index 1.
for word, idx in list(word_index.items())[:10]:
    print(f"{word}: {idx}")

<OOV>: 1
the: 2
to: 3
i: 4
me: 5
from: 6
of: 7
my: 8
on: 9
could: 10


In [11]:
# Same unseen sentence with the OOV-aware tokenizer: unknown words are now
# kept in place and encoded as index 1 ("<OOV>") instead of being dropped.
tokenizer.texts_to_sequences(["The peaceful times have made us blind"])

[[2, 1, 1, 1, 36, 1, 1]]

## ปัญหาแต่ละ sequence ยาวไม่เท่ากัน

In [12]:
# Re-encode the lyric lines with the OOV-aware tokenizer; every index shifts
# up by one compared with the first encoding because "<OOV>" takes index 1.
sequences = tokenizer.texts_to_sequences(texts)

In [13]:
# Inspect the re-encoded sequences; note that their lengths differ per line.
sequences

[[4, 10, 11, 2, 12, 13, 14, 6, 2, 15, 7, 2, 16],
 [4, 17, 18, 19, 20, 21, 22, 2, 23, 24, 2, 25, 26],
 [27, 28, 8, 29],
 [30, 31, 5, 32],
 [33, 34, 35, 36, 5, 37],
 [38, 39, 40, 6, 41],
 [42, 43, 4, 44, 9, 3, 8, 45],
 [46, 47, 3, 48],
 [49, 3, 50],
 [51, 3, 2, 52],
 [53, 54, 55, 3, 56],
 [57, 58, 7, 59, 4, 60],
 [61, 62, 63, 5, 64, 9, 2, 65, 66]]

เมื่อต้องการ feed ข้อมูลเข้า NN ต้องการให้มี shape เดียวกัน

แก้ปัญหา sequence ยาวไม่เท่ากันด้วย `padding`

## Padding

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Default padding: shorter sequences are left-padded with 0 up to the length
# of the longest sequence (13 tokens for this data).
pad_sequences(sequences)

array([[ 4, 10, 11,  2, 12, 13, 14,  6,  2, 15,  7,  2, 16],
       [ 4, 17, 18, 19, 20, 21, 22,  2, 23, 24,  2, 25, 26],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 27, 28,  8, 29],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 30, 31,  5, 32],
       [ 0,  0,  0,  0,  0,  0,  0, 33, 34, 35, 36,  5, 37],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 38, 39, 40,  6, 41],
       [ 0,  0,  0,  0,  0, 42, 43,  4, 44,  9,  3,  8, 45],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 46, 47,  3, 48],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 49,  3, 50],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 51,  3,  2, 52],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 53, 54, 55,  3, 56],
       [ 0,  0,  0,  0,  0,  0,  0, 57, 58,  7, 59,  4, 60],
       [ 0,  0,  0,  0, 61, 62, 63,  5, 64,  9,  2, 65, 66]])

In [16]:
# padding='post': the 0 padding is appended after the tokens instead of before.
pad_sequences(sequences, padding='post')

array([[ 4, 10, 11,  2, 12, 13, 14,  6,  2, 15,  7,  2, 16],
       [ 4, 17, 18, 19, 20, 21, 22,  2, 23, 24,  2, 25, 26],
       [27, 28,  8, 29,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [30, 31,  5, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [33, 34, 35, 36,  5, 37,  0,  0,  0,  0,  0,  0,  0],
       [38, 39, 40,  6, 41,  0,  0,  0,  0,  0,  0,  0,  0],
       [42, 43,  4, 44,  9,  3,  8, 45,  0,  0,  0,  0,  0],
       [46, 47,  3, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [49,  3, 50,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [51,  3,  2, 52,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [53, 54, 55,  3, 56,  0,  0,  0,  0,  0,  0,  0,  0],
       [57, 58,  7, 59,  4, 60,  0,  0,  0,  0,  0,  0,  0],
       [61, 62, 63,  5, 64,  9,  2, 65, 66,  0,  0,  0,  0]])

In [17]:
# Force a fixed length of 10: sequences longer than 10 lose tokens from the
# front (truncating='pre'), shorter ones are still left-padded with 0.
pad_sequences(sequences, maxlen=10, truncating='pre')

array([[ 2, 12, 13, 14,  6,  2, 15,  7,  2, 16],
       [19, 20, 21, 22,  2, 23, 24,  2, 25, 26],
       [ 0,  0,  0,  0,  0,  0, 27, 28,  8, 29],
       [ 0,  0,  0,  0,  0,  0, 30, 31,  5, 32],
       [ 0,  0,  0,  0, 33, 34, 35, 36,  5, 37],
       [ 0,  0,  0,  0,  0, 38, 39, 40,  6, 41],
       [ 0,  0, 42, 43,  4, 44,  9,  3,  8, 45],
       [ 0,  0,  0,  0,  0,  0, 46, 47,  3, 48],
       [ 0,  0,  0,  0,  0,  0,  0, 49,  3, 50],
       [ 0,  0,  0,  0,  0,  0, 51,  3,  2, 52],
       [ 0,  0,  0,  0,  0, 53, 54, 55,  3, 56],
       [ 0,  0,  0,  0, 57, 58,  7, 59,  4, 60],
       [ 0, 61, 62, 63,  5, 64,  9,  2, 65, 66]])