### 0. 連線到我的 Google Drive 獲取資料

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the following
# cells change into a folder under this mount point and read the dataset from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/nlp-getting-started

/content/drive/MyDrive/nlp-getting-started


In [None]:
import pandas as pd 
# Load the training data from the current working directory.
# Later cells read the 'text' and 'target' columns of this frame.
df = pd.read_csv('./train.csv')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### 1. 使用 Keras 來建立一個 Transformer Input Block
https://github.com/keras-team/keras-io/blob/master/examples/nlp/text_classification_with_transformer.py

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Width of the last input axis. It is used as ``key_dim`` of
            the attention layer and as the output width of the feed-forward
            network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position table; input sequences must not
            be longer than this.
        vocab_size: Number of rows in the token table; every token id must be
            smaller than this.
        embed_dim: Width of both embedding tables.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can rebuild the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        The position indices are derived from the size of the last axis of
        ``x`` at call time, not from the ``maxlen`` constructor argument.
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading axes of the token vectors.
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [None]:
# Gather every distinct whitespace-delimited token found in the 'text' column.
token_set = set()
for text in df['text'].values:
  # str.split() with no argument splits on any run of whitespace.
  token_set.update(text.split())

In [None]:
# Number of distinct whitespace-split tokens collected in token_set.
# NOTE(review): this count comes from str.split(), not from the Keras Tokenizer
# fitted later, so it may not match the range of ids the tokenizer emits — confirm.
vocab_size = len(token_set)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Settings consumed by the tokenizing / padding cell below.
num_words = 20000  # passed to Tokenizer(num_words=...)
oov_token = '<UNK>'  # passed to Tokenizer(oov_token=...)
pad_type = 'post'  # passed to pad_sequences(padding=...)
trunc_type = 'post'  # passed to pad_sequences(truncating=...)

In [None]:
# Tokenize our training data
# Fit the tokenizer on the training text
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(df['text'].values)

# Word -> integer id mapping learned from the training text
word_index = tokenizer.word_index

# Encode each training text as a list of integer ids
train_sequences = tokenizer.texts_to_sequences(df['text'].values)

# Length of the longest encoded training sequence
maxlen = max(len(seq) for seq in train_sequences)

# Pad every sequence to that length
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

# Show sizes and a short preview only: printing the complete word index and
# every encoded sequence floods the notebook output and bloats the saved file.
n_preview = 5
print("Word index size:", len(word_index))
print("Word index (first entries):\n", dict(list(word_index.items())[:n_preview]))
print("\nTraining sequences (first rows):\n", train_sequences[:n_preview])
print('\nMax Length: \n', maxlen)
print("\nPadded training sequences (first rows):\n", train_padded[:n_preview])
print("\nPadded training shape:", train_padded.shape)
print("Training sequences data type:", type(train_sequences))
print("Padded Training sequences data type:", type(train_padded))

Word index:

Training sequences:
 [[120, 4634, 25, 5, 869, 9, 22, 264, 139, 1620, 4635, 90, 41], [190, 46, 230, 800, 6955, 6956, 1405], [41, 1752, 1621, 8, 6957, 7, 6958, 25, 137, 6959, 21, 1753, 40, 442, 257, 58, 2159, 7, 715, 1406, 25, 1107], [836, 2922, 60, 4636, 1501, 257, 1406, 7, 97], [35, 101, 1222, 22, 321, 23, 6960, 2160, 31, 272, 23, 1501, 6961, 70, 6, 188], [2923, 379, 97, 1502, 801, 870, 7, 666, 6962, 564, 8, 1160, 400, 46, 4637, 1501], [219, 77, 871, 296, 1223, 837, 265, 9, 1754, 7, 6963, 1048, 2454, 1503], [47, 14, 231, 9, 5, 1934, 10, 11, 75, 111, 6, 46, 7, 5, 4638], [423, 44, 76, 257, 1305, 52, 7, 5, 595, 872, 5, 770], [47, 2455, 18, 5, 470, 12, 251, 8, 120, 279], [596, 60, 642, 23, 5, 280, 471, 36, 643], [873, 565, 2924, 12, 232, 2925, 3598, 691, 6, 716, 11, 201, 7, 565, 2924, 61, 176, 11, 322, 72, 61, 176, 11, 322, 72, 6964, 265], [2926, 265, 2161, 6965, 2924, 951, 58, 1935, 620, 359, 739, 3599], [219, 7, 4639, 952, 50, 1755, 4639], [233, 8, 188, 424, 14, 3600, 7, 292

In [None]:
embed_dim = 8  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 4  # Hidden layer size in feed forward network inside transformer

# Size the token embedding from the tokenizer that actually produced the ids,
# not from the whitespace-split vocabulary count. The tokenizer emits ids
# below num_words, id 0 is the padding value, and word_index starts at 1,
# so this bound covers every id that can appear in train_padded.
embedding_vocab_size = min(num_words, len(word_index) + 1)

# Token ids -> token + position embeddings -> one transformer block
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, embedding_vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Average over the sequence axis, then a small classification head
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(8, activation="relu")(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
import numpy as np
def shuffled_data(a, b, seed=None):
    """Shuffle two equally long sequences with one shared random permutation.

    Args:
        a: Array (or list/tuple) whose first axis indexes the samples.
        b: Array (or list/tuple) with the same length as ``a``.
        seed: Optional seed for a dedicated generator, which makes the shuffle
            reproducible. When None, NumPy's global random state is used,
            exactly as before this parameter existed.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)``; element ``i`` of both outputs comes
        from the same original position.

    Raises:
        AssertionError: If ``a`` and ``b`` differ in length.
    """
    # Plain Python sequences do not support index-array lookup; convert them.
    if isinstance(a, (list, tuple)):
        a = np.asarray(a)
    if isinstance(b, (list, tuple)):
        b = np.asarray(b)

    assert len(a) == len(b), "a and b must have the same length"

    if seed is None:
        p = np.random.permutation(len(a))
    else:
        p = np.random.default_rng(seed).permutation(len(a))
    return a[p], b[p]

In [None]:
split_ratio = 0.8  # fraction of the samples used for training
split_seed = 42  # fixes the shuffle so the split is the same on every run

# shuffled_data draws its permutation from NumPy's global random state, so
# seeding it here makes the train/validation partition reproducible.
np.random.seed(split_seed)
shuffle_x, shuffle_y = shuffled_data(train_padded, df['target'].values)

# The first split_ratio of the shuffled samples train the model; the rest validate it.
split_idx = int(len(shuffle_x) * split_ratio)
train_x, val_x = shuffle_x[:split_idx], shuffle_x[split_idx:]
train_y, val_y = shuffle_y[:split_idx], shuffle_y[split_idx:]

In [None]:
# Labels come from df['target'] and the model ends in a 2-unit softmax,
# hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split and evaluate on the validation split after
# each epoch; the returned History object is kept in `history`.
history = model.fit(
    train_x,
    train_y,
    batch_size=256,
    epochs=10,
    validation_data=(val_x, val_y),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
