In [1]:
import os
import re
import string

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, TextVectorization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

  from .autonotebook import tqdm as notebook_tqdm


# データセットの用意

事前学習済み単語埋め込みとしてGloVe, 分類データセットとしてIMDBレビューデータセットを用いる.

In [1]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d data

--2022-03-16 11:19:31--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-03-16 11:19:32--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-03-16 11:22:15 (5.07 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: data/glove.6B.50d.txt   
  inflating: data/glove.6B.100d.txt  
  inflating: data/glove.6B.200d.txt  
  inflating: data/glove.6B.300d.txt  


In [2]:
# The IMDB review dataset is loaded through TensorFlow Datasets.
# The first 80% of the official train split is used for training, the remaining
# 20% for validation, and the official test split for the final evaluation.
# as_supervised=True makes every element a (text, label) pair.
train_data, validation_data, test_data = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    split=("train[:80%]", "train[80%:]", "test"),
)

2022-03-16 17:05:25.290799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0a:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-16 17:05:25.388855: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0a:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-16 17:05:25.389069: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0a:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-16 17:05:25.390572: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [3]:
# Sample texts: negative reviews are labeled 0, positive reviews are labeled 1.
# Pull the first batch of 10 (text, label) pairs out of the training split.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

2022-03-16 17:05:32.087195: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [4]:
# Labels that belong to the 10 sample reviews taken from the training split.
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>

In [5]:
# Variable definitions
BASE_DIR = "data"

# Parameters for model training
MAX_SEQUENCE_LENGTH = 250  # every review is padded/truncated to this many tokens
MAX_NUM_WORDS = 20000      # upper bound on the vocabulary size
EMBEDDING_DIM = 100        # width of the GloVe vectors (the 6B archive ships 50/100/200/300)

# The GloVe file name is derived from EMBEDDING_DIM so the file that is read
# and the embedding width can never drift apart when the dimension is changed.
GLOVE_PATH = os.path.join(BASE_DIR, "glove.6B.%dd.txt" % EMBEDDING_DIM)

# 前処理

In [6]:
def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    Three steps are applied in order: the text is lowercased, every literal
    "<br />" is replaced by a single space, and every character listed in
    ``string.punctuation`` is removed.

    Args:
        input_data: string tensor holding the raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class that matches any single punctuation character.
    punctuation_class = "[%s]"%re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, "")

In [7]:
# Standardize, tokenize and vectorize the data.
# Each text becomes a fixed-length sequence of integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=MAX_NUM_WORDS,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    output_mode="int",
    standardize=custom_standardization,
)

In [8]:
# Call adapt() to build the vocabulary.
# Only the review text is needed for that, so the labels are dropped first.
train_text = train_data.map(lambda review, _label: review)
vectorize_layer.adapt(train_text)

In [9]:
# Conversion result: the first raw review, followed by the integer-encoded
# form of the whole sample batch.
print(train_examples_batch[0])
print(vectorize_layer(train_examples_batch))

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(
[[  11   13   33 ...    0    0    0]
 [  10   25   74 ...    0    0    0]
 [4149 5732    2 ...    0    0    0]
 ...
 [   2   19    7 ...    0    0    0]
 [  10   62  112 ...    0    0    0]
 [ 247   11   28 ...    0    0    0]], shape=(10, 2

In [10]:
# Size of the vocabulary held by the vectorization layer after adapt().
vectorize_layer.vocabulary_size()

20000

In [11]:
# Conversion helper used when mapping over the datasets.
def vectorize_text(text, label):
    """Encode `text` with `vectorize_layer` and pass `label` through unchanged.

    Returns:
        A `(token_ids, label)` tuple.
    """
    token_ids = vectorize_layer(text)
    return token_ids, label

# Quick check on the sample batch.
vectorize_text(train_examples_batch, train_labels_batch)

(<tf.Tensor: shape=(10, 250), dtype=int64, numpy=
 array([[  11,   13,   33, ...,    0,    0,    0],
        [  10,   25,   74, ...,    0,    0,    0],
        [4149, 5732,    2, ...,    0,    0,    0],
        ...,
        [   2,   19,    7, ...,    0,    0,    0],
        [  10,   62,  112, ...,    0,    0,    0],
        [ 247,   11,   28, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>)

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline.
# cache() replays exactly the elements it saw during the first pass, so it has
# to come BEFORE shuffle(): caching after the shuffle would freeze the
# first-epoch order and make reshuffle_each_iteration=True a no-op.
# The raw (text, label) pairs are cached, reshuffled on every iteration,
# batched, and only then vectorized.
train_ds = (
    train_data
    .cache()
    .shuffle(buffer_size=len(train_data), reshuffle_each_iteration=True)
    .batch(32)
    .map(vectorize_text, num_parallel_calls=AUTOTUNE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation and test pipelines are never shuffled, so the vectorized batches
# themselves can be cached safely.
val_ds = (
    validation_data
    .batch(32)
    .map(vectorize_text)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
test_ds = (
    test_data
    .batch(32)
    .map(vectorize_text)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [13]:
# Prepare the embedding matrix.
# Step 1: map every GloVe word to its pretrained vector.
embeddings_index = {}
# The encoding is given explicitly so that parsing the GloVe text file does not
# depend on the platform's default locale encoding.
with open(GLOVE_PATH, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:  # tolerate a blank line instead of raising IndexError
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))

# Step 2: build the matrix. Rows are vocabulary entries, columns are the GloVe
# embedding dimensions. Row i belongs to the token whose integer id is i, so the
# matrix needs exactly one row per vocabulary entry (the vocabulary is already
# capped by max_tokens on the vectorization layer).
vocabulary = vectorize_layer.get_vocabulary()
num_words = len(vocabulary)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for i, word in enumerate(vocabulary):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:  # words missing from GloVe keep the zero vector
        embedding_matrix[i] = embedding_vector

# Note that trainable=False is set so the pretrained embeddings are not updated.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
    mask_zero=True,
)

Found 400000 word vectors in Glove embeddings.


# 事前学習済み埋め込みを用いたCNN

In [14]:
# Build the model: frozen GloVe embedding -> three Conv1D stages -> dense head.
cnnmodel = Sequential([
    embedding_layer,
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnnmodel.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    # Spelled 'accuracy' (not the 'acc' alias) so the metric name matches the
    # other models compiled in this notebook.
    metrics=['accuracy'],
)

# Train the model
cnnmodel.fit(
    train_ds,
    validation_data=val_ds,
    epochs=1,
)

# Evaluate the model; evaluate() returns the loss first, then the compiled metric.
score, acc = cnnmodel.evaluate(test_ds)
print('Test accuracy with CNN:', acc)

2022-03-16 17:06:54.594160: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8204


Test accuracy with CNN: 0.8150799870491028


# CNN

In [15]:
# Build the model: same CNN stack, but with an embedding learned from scratch.
# This rebinds the name `cnnmodel`.
cnnmodel = Sequential([
    Embedding(MAX_NUM_WORDS, 128),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnnmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
cnnmodel.fit(train_ds, validation_data=val_ds, epochs=1)

# Evaluate; evaluate() returns the loss first, then the accuracy.
score, acc = cnnmodel.evaluate(test_ds)
print('Test accuracy with CNN:', acc)

Test accuracy with CNN: 0.8624399900436401


# LSTM

In [16]:
# Build the model: trainable embedding (id 0 masked as padding) -> LSTM -> sigmoid.
rnnmodel = Sequential([
    Embedding(MAX_NUM_WORDS, 128, mask_zero=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
rnnmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
rnnmodel.fit(train_ds, validation_data=val_ds, epochs=1)

# Evaluate the model; evaluate() returns the loss first, then the accuracy.
score, acc = rnnmodel.evaluate(test_ds)
print('Test accuracy with RNN:', acc)





Test accuracy with RNN: 0.8361999988555908


# 事前学習済み埋め込みを用いたLSTM

In [18]:
# Build the model: frozen GloVe embedding layer -> LSTM -> sigmoid.
rnnmodel2 = Sequential([
    embedding_layer,
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
rnnmodel2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
rnnmodel2.fit(train_ds, validation_data=val_ds, epochs=1)

# Evaluate the model; evaluate() returns the loss first, then the accuracy.
score, acc = rnnmodel2.evaluate(test_ds)
print('Test accuracy with RNN:', acc)





Test accuracy with RNN: 0.7925199866294861
