<a href="https://colab.research.google.com/github/cool60334/111-TibaMe-AI-03-Deep-Learning/blob/main/DL_IMDB%E5%BD%B1%E8%A9%95%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)  # 取得路徑：/root/.keras/datasets
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))  # /root/.keras/datasets/aclImdb/train/pos/* 
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df
train_df = getdata("train")
test_df = getdata("test")

# 建立模型

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

layers = [
    
    # 一篇文章蒐集512個詞彙, 這512詞彙是在3000最常用單字裡, 每一個詞彙化作100維度的向量

    Embedding(
          3001,  # 3001: 3000種常用詞彙 + 1個padding(0)
          100,  # 100: 訓練後的情緒個數(100-500)，問題越難參數設越多
          mask_zero=True,  # mask_zero=True -> 代表把詞數不夠的文章補0，之後不會帶入訓練
          input_length=512  # input_length: 一篇文章要看幾個詞彙(128-512)
          ),

    GlobalAveragePooling1D(),  # 只有100維度的向量，所以使用GAP1D
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


# 確定模型訓練方式

In [4]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer="adam",
       metrics=["accuracy"])

# 處理訓練資料

In [5]:
# Tokenizer -> 把詞轉換成向量(數字)

from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)  # 取3000個常用詞
tok.fit_on_texts(train_df["content"])

# 看一下fit的結果
# tok.word_index
# tok.index_word

In [6]:
# texts_to_sequences -> 將文章內的詞轉換成向量(數字)，並把3000常用詞以外的詞去除

x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,47,23,168,99,12,972,5,1751,14.0,70.0,...,,,,,,,,,,
1,1,1404,43,14,32,4,295,110,2.0,86.0,...,,,,,,,,,,
2,1542,1,744,16,731,351,36,1018,2016.0,2.0,...,,,,,,,,,,
3,1314,48,3,1,6,3,2064,16,955.0,4.0,...,,,,,,,,,,
4,10,293,1,1467,20,1,285,100,316.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,21,57,2047,437,31,3,4,475,106.0,153.0,...,,,,,,,,,,
24996,267,8,1222,2416,3,182,325,229,494.0,5.0,...,,,,,,,,,,
24997,11,19,2026,2024,16,2984,1039,34,45.0,100.0,...,,,,,,,,,,
24998,41,32,531,593,10,414,1764,146,11.0,391.0,...,,,,,,,,,,


In [7]:
# pad_sequences -> 將文章截長補短

from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding = "pre" (預設值) -> 將文章內3000常用詞沒出現的詞補0，前面已做過
# truncating = "pre" (預設值) -> 保留後半部的文章

x_train_pad = pad_sequences(x_train_seq, maxlen=512)  # 保留文章最後512個詞
x_test_pad = pad_sequences(x_test_seq, maxlen=512)  # 保留文章最後512個詞
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,9,45,5,25,46,8,1,133,7,7
1,0,0,0,0,0,0,0,0,0,0,...,1110,450,12,1025,5,94,1,4,102,218
2,0,0,0,0,0,0,0,0,0,0,...,1484,2665,16,22,205,100,22,560,1,747
3,0,0,0,0,0,0,0,0,0,0,...,81,2,1041,718,12,90,69,852,58,389
4,0,0,0,0,0,0,0,0,0,0,...,8,60,11,19,6,726,5,10,383,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,604,11,19,14,9,6,16,32,8,226
24996,53,2,10,166,9,244,251,5,261,27,...,2,463,65,504,227,125,71,11,20,29
24997,0,0,0,0,0,0,0,0,0,0,...,245,198,20,175,42,50,71,10,67,190
24998,0,0,0,0,0,0,0,0,0,0,...,17,141,27,1146,54,1932,71,37,3,339


In [8]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])
y_train

array([1, 1, 1, ..., 0, 0, 0])

# 訓練模型

In [9]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

callbacks = [
    # ModelCheckpoint("sentiment.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]

model.fit(
      x_train_pad,
      y_train,
      batch_size=200,
      epochs=50,
      validation_split=0.1,
      callbacks=callbacks,
      verbose=2
     )

Epoch 1/50
113/113 - 3s - loss: 0.6422 - accuracy: 0.6300 - val_loss: 0.6657 - val_accuracy: 0.5976 - 3s/epoch - 27ms/step
Epoch 2/50
113/113 - 1s - loss: 0.4896 - accuracy: 0.8156 - val_loss: 0.4905 - val_accuracy: 0.7836 - 522ms/epoch - 5ms/step
Epoch 3/50
113/113 - 1s - loss: 0.3816 - accuracy: 0.8603 - val_loss: 0.4373 - val_accuracy: 0.8044 - 586ms/epoch - 5ms/step
Epoch 4/50
113/113 - 1s - loss: 0.3269 - accuracy: 0.8762 - val_loss: 0.3659 - val_accuracy: 0.8452 - 571ms/epoch - 5ms/step
Epoch 5/50
113/113 - 1s - loss: 0.2947 - accuracy: 0.8886 - val_loss: 0.3487 - val_accuracy: 0.8532 - 574ms/epoch - 5ms/step
Epoch 6/50
113/113 - 1s - loss: 0.2732 - accuracy: 0.8963 - val_loss: 0.3482 - val_accuracy: 0.8552 - 533ms/epoch - 5ms/step
Epoch 7/50
113/113 - 1s - loss: 0.2575 - accuracy: 0.9020 - val_loss: 0.3449 - val_accuracy: 0.8556 - 561ms/epoch - 5ms/step
Epoch 8/50
113/113 - 1s - loss: 0.2457 - accuracy: 0.9062 - val_loss: 0.3637 - val_accuracy: 0.8464 - 539ms/epoch - 5ms/step
Ep

<keras.callbacks.History at 0x7f0b02bc60d0>

In [10]:
model.evaluate(x_test_pad, y_test)



[0.29029127955436707, 0.8804799914360046]

# 真實電影評價正負評

In [11]:
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
infer = Sequential(layers)
infer.layers[0].set_weights(model.layers[0].get_weights())  # 把之前訓練好的第一層係數拿來用，並且不限input的字數
infer.layers[2].set_weights(model.layers[2].get_weights())  # 把之前訓練好的第三層係數拿來用
infer.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [22]:
comment = "Avatar sequel recipe: throw Avatar and Titanic into blender, add general helping of corny cliches plus plenty of water, result is three hours of stunning visuals continuously interrupted by toe-cringing dialogue and plot.  Typical Hollywood, while there is no limit to expense when it comes to 3D special effects, very little is spent on actually preparing an interesting storyline - which here is completely one dimensional - or spent on creating characters with depth - which in this case all are completely 2D. As a matter of fact, I recall just one surprising behaviour in the whole film. The overall experience is a strangely building numbness to special effects accompanied by a sense of boredom with plot and increasing exasperation with dialogue." #@param {type:"string"}
seq = tok.texts_to_sequences([comment])
prob = infer.predict(seq)[0]
trans = ["neg", "pos"]
for t, p in zip(trans, prob):
    print(t, "的機率:", p)

neg 的機率: 0.9268646
pos 的機率: 0.07313537


# 詞語相似度比對

In [23]:
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
]
infer = Sequential(layers)
infer.layers[0].set_weights(model.layers[0].get_weights())
infer.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_2   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
Total params: 300,100
Trainable params: 300,100
Non-trainable params: 0
_________________________________________________________________


In [32]:
# 以方向的cos角度來比較相似性
# 0度：正面相似cos=1，90度：不相似cos=0，180度：反面相似cos=-1

from scipy.spatial.distance import cosine

comment1 = "joy" #@param {type:"string"}
comment2 = "pleasure" #@param {type:"string"}

seq1 = tok.texts_to_sequences([comment1])
v1 = infer.predict(seq1)[0]

seq2 = tok.texts_to_sequences([comment2])
v2 = infer.predict(seq2)[0]

# 越趨近於1越相似
print("相似度:", 1 - cosine(v1, v2))

相似度: 0.9885824918746948
