In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(  #用於下載文件並將其保存到本地文件系統中
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True, #解壓縮
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df
train_df = getdata("train")
test_df = getdata("test")

In [3]:
test_df

Unnamed: 0,content,sentiment
0,ROCK STAR / (2001) *** (out of four)<br /><br ...,1
1,The Cure is a fantastic film about a boy with ...,1
2,"Not having seen this film in quite some time, ...",1
3,One of the flat-out drollest movies of all-tim...,1
4,"NVA combines eastalgia-humor, military comedy ...",1
...,...,...
24995,Strangers with candy overacts in all the wrong...,0
24996,"What a disaster! Normally, when one critiques ...",0
24997,The Robot vs. the Aztec Mummy was one of the s...,0
24998,I'm not prone to ranting and my expectations w...,0


在使用Keras的Embedding層時，mask_zero=True的意義是指在輸入序列中的零值將被當做"padding"，也就是無意義的填充值，因此這些零值將被標記為"masked"，在後續的運算中將被忽略掉，這有助於提高模型運算效率。例如，在進行序列的輸入時，如果序列長度不一，需要在較短的序列末尾填充零值以保持序列維度的一致性，但這些零值對模型的輸出並沒有貢獻，使用mask_zero=True可以讓模型忽略掉這些填充值的影響

GlobalAveragePooling1D 是一個池化層(pooling layer)，它是將一個 1D 的特徵序列進行池化，把每個序列的所有特徵值取平均，得到一個特徵向量作為全局池化的輸出。具體來說，GlobalAveragePooling1D 的作用是將一個序列的每個特徵的平均值作為序列的概括，可以減少序列的維度，並且可以捕捉序列的全局信息。

在深度學習中，GlobalAveragePooling1D 通常用於減少序列的維度，從而減少模型的參數量，防止過度擬合等。在文本分類等任務中，可以將一個文本序列映射成一個固定長度的向量，進行分類或回歸等任務。

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
# 3000種常用詞彙+1padding(0): 每一篇文章進入的時候只取512在常用詞彙列表的詞, 每一個詞化做100維度的語意像量
layers = [
    # 沒有激活, 3001(種詞彙) * 100 -> 300100
    Embedding(input_dim=3001, output_dim=100, mask_zero=True, input_length=512),#出現0直接忽略mask_zero=True
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [5]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       metrics=["accuracy"],
       optimizer="adam")
     

例如，如果我们使用 Tokenize 将一句话转换成数字，且这句话只包含前 3000 个常用单词，那么 input_dim 的值就应该是 3000。这样模型就能够处理这些单词的向量表示了，而超出这 3000 个单词的单词则会被舍弃或者转换成一个特定的词汇

In [6]:
# Tokenize: 詞彙換成數字, 建立一個3000常用詞彙辭典
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])#使用 fit_on_texts 方法将 Tokenizer 与数据拟合，以生成词汇表并计算出每个单词在数据中出现的频率。这样可以将文本转换为数字标记序列，
     

In [7]:

# tok.word_index
# tok.index_word
# 檢查: 這個case, 標點和換行是可以去掉的
# tok.word_index["?"]
# 停用詞(忽略一些無意義的): 不用, 根據答案就會把無意義的東西調整出來
     

Sequenize 是将文本数据中的单词序列转换为数字序列的过程，也就是将每个单词用其在词汇表中对应的编号替代。这样做的目的是让计算机能够更好地理解和处理文本数据。在这个例子中，使用了 tok.texts_to_sequences() 方法将训练集和测试集的文本数据转换为数字序列，并将其存储在 x_train_seq 和 x_test_seq 中。 pd.DataFrame(x_train_seq) 用于将 x_train_seq 转换为 Pandas DataFrame 格式并输出到控制台中。

In [8]:
import pandas as pd
# Sequenize: 把我的字轉換成數字(利用剛剛列表)
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)
     

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,389,1357,7,7,51,10,605,30,1.0,15.0,...,,,,,,,,,,
1,57,148,11,17,6,176,151,54,548.0,86.0,...,,,,,,,,,,
2,10,1056,3,224,4,11,20,1067,695.0,2.0,...,,,,,,,,,,
3,10,1816,120,16,3,324,1918,705,18.0,204.0,...,,,,,,,,,,
4,3,758,4,1059,309,392,1534,294,3.0,2735.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,2,419,38,93,140,11,1976,706,60.0,44.0,...,,,,,,,,,,
24996,45,10,97,199,9,3,1454,471,650.0,58.0,...,,,,,,,,,,
24997,100,109,2177,31,440,236,14,3,8.0,1.0,...,,,,,,,,,,
24998,11,17,6,32,5,19,228,10,188.0,262.0,...,,,,,,,,,,


pad_sequences用于将序列长度标准化。在这里，我们将训练和测试数据集中的所有序列长度都标准化为512。如果一个序列的长度小于512，则在开头用0填充。如果一个序列的长度大于512，则截断到512个单词。函数返回的是一个numpy数组，它的形状是(样本数量, maxlen)，其中每个样本的长度都是maxlen。上述代码将训练和测试数据集的每个样本都用0填充或截断到长度为512，然后将其保存到x_train_pad和x_test_pad中，最后通过pd.DataFrame将它们转换为DataFrame进行查看。

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)#其中每个样本的长度都是maxlen
x_test_pad = pad_sequences(x_test_seq, maxlen=512)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,5,1,372,1706,2,190,1,233,1370,341
1,0,0,0,0,0,0,0,0,0,0,...,37,543,108,405,34,25,293,9,487,27
2,0,0,0,0,0,0,0,0,0,0,...,2,96,487,178,5,398,9,742,772,2095
3,0,0,0,0,0,0,0,0,0,0,...,48,56,6,2246,4,900,76,142,5,12
4,0,0,0,0,0,0,0,0,0,0,...,24,2547,1,1605,624,124,1,2863,49,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,32,4,1199,250,8,1,223,297,238,36
24996,0,0,0,0,0,0,0,0,0,0,...,21,167,5,78,70,452,141,27,443,302
24997,0,0,0,0,0,0,0,0,0,0,...,344,4,805,119,94,69,1822,287,690,979
24998,0,0,0,0,0,0,0,0,0,0,...,43,47,12,188,76,2,33,120,11,592


In [10]:
#这段代码是将训练集和测试集的情感标签转化为NumPy数组，以便后续模型的训练和评估。
#分别获取了训练集和测试集中的情感标签列，然后通过np.array()函数将它们转化为NumPy数组，并分别赋值给了y_train和y_test变量。
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])
     

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
c = [
    ModelCheckpoint("imdb.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad,
     y_train,
     batch_size=100,
     epochs=40,
     validation_split=0.1,
     callbacks=c)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40


<keras.callbacks.History at 0x7f114dfeaad0>

In [12]:
model.evaluate(x_test_pad, y_test)



[0.30931827425956726, 0.8746799826622009]

In [13]:
review = "It's barely even a movie, given it functions mostly as a big, long ad for Nintendo products (Mario games, in this case). It has that lovely Illumination\u2122 style of humor throughout it all, the stunning shallow writing we've all grown to love from them, and the immaculate character development we certainly expect to be blown away by. It's awful, really. There are so many plot points that make zero sense and function merely as a way of getting the film moving without really adding anything to the story, it all feels infinitely pointless and hollow.  At least the world they built is pretty, that's the one positive."#@param {type:"string"}
seq = tok.texts_to_sequences([review])
pad = pad_sequences(seq, maxlen=512)
pre = model.predict(pad)
prob = pre[0]
trans = ["neg", "pos"]
for p, label in zip(prob, trans):
    print(label, "的機率是:", p)

neg 的機率是: 0.8148325
pos 的機率是: 0.18516748
