In [61]:
import numpy as np
import os
import sys
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation

In [62]:
# 1. Data preprocessing
BASE_DIR = 'E:/PycharmProjects/Tensorflow-stater'
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000  # maximum number of tokens kept per document
MAX_NB_WORDS = 20000  # maximum number of unique words considered across all texts
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
batch_size = 32

# 1.1 Load the GloVe vectors stored on disk into an in-memory dictionary.
embeddings_index = {}  # word -> embedding vector (float32 array)
# `with` guarantees the file is closed even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # A blank line has no word token; skip it instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [63]:
# print(embeddings_index['the'])

In [64]:
# 2. Prepare the texts and their labels
texts = []  # raw document contents, one string per file
labels_index = {}  # label name -> integer id
labels = []  # integer label id of each document, aligned with `texts`
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    # Each sub-directory is one class; ids are assigned in sorted directory order.
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        # Only files whose names consist solely of digits are read.
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # latin-1 maps every byte to a code point, so decoding cannot fail.
        # The former Python 2 branch was dropped: the notebook already calls
        # open(..., encoding=...) elsewhere, which only exists on Python 3.
        with open(fpath, encoding='latin-1') as f:
            texts.append(f.read())
        labels.append(label_id)
print('Found %s texts.' % len(texts))

Found 19997 texts.


In [65]:
# labels_index['alt.atheism']

In [66]:
# labels[:10]

In [67]:
# len(texts[2])

In [69]:
# print(texts[2])

In [70]:
# 2. Tokenize and build the vocabulary
# 2.1 Turn every document into a sequence of integer word ids.
# The tokenizer is configured with num_words=MAX_NB_WORDS and fitted on the full corpus.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# word -> id dictionary learned by the tokenizer
word_index = tokenizer.word_index

# Convert the 1D list of texts into a 2D list of word-id sequences.
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

Found 214873 unique tokens.


In [None]:
# Size of the tokenizer's word -> id dictionary.
len(word_index)

In [71]:
# word_index['newsgroups']

In [72]:
# sequences[2][:20]
# Number of word ids in the first document's sequence (before padding/truncation).
len(sequences[0])

1655

In [73]:
# 3. Build the training and validation data sets
# Convert the integer class ids into one-hot rows.
label_ids = np.asarray(labels)
labels = to_categorical(label_ids)
# Bring every document's id sequence to the same length (MAX_SEQUENCE_LENGTH).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [89]:
# Inspect the one-hot label matrix.
print(labels)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [75]:
# Split the data into a training set and a validation set.
# The row order is shuffled with a dedicated, fixed-seed generator so that the
# split is identical after every kernel restart and the global NumPy random
# state is left untouched.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)  # in-place permutation

[    0     1     2 ... 19994 19995 19996]
[ 7890 12566 12452 ... 13801 13485  1910]


In [77]:
# Apply the same permutation to samples and labels so rows stay aligned.
data, labels = data[indices], labels[indices]
# Number of validation rows, derived from the validation fraction.
nb_validation_samples = int(len(data) * VALIDATION_SPLIT)

In [82]:
# Training split: every row except the trailing `nb_validation_samples` rows.
# An explicit end index is used instead of `[:-n]`, because `[:-0]` is an empty
# slice and would silently produce an empty training set when n == 0.
train_end = data.shape[0] - nb_validation_samples
x_train = data[:train_end]
y_train = labels[:train_end]

In [85]:
# Dimensions of the training sample array.
x_train.shape

(15998, 1000)

In [86]:
# Dimensions of the training label array.
y_train.shape

(15998, 20)

In [87]:
# Validation split: the trailing `nb_validation_samples` rows.
# An explicit start index is used instead of `[-n:]`, because `[-0:]` selects
# the whole array and would leak all data into validation when n == 0.
val_start = data.shape[0] - nb_validation_samples
x_val = data[val_start:]
y_val = labels[val_start:]

In [88]:
# Progress message for the embedding-matrix stage.
print('Preparing embedding matrix')

Preparing embedding matrix


In [93]:
# Length of a single row of the padded `data` array.
len(data[2])

1000

In [122]:
# data[2]

In [125]:
# 4. Build the embedding matrix (word id -> vector lookup table)
# Use the smaller of the vocabulary cap and the tokenizer's vocabulary size.
nb_words = min(len(word_index), MAX_NB_WORDS)
# len(word_index)  # 214873

214873

In [129]:
# One row per word id (plus row 0), one column per embedding dimension.
# Rows start as zeros and are overwritten only for words found in GloVe.
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

for token, token_id in word_index.items():
    # Ids above the vocabulary cap get no row; words absent from the GloVe
    # dictionary keep their all-zero row.
    vector = embeddings_index.get(token) if token_id <= MAX_NB_WORDS else None
    if vector is not None:
        embedding_matrix[token_id] = vector
print(embedding_matrix.shape)

(20001, 100)


In [156]:
from tensorflow.keras.layers import Lambda

In [157]:
import tensorflow.keras.backend as K

In [None]:
# The Keras optimizer classes live in the `optimizers` (plural) module.
from tensorflow.keras.optimizers import Adam

In [160]:
# 5. Build the LSTM model
# Embedding layer initialised from the prepared embedding matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
print('Build model...')
model = Sequential([
    embedding_layer,
    # return_sequences=True keeps the LSTM output for every timestep.
    LSTM(100, dropout=0.2, return_sequences=True),
    # Average the per-timestep outputs over axis 1.
    Lambda(lambda x: K.mean(x, 1)),
    Dense(100, activation='sigmoid'),
    # One softmax output unit per label.
    Dense(len(labels_index), activation='softmax'),
])

Build model...


In [161]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1000, 100)         2000100   
_________________________________________________________________
lstm_5 (LSTM)                (None, 1000, 100)         80400     
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_11 (Dense)             (None, 20)                2020      
Total params: 2,092,620
Trainable params: 92,520
Non-trainable params: 2,000,100
_________________________________________________________________


In [162]:
# 6. Train the LSTM model
# 6.1 Compile: categorical cross-entropy matches the one-hot label rows.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# 6.2 Fit. `epochs` replaces the legacy Keras 1 keyword `nb_epoch`.
model.fit(x_train, y_train, batch_size=128, epochs=5,
          validation_data=(x_val, y_val))
# Evaluate on the held-out validation split (the method name was misspelled
# as `evalueate`, which raises AttributeError).
# NOTE(review): the values are printed as "Test" results although they are
# computed on the validation data; no separate test set is built in this notebook.
score, acc = model.evaluate(x_val, y_val, batch_size=256)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 15998 samples, validate on 3999 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
 2304/15998 [===>..........................] - ETA: 10:05 - loss: 2.5418 - acc: 0.1211

KeyboardInterrupt: 