In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

### Step 0. Loading dataset

#### Step 0.1 load article cutted and article df and define y

In [6]:
df = pd.read_csv('../data/1999_preprocessed.csv')
df['type'] = pd.factorize(df['type'])[0] + 1
#diff_threshold = 20
#df = df[abs(df['push']-df['boo']) > diff_threshold].copy()
#df['type'] = np.clip(df['push']-df['boo'], 0, 1)
df.head()


Unnamed: 0,id,content,type,date,idx
0,1800000321,無法透過 CAMP 上SIT 系統 icon 的連接登入. 可否協助看一下這問題,1,20180213,0
1,1800000321,無法透過 CAMP 上SIT 系統 icon 的連接登入. 可否協助看一下這問題,1,20180213,1
2,1600001420,收到CAMP 簽核表單，開頭CAF_QRDC，不清楚這是什麼簽核表單,1,20160604,2
3,1600001420,收到CAMP 簽核表單，開頭CAF_QRDC，不清楚這是什麼簽核表單,1,20160604,3
4,1500003557,"無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401",1,20151228,4


#### Step 0.2 create word id mapping and word vector

In [7]:
X = df.content
Y = df.type
le = LabelEncoder()  
Y = le.fit_transform(Y) # 這邊Y.shape = (84020, )
Y = Y.reshape(-1,1) # 將Y的shape轉換成： Y.shape= (84020, 1)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

In [13]:
max_words = 1000
max_len = 150  # 自行設置，本次文本長度雖然較短，但我們先不調整大小，直接跑跑看
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len) # 確保所有序列具有相同的形狀

In [26]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [29]:
word_index = tok.word_index
print('Found %s unique tokens.' % len(word_index))
# 轉成 Embedding 層的 input vector
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# 載入預訓模型，trainable = False 表示不重新計算
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


Found 10696 unique tokens.


NameError: name 'embeddings_index' is not defined

In [20]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(279,name='out_layer')(layer) # 最後一層參數值為類別數
    layer = Activation('softmax')(layer) #二元分類是'sigmoid'，多元分類上修改為'softmax'
    model = Model(inputs=inputs,outputs=layer)
    return model

In [22]:
model = RNN()
model.summary()
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=RMSprop(),metrics = ['accuracy'])

# train model
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=60,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_7 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 279)               71703     
__________

<keras.callbacks.History at 0x7f82c87b3be0>

### Time to train the network

In [24]:
# 處理測試集資料
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
# 使用測試集來進行模型評估
accr = model.evaluate(test_sequences_matrix,Y_test)

print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


Test set
  Loss: 1.498
  Accuracy: 0.501


In [25]:
model.predict(x=test_sequences_matrix)

array([[8.8203680e-03, 1.8507689e-03, 5.2329324e-02, ..., 1.9201932e-06,
        8.2414562e-07, 3.8573737e-07],
       [1.0795989e-02, 2.4887326e-03, 6.2162895e-02, ..., 3.5638093e-06,
        1.6331829e-06, 7.9870614e-07],
       [2.0646336e-02, 6.6088624e-03, 1.1201778e-01, ..., 2.9139997e-05,
        1.6787224e-05, 9.5554251e-06],
       ...,
       [9.0597160e-03, 1.9410875e-03, 5.3690419e-02, ..., 2.1251080e-06,
        9.2254027e-07, 4.3289384e-07],
       [8.8203680e-03, 1.8507689e-03, 5.2329324e-02, ..., 1.9201932e-06,
        8.2414562e-07, 3.8573737e-07],
       [9.0597160e-03, 1.9410875e-03, 5.3690419e-02, ..., 2.1251080e-06,
        9.2254027e-07, 4.3289384e-07]], dtype=float32)