In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model,Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,GRU,Conv1D, MaxPooling1D,GlobalMaxPooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

import jieba
import jieba.posseg as pseg

from keras.layers.wrappers import Bidirectional

# Step 0. Loading dataset

#### Step 0.1 load article cutted and article df and define y

In [208]:
df = pd.read_csv('../data/1999_preprocessed.csv')
df['type'] = pd.factorize(df['type'])[0] + 1
#diff_threshold = 20
#df = df[abs(df['push']-df['boo']) > diff_threshold].copy()
#df['type'] = np.clip(df['push']-df['boo'], 0, 1)
df.head()

Unnamed: 0,id,content,type,date,idx
0,1800000321,無法透過 CAMP 上SIT 系統 icon 的連接登入. 可否協助看一下這問題,1,20180213,0
1,1800000321,無法透過 CAMP 上SIT 系統 icon 的連接登入. 可否協助看一下這問題,1,20180213,1
2,1600001420,收到CAMP 簽核表單，開頭CAF_QRDC，不清楚這是什麼簽核表單,1,20160604,2
3,1600001420,收到CAMP 簽核表單，開頭CAF_QRDC，不清楚這是什麼簽核表單,1,20160604,3
4,1500003557,"無法會員登入,顯示訊息:無效的帳號或使用者不存在。帳號:96122401",1,20151228,4


In [209]:
# 資料打散
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,id,content,type,date,idx
0,APP0038374,user無法login.,7,20110426,4277
1,APP0019359,1、因现有报废程式内有公司自制半成品99*阶及2*、3*阶料号的Docking及小板可以直接...,3,20080307,1418
2,APP0020747,登入SAP VD02 KEY IN 問題,7,20080617,356
3,APP0000935,"user欲從QRDCFS02上下載SAP檔案,但系統顯示此資料夾其權限不足,煩請Sworm協...",7,20050815,5863
4,APP0001552,"User 反應 218端BOM 用量不對,料號3123MMB00B5,請速連繫# 3158",4,20050920,200


#### Step 0.2 create word id mapping and word vector

In [210]:
X = df.content
Y = df.type
le = LabelEncoder()  
Y = le.fit_transform(Y) # 這邊Y.shape = (84020, )
Y = Y.reshape(-1,1) # 將Y的shape轉換成： Y.shape= (84020, 1)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

# label 做 onehot
y_one_hot = np_utils.to_categorical(Y)
print (y_one_hot[0])
# 總共類別數
num_classes = len((y_one_hot[0]))
print ('num_classes = {}'.format(num_classes))

[0. 0. 0. 0. 0. 0. 1.]
num_classes = 7


In [211]:
# 標點符號過濾
WORD_FILTERS = '!"#$&()*+,-./:;<=>?@[\\]^_{|}~\t\n'
# 字典數量
NUM_WORDS = 100
# 向量長度
MAX_LEN = 100

# 將訓練資料的單字轉成向量
#total
tokenizer_obj= Tokenizer()
total_reviews= X_train + X_test
#tokenizer_obj.fit_on_texts(total_reviews)
#max_len=max([len(s.split()) for s in total_reviews])
#vocab_size=len(tokenizer_obj.word_index)+1

In [213]:
## set dictionary (can define yourself)
jieba.set_dictionary('../NLP/jieba/dict.txt.big')
stop_words = open('../NLP/jieba/stop_words.txt', encoding="utf-8").read().splitlines()


Exception: jieba: file does not exist: /home/jovyan/Davis_Practice/NLP/part4/NLP/jieba/dict.txt.big

In [198]:
# 將訓練資料的單字轉成向量
#X_train
tokenizer = Tokenizer(filters=WORD_FILTERS,
                        num_words=NUM_WORDS,
                        split=",")
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 8981 unique tokens.


In [199]:
# 將訓練字句截長補短
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len) # 確保所有序列具有相同的形狀

In [200]:
print (X_train.shape)

(5139,)


In [201]:
print (Y_train.shape)

(5139, 1)


In [202]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(279,name='out_layer')(layer) # 最後一層參數值為類別數
    layer = Activation('softmax')(layer) #二元分類是'sigmoid'，多元分類上修改為'softmax'
    model = Model(inputs=inputs,outputs=layer)
    return model

In [203]:
def RNN_GRU():
    sequence_input = Input(shape=(max_len,), dtype='int32')
    embedded_sequences = Embedding(max_words,50,input_length=max_len)(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(num_classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [204]:
def build_model(num_classes, vocab_size, sentence_max_len):
    print(num_classes)
    print(vocab_size)
    print(sentence_max_len)
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=256, 
                        input_length=sentence_max_len))
    model.add(Bidirectional(LSTM(128,implementation=2)))
#     model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()
    return model



In [205]:
#model = RNN()
#model.summary()
#model.compile(loss = 'sparse_categorical_crossentropy', optimizer=RMSprop(),metrics = ['accuracy'])


model = build_model(num_classes, NUM_WORDS, MAX_LEN)
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer='Adam',metrics = ['accuracy'])



# train model
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=100,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])


7
100
100
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 256)          25600     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_10 (Dense)             (None, 7)                 1799      
Total params: 421,639
Trainable params: 421,639
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 256)          25600     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               394240    
____________________________________________________________

ValueError: Error when checking input: expected embedding_13_input to have shape (100,) but got array with shape (1000,)

### Time to train the network

In [17]:
# 處理測試集資料
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
# 使用測試集來進行模型評估
accr = model.evaluate(test_sequences_matrix,Y_test)

print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


Test set
  Loss: 1.275
  Accuracy: 0.551
