In [6]:
import os
import pickle

import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense, Embedding,Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset.
# The 'evaluation' column holds the input text (feature); the 'label' column holds the class (target).
def load_data(filepath,input_shape=20):
    """Read the labelled corpus and encode it for a character-level classifier.

    The CSV at ``filepath`` is read with GBK encoding and must provide an
    ``evaluation`` column (text) and a ``label`` column (class). Every distinct
    character occurring in ``evaluation`` gets an integer index starting at 1;
    index 0 is kept free because it is used as the padding value.

    Side effects:
        Writes the character->index mapping to ``word_dict.pk`` and the
        label->index mapping to ``label_dict.pk`` in the current working
        directory (overwriting existing files).

    Args:
        filepath: Path of the CSV file to read.
        input_shape: Length every encoded sentence is padded (with trailing
            zeros) or cut to by ``pad_sequences``.

    Returns:
        A tuple ``(x, y, output_dictionary, vocab_size, label_size,
        inverse_word_dictionary)`` where

        * ``x`` holds one row of character indices per sentence, each of
          length ``input_shape``;
        * ``y`` holds the matching one-hot encoded labels, one row per sentence
          and ``label_size`` columns;
        * ``output_dictionary`` maps label index -> original label;
        * ``vocab_size`` is the number of distinct characters;
        * ``label_size`` is the number of distinct labels;
        * ``inverse_word_dictionary`` maps character index -> character.
    """
    df=pd.read_csv(filepath,encoding='gbk')

    # A row without text or without a label cannot be encoded (the character
    # concatenation / label lookup below would fail on NaN), so drop it.
    df=df.dropna(subset=['evaluation','label'])
    texts=df['evaluation'].astype(str)

    # Labels are indexed in order of first appearance in the file.
    labels=list(df['label'].unique())

    # Character vocabulary. Sorting makes the character->index mapping identical
    # on every run; iterating a bare set of strings yields a different order
    # after each interpreter restart, which silently invalidates a previously
    # saved word_dict.pk / model pair.
    vocabulary=sorted(set(''.join(texts.unique())))

    word_dictionary={char:idx for idx,char in enumerate(vocabulary,start=1)}
    inverse_word_dictionary={idx:char for char,idx in word_dictionary.items()}
    label_dictionary={label:idx for idx,label in enumerate(labels)}
    output_dictionary={idx:label for label,idx in label_dictionary.items()}

    # Persist both mappings so they can be reloaded together with a saved model.
    with open('word_dict.pk','wb') as f:
        pickle.dump(word_dictionary,f)
    with open('label_dict.pk','wb') as f:
        pickle.dump(label_dictionary,f)

    # vocabulary size
    vocab_size=len(word_dictionary)
    # number of labels
    label_size=len(label_dictionary)

    # Encode each sentence character by character, then pad with trailing zeros.
    # NOTE(review): sentences longer than input_shape are shortened using
    # pad_sequences' default truncation mode -- confirm that is the intended side.
    x=[[word_dictionary[char] for char in sent] for sent in texts]
    x=pad_sequences(maxlen=input_shape,sequences=x,padding='post',value=0)

    # One-hot encode all labels in a single call instead of row by row.
    label_ids=[label_dictionary[label] for label in df['label']]
    y=np.asarray(to_categorical(label_ids,num_classes=label_size))

    return x,y,output_dictionary,vocab_size,label_size,inverse_word_dictionary




In [10]:
def create_LSTM(n_units,input_shape,output_dim,filepath):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense classifier.

    The dataset at ``filepath`` is loaded only to obtain the vocabulary size and
    the number of labels, which determine the embedding table size and the
    width of the softmax output.

    Side effects:
        Calls ``load_data`` (which rewrites ``word_dict.pk`` / ``label_dict.pk``),
        tries to write an architecture diagram to ``model.png`` and prints the
        model summary.

    Args:
        n_units: Number of units of the LSTM layer.
        input_shape: Length of the padded input sequences.
        output_dim: Dimension of the character embedding vectors.
        filepath: Path of the CSV file passed on to ``load_data``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # Only the sizes are needed here. The sequence length is passed through so
    # the data is prepared with the same length the model is built for, rather
    # than with load_data's default.
    _,_,_,vocab_size,label_size,_=load_data(filepath,input_shape)

    model=Sequential()
    # +1 because character indices start at 1 and index 0 is the padding value.
    model.add(Embedding(input_dim=vocab_size+1,output_dim=output_dim,
                        input_length=input_shape,mask_zero=True))
    # The LSTM takes its input shape from the Embedding layer before it. The
    # previous explicit input_shape=(n_samples, padded_length) described the
    # dataset rather than one sample and does not belong on this layer.
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(Dense(label_size,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    # The diagram is a convenience only: plot_model needs pydot and graphviz,
    # and their absence must not prevent the model from being built.
    try:
        plot_model(model, show_dtype=True,
                   show_layer_names=True, show_shapes=True,
                   to_file='model.png')
    except ImportError as err:
        print('plot_model skipped: %s'%err)
    model.summary()

    return model



In [8]:

def model_train(input_shape,filepath,model_save_path):
    """Train the LSTM classifier, save it and print its hold-out accuracy.

    The data is split 80/20 with a fixed random seed, the model is trained on
    the larger part, saved to ``model_save_path`` and then scored on the
    remaining part.

    Args:
        input_shape: Length the input sequences are padded to.
        filepath: Path of the CSV file with the labelled corpus.
        model_save_path: Destination passed to ``model.save``.

    Returns:
        None. The accuracy on the test split is printed.
    """
    x,y,output_dictionary,_,_,_=load_data(filepath,input_shape)
    train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.2,random_state=42)

    # hyper-parameters
    n_units=100
    batch_size=32
    epochs=5
    output_dim=20

    # train
    lstm_model=create_LSTM(n_units,input_shape,output_dim,filepath)
    lstm_model.fit(train_x,train_y,epochs=epochs,batch_size=batch_size,verbose=1)

    # save
    lstm_model.save(model_save_path)

    # Score the whole test split with one batched predict() call; calling
    # predict() once per sample pays the per-call overhead for every test sample.
    probabilities=lstm_model.predict(test_x,batch_size=batch_size)
    predict=[output_dictionary[idx] for idx in np.argmax(probabilities,axis=1)]
    label=[output_dictionary[idx] for idx in np.argmax(test_y,axis=1)]

    # accuracy_score expects (y_true, y_pred).
    acc=accuracy_score(label,predict)
    print('accuarcy:%s'%acc)



In [11]:
if __name__=='__main__':
    filepath='weibo_statement.csv'
    input_shape=180
    # Save under the current user's home directory instead of a path that only
    # exists on one machine, and create the folder up front so a missing
    # directory is detected before training rather than after it.
    model_dir=os.path.join(os.path.expanduser('~'),'Desktop','weibo_sentiment')
    os.makedirs(model_dir,exist_ok=True)
    model_save_path=os.path.join(model_dir,'corpus_model.h5')
    model_train(input_shape,filepath,model_save_path)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 180, 20)           118600    
                                                                 
 lstm_1 (LSTM)               (None, 100)               48400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 167202 (653.13 KB)
Trainable params: 167202 (653.13 KB)
Non-trainable params: 0 (0.00 Byte)
_______________________________________________________________

  saving_api.save_model(


accuarcy:0.983
