# In [None]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import numpy as np
import re
# Matches a single markup tag: '<', one or more non-'>' characters, then '>'.
re_tag = re.compile(r'<[^>]+>')


def rm_tags(text):
    """Return ``text`` with every ``<...>`` tag matched by ``re_tag`` removed."""
    without_tags = re_tag.sub('', text)
    return without_tags
 
import os
def read_files(filetype, path="C:/aclImdb/"):
    """Load one split of the aclImdb review corpus from disk.

    Args:
        filetype: Name of the split sub-directory, e.g. ``"train"`` or
            ``"test"``.
        path: Root directory of the extracted data set. It must contain
            ``<filetype>/pos`` and ``<filetype>/neg``.

    Returns:
        A ``(labels, texts)`` tuple of NumPy arrays of equal length.
        Entries from ``pos`` come first with label 1, followed by entries
        from ``neg`` with label 0. Each text is the file's lines joined
        with a space and passed through ``rm_tags``.
    """
    split_dir = os.path.join(path, filetype)

    positive_dir = os.path.join(split_dir, "pos")
    positive_files = [os.path.join(positive_dir, f)
                      for f in os.listdir(positive_dir)]

    negative_dir = os.path.join(split_dir, "neg")
    negative_files = [os.path.join(negative_dir, f)
                      for f in os.listdir(negative_dir)]

    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))

    # Build the labels from the number of files actually found so labels and
    # texts always line up, instead of assuming 12500 files per class.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts.append(rm_tags(" ".join(file_input.readlines())))

    return np.array(all_labels), np.array(all_texts)
 
# Load the labelled reviews for both splits.
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

# The vocabulary is fitted on the training texts only; num_words caps how
# many of the most frequent words are kept when converting to sequences.
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

# Turn every review into a list of word indices ...
x_train_seq, x_test_seq = (
    token.texts_to_sequences(texts) for texts in (train_text, test_text)
)
# ... then pad/truncate each list to exactly 380 entries.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=380)
    for seqs in (x_train_seq, x_test_seq)
)
 
##### Build the model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers import LSTM
 
# Layer stack, in order:
#   Embedding : word index -> 32-dim vector; input_dim=3800 and
#               input_length=380 match the tokenizer vocabulary size and the
#               padded sequence length used when preparing the data.
#   Dropout   : 0.35
#   LSTM      : 16 units
#   Dense     : 256 units, ReLU
#   Dropout   : 0.35
#   Dense     : 1 unit, sigmoid (a single value in [0, 1] per review)
model = Sequential([
    Embedding(output_dim=32, input_dim=3800, input_length=380),
    Dropout(0.35),
    LSTM(units=16),
    Dense(units=256, activation='relu'),
    Dropout(0.35),
    Dense(units=1, activation='sigmoid'),
])

model.summary()
 
##### Train the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras carves the validation split off the *end* of the arrays, before any
# shuffling. read_files() returns every positive review first and every
# negative review last, so without shuffling here the 20% validation set
# would contain negative reviews only. A fixed seed keeps runs comparable.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))

train_history = model.fit(
    x_train[shuffle_idx],
    y_train[shuffle_idx],
    batch_size=100,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)
 
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    """Plot two recorded training curves against the epoch number.

    Args:
        train_history: Object whose ``history`` mapping holds one list of
            values per metric name (as returned by ``model.fit``).
        train: Key of the training curve; also used as the y-axis label.
        validation: Key of the validation curve.
    """
    recorded = train_history.history
    for curve_key in (train, validation):
        plt.plot(recorded[curve_key])

    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
 
# Plot training vs. validation curves for accuracy and for loss.
show_train_history(train_history,'accuracy','val_accuracy')
show_train_history(train_history,'loss','val_loss')
 
##### Evaluate the model's accuracy
# The model is compiled with a single 'accuracy' metric, so evaluate()
# returns [loss, accuracy].
scores = model.evaluate(x_test, y_test, verbose=1)
# A bare `scores[1]` is only displayed as the last expression of a notebook
# cell; print it so the accuracy is shown when run as a script too.
print('test accuracy:', scores[1])
 
##### Predict probabilities
# One sigmoid output per test review.
probility=model.predict(x_test)
print(probility[:10])
 
# Peek at ten rows starting at index 12500.
# NOTE(review): presumably the first negative reviews, since read_files()
# lists the positive ones first -- this assumes 12500 positive test files.
for p in probility[12500:12510]:
    print(p)
 
##### Predict classes
# Sequential.predict_classes() was removed from Keras (TensorFlow 2.6+).
# For a single sigmoid output it was a 0.5 threshold on predict(), so apply
# that threshold to the probabilities already computed above.
predict = (probility > 0.5).astype('int32')
 
print(predict[:10])
print(predict.shape)
# Flatten (n, 1) -> (n,) without hard-coding the number of test samples.
predict_classes=predict.reshape(-1)
print(predict_classes)
 
##### Inspect individual predictions
# Class index -> display string ('正面的' = positive, '负面的' = negative).
SentimentDict = {
    1: '正面的',
    0: '负面的',
}


def display_test_Sentiment(i):
    """Print test review ``i`` followed by its true and predicted labels.

    Reads the module-level ``test_text``, ``y_test`` and ``predict_classes``.
    """
    print(test_text[i])
    actual = SentimentDict[y_test[i]]
    predicted = SentimentDict[predict_classes[i]]
    print('label真实值:', actual, '预测结果:', predicted)
 
# Show two reviews from the start of the test set.
display_test_Sentiment(2)
 
display_test_Sentiment(3)
# Predicted classes for ten rows starting at index 12500. Printed explicitly:
# a bare expression is only displayed as the last statement of a notebook cell.
print(predict_classes[12500:12510])
display_test_Sentiment(12502)
'''
Note: everything below this line is the program's output (this line itself is not part of it)
First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by profession, not annoying ugly brain dead rappers.Stay away from this crap and watch 48 hours 1 and 2 instead. At lest they have characters you care about, a sense of humor and nothing but real actors in the cast.
label真实值: 负面的 预测结果: 负面的
'''
 
# Predict the sentiment of a new, unseen review
input_text='''
I can't vote because I have not watched this movie yet. I've been wanting to watch this movie since the time they announced making it which is about 2 years ago (!)
I was planning to go with the family to see the anticipated movie but my nieces had school exams at the opening time so we all decided to wait for the next weekend. I was utterly shocked to learn yesterday that they pulled the movie from the Kuwaiti theaters "temporarily" so that the outrageous censorship system can remove some unwanted scenes.
The controversial gay "moment" according to my online research is barely there, so I can't find any logical reason for all the fuss that's been going on. And it was bad enough when fanatics and haters tried (in vain) to kill the movie with low ratings and negative reviews even before it was in the cinemas and I'm pretty sure most of those trolls never got the chance to watch the movie at that time.
Based on the trailers, I think the movie is very promising and entertaining and you can't simply overlook the tremendous efforts made to bring this beloved tale to life. To knock down hundreds of people's obvious hard work with unprofessional critique and negative reviews just for the sake of hatred is unfathomable. I hope people won't judge a movie before having the experience of watching it in the first place.
Impatiently waiting for the Kuwaiti cinemas to bring back the movie... 
'''
# Apply the same preprocessing as for the training data: the fitted
# tokenizer, then padding/truncating to 380 entries.
input_seq = token.texts_to_sequences([input_text])
pad_input_seq  = sequence.pad_sequences(input_seq , maxlen=380)
# Sequential.predict_classes() was removed from Keras (TensorFlow 2.6+);
# threshold the sigmoid output at 0.5 instead. The result keeps shape (1, 1).
predict_result = (model.predict(pad_input_seq) > 0.5).astype('int32')
# Print the label: a bare expression is only displayed as the last statement
# of a notebook cell.
print(SentimentDict[predict_result[0][0]])
'''
'负面的'
'''