In [1]:
#感情分析LSTMを使用
import numpy as np
import tensorflow as tf

In [2]:
# Read the whole review corpus into memory as a single string.
with open('./reviews.txt', mode='r') as reviews_file:
    reviews = reviews_file.read()

In [3]:
# Preview the first 200 characters of the raw review text.
reviews[:200]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  '

In [4]:
# Read the newline-separated sentiment labels into memory as a single string.
with open('./labels.txt', mode='r') as labels_file:
    labels = labels_file.read()

In [5]:
# Preview the first 200 characters of the raw label text.
labels[:200]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npo'

In [6]:
from string import punctuation
# Remove every ASCII punctuation character from the raw text in one pass.
# str.translate with a deletion table yields the same string as filtering
# character by character, without a Python-level loop over the whole corpus.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# One review per line: split the cleaned text on newlines.
# NOTE: this rebinds `reviews` from a str to a list of str, so the cell is not
# idempotent. Re-running it without re-reading reviews.txt now fails with an
# AttributeError instead of silently producing corrupted data.
reviews = all_text.split('\n')

# Re-join the reviews with single spaces (dropping the newlines) and split on
# whitespace to get the flat list of every word in the corpus.
all_text = ' '.join(reviews)
words = all_text.split()

In [7]:
# First five characters of the punctuation-free, space-joined text.
all_text[:5]

'bromw'

In [8]:
# First two reviews after punctuation removal and splitting on newlines.
reviews[:2]

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ',
 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  viol

In [9]:
# First five words of the flattened corpus.
words[:5]

['bromwell', 'high', 'is', 'a', 'cartoon']

In [10]:
from collections import Counter
# Tally how often each word occurs in the corpus.
counts = Counter(words)
# Vocabulary ordered from the most to the least frequent word; the sort is
# stable, so words with equal counts keep their first-seen order.
word_counts_desc = sorted(counts.items(), key=lambda item: item[1], reverse=True)
vocab = [word for word, _ in word_counts_desc]

In [11]:
# Map each word to an integer id equal to its frequency rank, starting at 1
# so that 0 is never assigned to a word (0 is the fill value of `features`).
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [12]:
# Encode every review (one per line) as the list of integer ids of its words,
# so that the text can be handled numerically from here on.
reviews_int = [
    [vocab_to_int[token] for token in review.split()]
    for review in reviews
]

In [13]:
#ラベルをベクトルにする

In [14]:
# Convert the newline-separated label text into a 0/1 vector:
# 'positive' -> 1, anything else -> 0.
labels = np.array([int(label == 'positive') for label in labels.split('\n')])

In [15]:
# Histogram of review lengths: number of words -> number of reviews.
review_lens = Counter(map(len, reviews_int))

In [16]:
# Number of reviews that contain zero words.
review_lens[0]

1

In [17]:
# The first review, encoded as integer word ids.
reviews_int[:1]

[[21025,
  308,
  6,
  3,
  1050,
  207,
  8,
  2138,
  32,
  1,
  171,
  57,
  15,
  49,
  81,
  5785,
  44,
  382,
  110,
  140,
  15,
  5194,
  60,
  154,
  9,
  1,
  4975,
  5852,
  475,
  71,
  5,
  260,
  12,
  21025,
  308,
  13,
  1978,
  6,
  74,
  2395,
  5,
  613,
  73,
  6,
  5194,
  1,
  24103,
  5,
  1983,
  10166,
  1,
  5786,
  1499,
  36,
  51,
  66,
  204,
  145,
  67,
  1199,
  5194,
  19869,
  1,
  37442,
  4,
  1,
  221,
  883,
  31,
  2988,
  71,
  4,
  1,
  5787,
  10,
  686,
  2,
  67,
  1499,
  54,
  10,
  216,
  1,
  383,
  9,
  62,
  3,
  1406,
  3686,
  783,
  5,
  3483,
  180,
  1,
  382,
  10,
  1212,
  13583,
  32,
  308,
  3,
  349,
  341,
  2913,
  10,
  143,
  127,
  5,
  7690,
  30,
  4,
  129,
  5194,
  1406,
  2326,
  5,
  21025,
  308,
  10,
  528,
  12,
  109,
  1448,
  4,
  60,
  543,
  102,
  12,
  21025,
  308,
  6,
  227,
  4146,
  48,
  3,
  2211,
  12,
  8,
  215,
  23]]

In [18]:
# Positions of the reviews that contain at least one word.
non_zero_idx = [idx for idx, review in enumerate(reviews_int) if review]

In [19]:
# Number of non-empty reviews.
len(non_zero_idx)

25000

In [20]:
# The last encoded review (before empty reviews are filtered out).
reviews_int[-1]

[]

In [21]:
# Keep only the non-empty reviews and the labels at the same positions, so
# that reviews and labels stay aligned.
reviews_int = [reviews_int[idx] for idx in non_zero_idx]
# `labels` is a NumPy array, so select the rows with integer-array indexing
# instead of a Python loop; this also preserves the dtype when nothing is
# selected.
labels = labels[non_zero_idx]

In [22]:
# Every review is represented by exactly `seq_len` word ids.
seq_len = 200
# Zero-filled integer feature matrix with one row per review.
features = np.zeros(shape=(len(reviews_int), seq_len), dtype=int)

In [23]:
# Fill each row of `features`: reviews longer than seq_len keep only their
# first seq_len ids, shorter ones are right-aligned so the leading columns
# stay zero.
for i, row in enumerate(reviews_int):
    tokens = row[:seq_len]
    # Derive the start column from the truncated length. A negative slice
    # start (`-len(row)`) would select the whole row for an empty review
    # (`-0`) and fail to broadcast; this form leaves such a row all zeros.
    features[i, seq_len - len(tokens):] = tokens

In [24]:
#features[:10,:100]

In [25]:
#データを訓練用、検証用に分割

In [26]:
# Fraction of the rows used for training; the remainder is held out.
split_flac = 0.8
# Row index at which the held-out portion begins.
split_idx = int(features.shape[0] * split_flac)

In [27]:
# The first split_idx rows form the training set; the remaining rows are
# held out (they are split into validation and test sets afterwards).
train_x = features[:split_idx]
val_x = features[split_idx:]
train_y = labels[:split_idx]
val_y = labels[split_idx:]

In [28]:
# Midpoint of the held-out rows.
test_idx = int(0.5 * len(val_x))

In [29]:
# Split the held-out rows at test_idx: the first part stays the validation
# set, the second part becomes the test set. The test slices are taken first
# because val_x / val_y are rebound right after.
test_x = val_x[test_idx:]
test_y = val_y[test_idx:]
val_x = val_x[:test_idx]
val_y = val_y[:test_idx]

In [30]:
# Report the shape of each feature split.
for label_fmt, split in (
    ("Trainset:\t\t{}", train_x),
    ("Validation set:\t{}", val_x),
    ("Test set: \t\t{}", test_x),
):
    print(label_fmt.format(split.shape))

Trainset:		(20000, 200)
Validation set:	(2500, 200)
Test set: 		(2500, 200)


## グラフの定義

In [31]:
# Size argument passed to BasicLSTMCell.
lstm_size = 256
# Number of stacked LSTM layers in the MultiRNNCell.
lstm_layers = 1
# Number of reviews per batch; also the batch size of the RNN's zero state.
batch_size = 500
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001

In [44]:
# Number of rows in the embedding table: word ids start at 1, so one extra
# row is needed to cover id 0.
n_words = 1 + len(vocab_to_int)

graph = tf.Graph()
with graph.as_default():
    # Placeholders are filled in at run time through feed_dict.
    inputs_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    labels_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')
    # Keep probability used by the dropout wrapper around the LSTM cell.
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

In [45]:
#分析する単語を限定して計算効率化する

In [46]:
# Length of each word vector.
embed_size = 300

with graph.as_default():
    # Trainable embedding table, initialised from a uniform distribution
    # between -1 and 1.
    initial_embedding = tf.random_uniform(shape=(n_words, embed_size), minval=-1, maxval=1)
    embedding = tf.Variable(initial_embedding)
    # Replace every word id in the input batch by its embedding vector.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [47]:
# The embedding table: a 74073 x 300 matrix initialised with values between -1 and 1.
embedding

<tf.Variable 'Variable:0' shape=(74073, 300) dtype=float32_ref>

In [48]:
# Tensor holding the embedding vectors looked up for the input word ids.
embed

<tf.Tensor 'embedding_lookup:0' shape=(?, ?, 300) dtype=float32>

In [49]:
def _build_lstm_cell():
    """Create one LSTM layer wrapped with dropout on its outputs.

    A new cell object is built on every call so that each layer of the
    stacked RNN gets its own cell instead of the same instance repeated.
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)


with graph.as_default():
    # Build the layers one by one: `[drop] * lstm_layers` would put the very
    # same cell object in every layer, which breaks once lstm_layers > 1.
    cell = tf.contrib.rnn.MultiRNNCell([_build_lstm_cell() for _ in range(lstm_layers)])
    # All-zero starting state for a batch of `batch_size` sequences.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [50]:
with graph.as_default():
    # Run the RNN over the embedded input sequences, starting from
    # `initial_state`; returns the per-step outputs and the final state.
    rnn_result = tf.nn.dynamic_rnn(cell=cell, inputs=embed, initial_state=initial_state)
    outputs, final_state = rnn_result

In [51]:
with graph.as_default():
    # Only the RNN output at the last time step feeds the classifier.
    last_output = outputs[:, -1]
    # Fully connected layer with a single sigmoid unit.
    predictions = tf.contrib.layers.fully_connected(
        inputs=last_output, num_outputs=1, activation_fn=tf.sigmoid)
    # Mean squared error between the labels and the sigmoid outputs.
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)
    # Training op: one Adam update step that minimises the cost.
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = adam.minimize(cost)

## 学習精度の計測

In [52]:
with graph.as_default():
    # Round the sigmoid output to a hard 0/1 prediction and compare it with
    # the labels element-wise.
    predicted_labels = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_labels, labels_)
    # Accuracy is the fraction of matching predictions.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

## バッチを返す関数を定義

In [61]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive (x, y) batches of `batch_size` items each.

    Args:
        x: review data; must support len() and slicing.
        y: the matching ground-truth labels; must support slicing.
        batch_size: number of items per batch.

    Yields:
        Tuples (x_batch, y_batch) holding the same slice of both inputs.
        Items at the end of `x` that do not fill a whole batch are dropped.
    """
    # Trim both inputs to a whole number of batches.
    usable = (len(x) // batch_size) * batch_size
    x, y = x[:usable], y[:usable]
    # Being a generator, each request for the next batch resumes the loop at
    # the following start offset.
    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

## トレーニング

In [62]:
# Number of full passes over the training set.
epochs = 10
with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    # Make sure the checkpoint directory exists before the first save;
    # MakeDirs succeeds if the directory is already there.
    tf.gfile.MakeDirs("checkpoint")
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero LSTM state.
        state = sess.run(initial_state)
        for x, y in get_batches(train_x, train_y, batch_size):
            # y[:, None] turns the label vector into a column so that it
            # matches the two-dimensional labels placeholder.
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this batch is fed as the initial state of
            # the next batch in the same epoch.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration % 5 == 0:
                # Epochs are reported 1-based so the last one reads epochs/epochs.
                print("Epoch:{}/{}".format(e + 1, epochs),
                      "Iteration:{}".format(iteration),
                      "Training Loss:{:.3f}".format(loss))

            if iteration % 25 == 0:
                val_acc = []
                # Reuse the existing zero-state tensors instead of calling
                # cell.zero_state() again, which would add new ops to the
                # graph on every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    val_feed = {inputs_: val_batch_x,
                                labels_: val_batch_y[:, None],
                                keep_prob: 1,
                                initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=val_feed)
                    val_acc.append(batch_acc)
                print("Value Acc:{:.3f}".format(np.mean(val_acc)))

            iteration += 1
        # Save a checkpoint at the end of every epoch (overwriting the last one).
        saver.save(sess, "checkpoint/sentiment.ckpt")

Epoch:0/10 Iteration:5 Training Loss:0.240
Epoch:0/10 Iteration:10 Training Loss:0.235
Epoch:0/10 Iteration:15 Training Loss:0.213
Epoch:0/10 Iteration:20 Training Loss:0.217
Epoch:0/10 Iteration:25 Training Loss:0.214
Value Acc:0.664
Epoch:0/10 Iteration:30 Training Loss:0.204
Epoch:0/10 Iteration:35 Training Loss:0.194
Epoch:0/10 Iteration:40 Training Loss:0.196
Epoch:1/10 Iteration:45 Training Loss:0.159
Epoch:1/10 Iteration:50 Training Loss:0.233
Value Acc:0.608


KeyboardInterrupt: 

In [63]:
# Evaluate the most recent checkpoint on the test set, batch by batch.
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoint'))
    # Reuse the existing zero-state tensors rather than adding a new
    # zero_state op to the graph.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        # Dropout is disabled (keep_prob = 1) during evaluation; the LSTM
        # state is carried over from one test batch to the next.
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # Mean of the per-batch accuracies.
    print("Test Accuracy:{:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoint\sentiment.ckpt
Test Accuracy:0.739
