In [89]:
#encoding=utf-8
import sys
import itertools
import operator
import numpy as np
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

# 读取数据

In [286]:
sentences = []
labels = []
x = []
y = []

char2id = {}
id2char = {}
label2id = {}
id2label = {}
char_id = 0
label_id = 0

with open("data/train.txt", "rb") as infile:
    for row in infile:
        row = row.strip().decode("utf-8")
        items = row.split()
        if len(items) == 0:
            sentences.append(x)
            labels.append(y)
            x = []
            y = []
        else:
            i = char2id.setdefault(items[0], char_id)
            id2char.setdefault(i, items[0])
            if i == char_id:
                char_id += 1
            j = label2id.setdefault(items[1], label_id)
            id2label.setdefault(j, items[1])
            if j == label_id:
                label_id += 1
            x.append(i)
            y.append(j)
print "data size: " + str(len(sentences))

data size: 29


In [287]:
sentences = np.array(sentences)
labels = np.array(labels)
shuffle_data = True
if shuffle_data:
    sh = np.arange(len(sentences))
    np.random.shuffle(sh)
    sentences = sentences[sh]
    labels = labels[sh]
    
train_size = int(len(sentences) * 0.8)
test_size = int(len(sentences) * 0.2)

X_train = sentences[:train_size]
y_train = labels[:train_size]
X_test = sentences[train_size:]
y_test = labels[train_size:]

vocabulary_size = len(id2char.keys())
label_size = len(id2label.keys())

print "vocabulary size: " + str(vocabulary_size)
print "label size: " + str(label_size)
print "train size: " + str(train_size)
print "test size: " + str(test_size)

vocabulary size: 206
label size: 30
train size: 23
test size: 5


# 模型

In [273]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=20, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [274]:
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because need them later.
    # We add one additional element for the initial hidden, which we set to 0
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indxing U by x[t]. This is the same as multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

In [275]:
def predict(self, x):
    # Perform forward propagation and return index of the highest score
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [276]:
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Divide the total loss by the number of training examples
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [277]:
# Limit to 1000 examples to save time
print "Expected Loss for random predictions: %f" % np.log(vocabulary_size)
print "Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000])

Expected Loss for random predictions: 5.327876
Actual loss: 0.857475


In [278]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [279]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to checker if these are correct.
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error > error_threshold:
                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
                print "+h Loss: %f" % gradplus
                print "-h Loss: %f" % gradminus
                print "Estimated_gradient: %f" % estimated_gradient
                print "Backpropagation gradient: %f" % backprop_gradient
                print "Relative Error: %f" % relative_error
                return 
            it.iternext()
        print "Gradient check for parameter %s passed." % (pname)

RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.




In [280]:
# Performs one step of SGD.
def numpy_sdg_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [281]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print "%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss)
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print "Setting learning rate to %f" % learning_rate
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [288]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train, y_train, nepoch=500, evaluate_loss_after=1)

2016-07-19 10:27:10: Loss after num_examples_seen=0 epoch=0: 5.324849
2016-07-19 10:27:10: Loss after num_examples_seen=23 epoch=1: 5.313896
2016-07-19 10:27:10: Loss after num_examples_seen=46 epoch=2: 5.300916
2016-07-19 10:27:10: Loss after num_examples_seen=69 epoch=3: 5.284040
2016-07-19 10:27:10: Loss after num_examples_seen=92 epoch=4: 5.259822
2016-07-19 10:27:10: Loss after num_examples_seen=115 epoch=5: 5.215450
2016-07-19 10:27:10: Loss after num_examples_seen=138 epoch=6: 4.833301
2016-07-19 10:27:10: Loss after num_examples_seen=161 epoch=7: 3.980198
2016-07-19 10:27:10: Loss after num_examples_seen=184 epoch=8: 3.610934
2016-07-19 10:27:10: Loss after num_examples_seen=207 epoch=9: 3.420193
2016-07-19 10:27:10: Loss after num_examples_seen=230 epoch=10: 3.297428
2016-07-19 10:27:11: Loss after num_examples_seen=253 epoch=11: 3.209645
2016-07-19 10:27:11: Loss after num_examples_seen=276 epoch=12: 3.140909
2016-07-19 10:27:11: Loss after num_examples_seen=299 epoch=13: 3.0

# 评测

In [289]:
hit = 0
num = 0
for record in zip(X_test, y_test):
    x = record[0]
    y_true = record[1]
    y_pred = model.predict(x)
    print ''.join([id2char[val] for val in x])
    print [id2label[val] for val in y_pred]
    for i in range(len(y_true)):
        if y_true[i] == label2id["OOO"]:
            continue
        if y_true[i] == y_pred[i]:
            hit += 1
        num += 1
print "accuracy: " + str(1.0 * hit / num)

右侧肾脏及右侧输尿管未见异常。
[u'EOB', u'EOM', u'EOE', u'EAB', u'EAE', u'EOB', u'EOE', u'OOO', u'EOM', u'EAE', u'MNB', u'MNE', u'MSB', u'MSE', u'OOO']
心脏可见增大，心包可见少量积液。
[u'EOB', u'EOE', u'OOO', u'OOO', u'MSB', u'MSE', u'OOO', u'EAB', u'EAE', u'OOO', u'EAB', u'OOO', u'EOM', u'EOE', u'EPSE', u'OOO']
部分椎体前后缘轻度骨质增生。
[u'MSB', u'EAB', u'EAE', u'EOM', u'EOE', u'MSB', u'EAB', u'OOO', u'MSB', u'EPSB', u'MSB', u'MSE', u'EPSE', u'OOO']
脊柱轻度侧弯。
[u'OOO', u'OOO', u'EAB', u'OOO', u'OOO', u'EOE', u'OOO']
下胸腔见金属缝合影。
[u'EEB', u'EEM', u'EEM', u'OOO', u'OOO', u'EOB', u'EOM', u'EOE', u'MSE', u'OOO']
右侧额叶小片状低密度影，边界清。
[u'EOB', u'EOM', u'EOM', u'EOM', u'EOE', u'EAB', u'EAE', u'MSB', u'MSE', u'MWE', u'EEPE', u'OOO', u'EAB', u'EAE', u'MSB', u'EAE']
accuracy: 0.327868852459
