In [85]:
from html.parser import HTMLParser
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import CuDNNLSTM, Dense, Input, TimeDistributed
import pandas as pd
import numpy as np
import keras

In [81]:
# Data
# -----
# - html curled from: https://bnf.nice.org.uk/drug/
# - To obtain links: only the relevent links have `.html` suffix on thier hrefs
# - To obtain drugs: only the drugs are ALL CAPS.
# - hacks
#   - Last 3 drugs obtained using the above method are no good
#   - First link obtained using the above method is no good
#   - I Amended ANTI-D (RH0) IMMUNOGLOBULIN entry in the html to remove <sub> tag

class RxParser(HTMLParser):
    
    def __init__(self):
        super().__init__()
        self.links = []
        self.drugs = []
    
    def handle_starttag(self, tag, attrs):
        if len(attrs) > 0 and attrs[0][0] == 'href' and attrs[0][1][-5:] == '.html':
            self.links.append(attrs[0][1])
            
    def handle_data(self, data):
        if data.isupper() and len(data) > 1:
            self.drugs.append(data)
    
    def feed(self, f):
        super().feed(f)
        return (self.drugs[:-3], self.links[1:])
        
f = open('./data/drugs.html').read()
p = RxParser()
drugs, links = p.feed(f)
assert(len(drugs) == len(links))
drugs, links = pd.Series(drugs), pd.Series(links)

# Curation
# --------
# - dropping drugs with:
#   - brackets or commas
#   - long 'compound' drug names i.e: "x with y and z"
#   - apostrophes, there is only one: "St John's Wort"
#   - forward slashes, there are two: ADRENALINE/EPINEPHRINE and NORADRENALINE/NOREPINEPHRINE
#   - accented letter É, there are two: BACILLUS CALMETTE-GUÉRIN and BACILLUS CALMETTE-GUÉRIN VACCINE

drop_idxs = drugs.str.contains(r"/|'|WITH|AND|É|,|\(", regex=True)
drugs, links = drugs[~drop_idxs], links[~drop_idxs]
assert(len(drugs) == len(links))
print(f'number of drugs: {len(drugs)}')

# Encoding
# --------

def to_categorical(batch, num_classes):
    b, l = batch.shape
    out = np.zeros((b, l, num_classes))
    for i in range(b):
        seq = batch[i, :]
        out[i, :, :] = keras.utils.to_categorical(seq, num_classes=num_classes)
    return out

def rx_data(words):
    chars = sorted(set(''.join(words)))
    chars = ['START', 'END'] + chars
    char_idx = { ch:i for i,ch in enumerate(chars) }
    idx_char = { i:ch for i,ch in enumerate(chars) }
    print(f'number of characters: {len(chars)}')
    x = ([[char_idx[c] for c in w] for w in words])
    max_len = max([len(s) for s in x])
    print(f'longest word: {max_len}')
    x = sequence.pad_sequences(x, max_len, padding='post', value=1)
    n = x.shape[0]
    x_in = np.concatenate([np.zeros([n, 1]), x[:, :-1]], axis=1)
    x_out = x
    assert x_in.shape == x_out.shape
    x_in = to_categorical(x_in, len(chars))
    x_out = to_categorical(x_out, len(chars))
    return idx_char, max_len, chars, x_in, x_out

idx_char, max_len, chars, x_in, x_out = rx_data(drugs)
num_chars = len(chars)

number of drugs: 1281
number of characters: 34
longest word: 40


In [86]:
# Training Model
# --------------

hidden_size = 10
ins = Input(shape=[None, num_chars])
lstm = CuDNNLSTM(10, return_sequences=True, return_state=True)
hs, _, _ = lstm(ins)
dense = Dense(num_chars, activation='softmax')
outs = dense(hs)
train_model = Model(inputs=ins, outputs=outs)

# Inference Model
# ---------------

inf_model_h = Input(shape=(hidden_size,))
inf_model_c = Input(shape=(hidden_size,))
hs, h, c = lstm(ins, initial_state=[inf_model_h, inf_model_c])
outs = dense(hs) 
inf_model = Model(inputs=[ins, inf_model_h, inf_model_c],
                  outputs=[outs, h, c])

def generate():
    h0 = np.zeros([1, hidden_size])
    c0 = np.zeros([1, hidden_size])
    x0 = np.zeros([1, 1, num_chars])
    x0[0, 0, 0] = 1
    word = ''
    stop_condition = False
    while not stop_condition:
        out, h, c = inf_model.predict([x0, h0, c0])
        idx = np.random.choice(range(num_chars), p=(out[0, -1, :]))
        char = idx_char[idx]        
        if char == 'END' or len(word) > max_len:
            stop_condition = True
            break
        
        word += char
        x0 = np.zeros([1, 1, num_chars])
        x0[0,0,idx] = 1
        h0 = h
        c0 = c
        
    print(word)
    return word


def generate_n(n):
    for _ in range(n):
        generate()
        
generate_n(5)

InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNN' with these attrs.  Registered devices: [CPU], Registered kernels:
  <no registered kernels>

	 [[Node: cu_dnnlstm_1/CudnnRNN = CudnnRNN[T=DT_FLOAT, direction="unidirectional", dropout=0, input_mode="linear_input", is_training=true, rnn_mode="lstm", seed=87654321, seed2=0](cu_dnnlstm_1/transpose, cu_dnnlstm_1/ExpandDims_1, cu_dnnlstm_1/ExpandDims_2, cu_dnnlstm_1/concat_1)]]

Caused by op 'cu_dnnlstm_1/CudnnRNN', defined at:
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-86-a102b1e1dc51>", line 7, in <module>
    hs, _, _ = lstm(ins)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/keras/layers/recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/keras/engine/base_layer.py", line 457, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/keras/layers/cudnn_recurrent.py", line 90, in call
    output, states = self._process_batch(inputs, initial_state)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/keras/layers/cudnn_recurrent.py", line 517, in _process_batch
    is_training=True)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1544, in __call__
    input_data, input_h, input_c, params, is_training=is_training)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1435, in __call__
    seed=self._seed)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 922, in _cudnn_rnn
    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py", line 115, in cudnn_rnn
    is_training=is_training, name=name)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/home/paperspace/miniconda3/envs/rx/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'CudnnRNN' with these attrs.  Registered devices: [CPU], Registered kernels:
  <no registered kernels>

	 [[Node: cu_dnnlstm_1/CudnnRNN = CudnnRNN[T=DT_FLOAT, direction="unidirectional", dropout=0, input_mode="linear_input", is_training=true, rnn_mode="lstm", seed=87654321, seed2=0](cu_dnnlstm_1/transpose, cu_dnnlstm_1/ExpandDims_1, cu_dnnlstm_1/ExpandDims_2, cu_dnnlstm_1/concat_1)]]


In [83]:
# Train the model
opt = keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9)
train_model.compile(opt, 'categorical_crossentropy')
generate_stuff = keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: generate_n(5))
train_model.fit(x_in,
                x_out,
                #validation_split=1/10,
                epochs=20,
                batch_size=1,
                callbacks=[generate_stuff])

Epoch 1/20
COCAFINTRTEBE
UCANZIDAZORE
JLERRITINE 3PYLIMAL
DIN RIMITOLAL
GSOMBORIPONEAFIN
Epoch 2/20
XOTENORAMIUM
CIFFAANE
ETERGOLIME
SROSFERAMADE
FOROROFLONE
Epoch 3/20
BUCFIULUM
TEHPUIM SELAME
SLUSUSIB
F-LAZACASTE
SELABATIS
Epoch 4/20
ZINERORTIFE SUMOLE
MEMAICAFA
PROVODOBOL VHYCARTE
TERCLASINE
CAPLOBHURINE
Epoch 5/20
 231/1281 [====>.........................] - ETA: 39s - loss: 0.7774

KeyboardInterrupt: 

In [3]:
#train_model.save('.models/train_model/model.h5')
#inf_model.save('.models/inf_model/model.h5')

In [157]:
generate_n(5)

SOISAMINE BODRODE
TATININAZ FLORINE HAALB
ADIPTIDE
NGRLOTERER TRIONI-IDE
FROTINUZONINOPE
