In [1]:
from html.parser import HTMLParser
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import LSTM, CuDNNLSTM, Dense, Input, TimeDistributed
from functools import partial
import pandas as pd
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
# Data
# -----
# - html curled from: https://bnf.nice.org.uk/drug/
# - To obtain links: only the relevent links have `.html` suffix on thier hrefs
# - To obtain drugs: only the drugs are ALL CAPS.
# - hacks
#   - Last 3 drugs obtained using the above method are no good
#   - First link obtained using the above method is no good
#   - I Amended ANTI-D (RH0) IMMUNOGLOBULIN entry in the html to remove <sub> tag

class RxParser(HTMLParser):
    
    def __init__(self):
        super().__init__()
        self.links = []
        self.drugs = []
    
    def handle_starttag(self, tag, attrs):
        if len(attrs) > 0 and attrs[0][0] == 'href' and attrs[0][1][-5:] == '.html':
            self.links.append(attrs[0][1])
            
    def handle_data(self, data):
        if data.isupper() and len(data) > 1:
            self.drugs.append(data)
    
    def feed(self, f):
        super().feed(f)
        return (self.drugs[:-3], self.links[1:])
        
f = open('./data/drugs.html').read()
p = RxParser()
drugs, links = p.feed(f)
assert(len(drugs) == len(links))
drugs, links = pd.Series(drugs), pd.Series(links)

# Curation
# --------
# It is possible to generate plausible drug names with fewer hidden units in the lstm
# and fewer tranining epochs by removing some of the trickier examples from the tranining
# set. For example, removing training examples which contain:
#   - brackets or commas
#   - long 'compound' drug names i.e: "x with y and z"
#   - apostrophes, there is only one: "St John's Wort"
#   - forward slashes, there are two: ADRENALINE/EPINEPHRINE and NORADRENALINE/NOREPINEPHRINE
#   - accented letter É, there are two: BACILLUS CALMETTE-GUÉRIN and BACILLUS CALMETTE-GUÉRIN VACCINE

# very selective (uncomment and comment the `quite selective section`):
# drop_idxs = drugs.str.contains(r"/|'|WITH|AND|É|,|\(", regex=True)
# drugs, links = drugs[~drop_idxs], links[~drop_idxs]

# quite selective:
drop_idxs = drugs.str.contains(r",|\(", regex=True)
drugs, links = drugs[~drop_idxs], links[~drop_idxs]

# not at all selective (don't drop any examples)
# ...


assert(len(drugs) == len(links))
print(f'number of drugs: {len(drugs)}')

# Encoding
# --------

def to_categorical(batch, num_classes):
    b, l = batch.shape
    out = np.zeros((b, l, num_classes))
    for i in range(b):
        seq = batch[i, :]
        out[i, :, :] = keras.utils.to_categorical(seq, num_classes=num_classes)
    return out

def rx_data(words):
    chars = sorted(set(''.join(words)))
    chars = ['START', 'END'] + chars
    char_idx = { ch:i for i,ch in enumerate(chars) }
    idx_char = { i:ch for i,ch in enumerate(chars) }
    print(f'number of characters: {len(chars)}')
    x = ([[char_idx[c] for c in w] for w in words])
    max_len = max([len(s) for s in x])
    print(f'longest word: {max_len}')
    x = sequence.pad_sequences(x, max_len, padding='post', value=1)
    n = x.shape[0]
    x_in = np.concatenate([np.zeros([n, 1]), x[:, :-1]], axis=1)
    x_out = x
    assert x_in.shape == x_out.shape
    x_in = to_categorical(x_in, len(chars))
    x_out = to_categorical(x_out, len(chars))
    return idx_char, max_len, chars, x_in, x_out

idx_char, max_len, chars, x_in, x_out = rx_data(drugs)
num_chars = len(chars)

number of drugs: 1555
number of characters: 38
longest word: 101


In [4]:
# Training Model
# --------------

hidden_size = 64
ins = Input(shape=[None, num_chars])
lstm = CuDNNLSTM(hidden_size, return_sequences=True, return_state=True)
hs, _, _ = lstm(ins)
dense = Dense(num_chars, activation='softmax')
outs = dense(hs)
train_model = Model(inputs=ins, outputs=outs)

# Inference Model
# ---------------

inf_model_h = Input(shape=(hidden_size,))
inf_model_c = Input(shape=(hidden_size,))
hs, h, c = lstm(ins, initial_state=[inf_model_h, inf_model_c])
outs = dense(hs) 
inf_model = Model(inputs=[ins, inf_model_h, inf_model_c],
                  outputs=[outs, h, c])

def generate():
    h0 = np.zeros([1, hidden_size])
    c0 = np.zeros([1, hidden_size])
    x0 = np.zeros([1, 1, num_chars])
    x0[0, 0, 0] = 1
    word = ''
    stop_condition = False
    while not stop_condition:
        out, h, c = inf_model.predict([x0, h0, c0])
        idx = np.random.choice(range(num_chars), p=(out[0, -1, :]))
        char = idx_char[idx]        
        if char == 'END' or len(word) > max_len:
            stop_condition = True
            break
        
        word += char
        x0 = np.zeros([1, 1, num_chars])
        x0[0,0,idx] = 1
        h0 = h
        c0 = c
        
    return word
        
def generate_print_repeat(n):
    for _ in range(n):
        print(generate())
    
            
print('pre-training samples:')
generate_print_repeat(5)

pre-training samples:
' 3KY1RZ'1OCD3PPSTARTCOC'
NMOM
BSTARTHDBYJHNTNT3YCWSSEYSTARTNZ
É- U7G'R
KW-IIVIOPY  /EFVSTARTXTY0YMMSCLFCFNS5Y7Q/ILP7VDGSTART-VEAFSXKB


In [5]:
# Train the model
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
train_model.compile(opt, 'categorical_crossentropy')
generate_stuff = keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: generate_print_repeat(5))
train_model.fit(x_in,
                x_out,
                epochs=100,
                batch_size=32,
                callbacks=[generate_stuff])

Epoch 1/100
ÉRYY'FIO
JHHDII/K1OSTART
Y0STARTC-
01ÉRQAD
1BRJAVUH0
Epoch 2/100
MGP0EVIE/M EZ3EVOO
LZMYKSU
GPQEA0CY GUU5EU3AKS
SD STARTKRPEBQ51P3NSTARTJII

Epoch 3/100
P/BZDS'LUI
IVEGESAU
P'11L/I
-X/DFCIE/B
JKJJDJEJEL7IM T
Epoch 4/100
M7RLSTART LFOEI
ÉQSN  OE PI
HPKSTART FMI EILNIOX
MÉKTIHG' NUO
V'BCEÉPITH
Epoch 5/100
DLY RH
QEVELCAON
GSTARTRNA
77 NXRIAO
HPU TTBS
Epoch 6/100
WH ÉMEAD ENSSNUPOITI A NSPIEIC
TDXULOIAOPM D
ACS77ONCQ
NF UU3HRIEEINGIEHCWTO ENDSEZULNDP
S'WAS
Epoch 7/100
WIIOIMIAAITH DH HTIRECR
STARTRTBGLOEIUOD
YNR3
XCFZYHHA
S1BOIPLIEE
Epoch 8/100
XCACIKCHON
U-LOCRAENN
/ESELMVIETGUEE RA
'1ZTT IEQLIUH
5BLA-NIISSUCT
Epoch 9/100
SXAIM3RLFMIX
AAOLYOPR
ELTA ONITRE TH NAAXR
RXGPNSBIMS
I7NLTRULTTIDTHOTE
Epoch 10/100
MAAOX'IIEN
OSUAFOIRADZ
SIWBMI CIONPPFEFE
X3RIMAET
T/RRALLIIAE
Epoch 11/100
TFOLEMRLADEI
ACDNRADIW
UBLEAUHSNB
OMPRRHCDLINOHYYHFRTLAROD
A RAIARTEIC
Epoch 12/100
VT ANEEAN 
ARSEIRE
BVATIDIMITHE  WHHADUDWIR THDRYOI EPORE
RAERETMNE
QALETICE TECO
Epoch 13/100
SIMOOIINE DFIDIWT3SID

SEL ANG INOS DETECALE
PJAPATIN SOCIOTIZADE
VALODYRAMETHANEM
TICHODIBE
LORNIPAMYBE
Epoch 50/100
ACETACIT CICACITE
CALSINE HYDROCHLORIDE
SFOMEN CYSIN MYDRIDOLH HYDROCHLORIDEVE
DINOPROMINE
ETIMENUL
Epoch 51/100
RARRECYLONE EDULFACEL
LIDIUMOLE
STARTETHROPHYDRONE FROCROCIDE
CLCORSIDE MIDE HESTRATE
FORSITEMIN
Epoch 52/100
DIOTLIMIDINUM BAROSUNATE
OVIRSOUM CIOMIN TAMOL ABENATENOS DROC STIMINE
TANOCHIPE
ADKENAZUMLACEN
TERFINROMETA
Epoch 53/100
DISTARAMET
CHIDOPEONANE WITH MENESARIFE
MAKENIS
EROINILOL
VERNOZAVIDR WITH OMERTENRATE
Epoch 54/100
FLENOGSINE
PENAGERASTER
CHPHOSOSOL AND CONEGONAZIDE
DADOCULINOL WITH SORABUCAL
LETHILACACATINE
Epoch 55/100
LO-HYDROCOPR SIDAFE
CIRFFLOTABATE
ENMOSPIRM
CILULIMAB
ENOPMEXINE
Epoch 56/100
CUFLAMATIN SUD
GERI-TACOST SON WITH METOSTATEIN
FLOFENINS WITH LENDOGREPLER
EPENAVIR AL HYDROCHLORET
BETHENOXOMINEM
Epoch 57/100
TAMALFAVIR
CALSODIUN WITH ELCHLORETATIE
MEZLAZOMEIN LIGRON
BUPOVERAMILE CORODORINA MDULUTASTARAMI
TANVILASIUM HYDROCHLERIDE
Epoch 58/100
LETERIM 

FLUDOPENONISIUM CHLORIDE
COYDAUM COLRIDES
Epoch 95/100
TRAZINAMOCL
-ARMECORTIS  WITH TESTRAZINE
HYPRODLOTEANE SODIUM AMB ENSULIN AND PENSOIX SPALIN
MERGREXTAM
TOHALYSTRAMIDE
Epoch 96/100
FILCINCACIN 
HYDROCHROZIDIZAMENE
DUPENABRAMENIDE BROMIDE
LISOLININM
LOLPAPRILY
Epoch 97/100
PROCAFUNAVIR
FERVOITAC INK CANCABILUMARB
LINOSPRIN
METIPROPROLN FERAQU3
EVALINODINE
Epoch 98/100
KETIMOL PHLARODE
PALANGIL PERAMID ALDISUMAB
HLEPRAZAPETENOL HYDROCHLORIDE
HYDROROCHLORIDOATINE
FLEVORTERY1TEROST
Epoch 99/100
PATIPINA
XINODIOLE
FENORUMINE
WITHIMOL WITH CLISTERAM
ALCINABONE
Epoch 100/100
RASULIGUPRAS
BITROPIOL LUMDITRABEN
GEMESORAM
MENEGLICLIN GHYCCLORE
BUCIDABOLIN


<keras.callbacks.History at 0x7f5f785e8a20>

In [6]:
# Back of napkin acceptance tests
# -------------------------------

examples = [generate() for i in range(1000)]

# eyeball some examples
for e in examples[:25]:
    print(e)
    
# 1. Don't generate names that are in the training set.
# 2. Don't start or end with 'AND' or 'WITH'

# Accept 0.005% regurgitation of training set
# Accept 0.005% grammatically incorrect examples - that start with 'AND' or 'WITH'

# Empirically these thresholds result in: 
# - generatation of strong plausible names without too much training set regurgitation
# - captures interesting single-word patters i.e: class suffixes like: `ate`, `lol` and `one`
# - captures interesting multi-word patterns i.e: 'x with y and z' and salts like 'X hydrochloride'

real = set(drugs).intersection(set(examples))
if len(real) > 0:
    print('----')
    print('training set regurgitation:')
    print(real)
assert(len(real) < 5)

example_series = pd.Series(examples)
a = example_series.str.startswith('WITH').sum()
b = example_series.str.endswith('WITH').sum()
c = example_series.str.endswith('AND').sum()
d = example_series.str.startswith('AND').sum()
tot = a + b + c + d
if tot > 0:
    print('----')
    print('number of and/with start/ends:')
    print(tot)
assert(tot < 5)


AMOLOLIDE
RONIFUMOL
ITUMINOLINE
SUPRICASOIN HYDROCHORODIDE ANDOCOUMAB
ALEZUMIN
SODICYLCINE WITH SEMICYLBINE
DESSURUC UMIDONOSTOGRATE WITH VICOLBINE
PROPIFEDINAZEGEN ACETATE
BINZULAST
SIDOL GUSTETRALE
AGLOCOLIN WITH NECLIRIDINE
DUPEROMUB
PROCETINONE WITH ERTRICLIPRADENY
BRITAPENTAIN WITH METYCOXYLANE BOMIBETON
NOXEZIDONE
IXYCCTINAZINE HYDROCHLORIDE
ISONALTROMID UMIR TANTROPIN WITH CICLOZOPIAMON AND TROMYCLOTRONION
TRANICULINE HYDROCHLORIDE
ALCINUL
CAGCUPIVAC AND ESTRIPLOSONIDE
CALSSIUM BUDONILIDE
DIUL SORENTRATE
EGLICULOCAT
OTHYRAC IDE WITH NOFETIN
CIHTHATINE
----
number of and/with start/ends:
1


In [8]:
# The destination for this model is tensorflow.js, which does not deal with CuDNNLSTM layers.
# Here, I do a little dance to load the weights from the CuDNNLSTM into an LSTM layer before
# serializing the model, so that I can convert it to tensorflow.js layers format downstream.

train_model.save_weights('./models/keras/train_weights.model')
hidden_size = 64
hack_ins = Input(shape=[None, num_chars])
hack_lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
hack_hs, _, _ = hack_lstm(hack_ins)
hack_dense = Dense(num_chars, activation='softmax')
hack_outs = hack_dense(hack_hs)
hack_train_model = Model(inputs=hack_ins, outputs=hack_outs)
hack_train_model.load_weights('./models/keras/train_weights.model')
hack_train_model.save('./models/keras/train_model.h5')