In [56]:
from html.parser import HTMLParser
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import CuDNNLSTM, Dense, Input, TimeDistributed
from functools import partial
import pandas as pd
import numpy as np
import keras

In [92]:
# Data
# -----
# - html curled from: https://bnf.nice.org.uk/drug/
# - To obtain links: only the relevent links have `.html` suffix on thier hrefs
# - To obtain drugs: only the drugs are ALL CAPS.
# - hacks
#   - Last 3 drugs obtained using the above method are no good
#   - First link obtained using the above method is no good
#   - I Amended ANTI-D (RH0) IMMUNOGLOBULIN entry in the html to remove <sub> tag

class RxParser(HTMLParser):
    
    def __init__(self):
        super().__init__()
        self.links = []
        self.drugs = []
    
    def handle_starttag(self, tag, attrs):
        if len(attrs) > 0 and attrs[0][0] == 'href' and attrs[0][1][-5:] == '.html':
            self.links.append(attrs[0][1])
            
    def handle_data(self, data):
        if data.isupper() and len(data) > 1:
            self.drugs.append(data)
    
    def feed(self, f):
        super().feed(f)
        return (self.drugs[:-3], self.links[1:])
        
f = open('./data/drugs.html').read()
p = RxParser()
drugs, links = p.feed(f)
assert(len(drugs) == len(links))
drugs, links = pd.Series(drugs), pd.Series(links)

# Curation
# --------
# It is possible to generate plausible drug names with fewer hidden units in the lstm
# and fewer tranining epochs by removing some of the trickier examples from the tranining
# set. For example, removing training examples which contain:
#   - brackets or commas
#   - long 'compound' drug names i.e: "x with y and z"
#   - apostrophes, there is only one: "St John's Wort"
#   - forward slashes, there are two: ADRENALINE/EPINEPHRINE and NORADRENALINE/NOREPINEPHRINE
#   - accented letter É, there are two: BACILLUS CALMETTE-GUÉRIN and BACILLUS CALMETTE-GUÉRIN VACCINE

# very selective (uncomment and comment the `quite selective section`):
# drop_idxs = drugs.str.contains(r"/|'|WITH|AND|É|,|\(", regex=True)
# drugs, links = drugs[~drop_idxs], links[~drop_idxs]

# quite selective:
drop_idxs = drugs.str.contains(r",|\(", regex=True)
drugs, links = drugs[~drop_idxs], links[~drop_idxs]

# not at all selective (don't drop any examples)
# ...


assert(len(drugs) == len(links))
print(f'number of drugs: {len(drugs)}')

# Encoding
# --------

def to_categorical(batch, num_classes):
    b, l = batch.shape
    out = np.zeros((b, l, num_classes))
    for i in range(b):
        seq = batch[i, :]
        out[i, :, :] = keras.utils.to_categorical(seq, num_classes=num_classes)
    return out

def rx_data(words):
    chars = sorted(set(''.join(words)))
    chars = ['START', 'END'] + chars
    char_idx = { ch:i for i,ch in enumerate(chars) }
    idx_char = { i:ch for i,ch in enumerate(chars) }
    print(f'number of characters: {len(chars)}')
    x = ([[char_idx[c] for c in w] for w in words])
    max_len = max([len(s) for s in x])
    print(f'longest word: {max_len}')
    x = sequence.pad_sequences(x, max_len, padding='post', value=1)
    n = x.shape[0]
    x_in = np.concatenate([np.zeros([n, 1]), x[:, :-1]], axis=1)
    x_out = x
    assert x_in.shape == x_out.shape
    x_in = to_categorical(x_in, len(chars))
    x_out = to_categorical(x_out, len(chars))
    return idx_char, max_len, chars, x_in, x_out

idx_char, max_len, chars, x_in, x_out = rx_data(drugs)
num_chars = len(chars)

number of drugs: 1555
number of characters: 38
longest word: 101


In [119]:
# Training Model
# --------------

hidden_size = 64
ins = Input(shape=[None, num_chars])
lstm = CuDNNLSTM(hidden_size, return_sequences=True, return_state=True)
hs, _, _ = lstm(ins)
dense = Dense(num_chars, activation='softmax')
outs = dense(hs)
train_model = Model(inputs=ins, outputs=outs)

# Inference Model
# ---------------

inf_model_h = Input(shape=(hidden_size,))
inf_model_c = Input(shape=(hidden_size,))
hs, h, c = lstm(ins, initial_state=[inf_model_h, inf_model_c])
outs = dense(hs) 
inf_model = Model(inputs=[ins, inf_model_h, inf_model_c],
                  outputs=[outs, h, c])

def generate():
    h0 = np.zeros([1, hidden_size])
    c0 = np.zeros([1, hidden_size])
    x0 = np.zeros([1, 1, num_chars])
    x0[0, 0, 0] = 1
    word = ''
    stop_condition = False
    while not stop_condition:
        out, h, c = inf_model.predict([x0, h0, c0])
        idx = np.random.choice(range(num_chars), p=(out[0, -1, :]))
        char = idx_char[idx]        
        if char == 'END' or len(word) > max_len:
            stop_condition = True
            break
        
        word += char
        x0 = np.zeros([1, 1, num_chars])
        x0[0,0,idx] = 1
        h0 = h
        c0 = c
        
    return word
        
def generate_print_repeat(n):
    for _ in range(n):
        print(generate())
    
            
print('pre-training samples:')
generate_print_repeat(5)

pre-training samples:
B5QPLH
LÉU0MB'V3WDGVKGRXLERWMSTART1730T1F-LFV7 

EÉYDDFKS
ATDW J 7


In [120]:
# Train the model
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
train_model.compile(opt, 'categorical_crossentropy')
generate_stuff = keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: generate_print_repeat(5))
train_model.fit(x_in,
                x_out,
                epochs=100,
                batch_size=32,
                callbacks=[generate_stuff])

Epoch 1/100

F7VUR/
BÉWÉ/1M
STARTSTARTZZBTM
1UKFU57EBB
Epoch 2/100
KI5E 
1XLÉ0HSTART7M

XSL-I577MÉ5H
FKX3
Epoch 3/100
1DRRRHAA5UAOIFTT
J /NJSTARTAZRDI7D ADOL
5
XÉ
RUI5ÉOIMAP
Epoch 4/100
TIN
GDSTART DCJE7XVTGAO
5FDZ1ITHCE
B7NLXNBL 
GETOCEFRIOBGE
Epoch 5/100
'STRAUO RMY
ANRMIWUERIZHT
D
Y WKVO0
/START/1AGKOUVADASM
Epoch 6/100
ITEIEHLRLAOE AEABA
VNRC E 5
ZI3LZINZ
 -'CYHTZTTT AE
STARTPAONTPNO EESTARTMTAYI0MOIRCR
Epoch 7/100
A

ZHKVEAHNPERXI
1É7NRFUEHI
WÉVOREOOCSAAIE
Epoch 8/100
NN0/RDOFRIOTPRAH
SOTSON
UBR7ÉNAINTHIOBRDHETROPTMAICARUE
1S PIAAOHLAE YC
O
Epoch 9/100
AT1OCAIC TR
RAJIUHEDL
V EEIIILIOWROEBNEE
DAKONNOMR OAIUDE NEME LEMIIDPMI
-INLNUANINA
Epoch 10/100
UON5ODOHRHMBOPSA
BRERCONDSIAOTNI
XYUROLLOED
IVOL TEDISOPU IPDECLCAC OSEINU UINFN OO COJCIDMI

Epoch 11/100
 VNWLINOSANSZEXD
CTRIPTINM
RSBCAITES
L5IEAIINN
CPOTLCALOOO
Epoch 12/100
NJIEPCIAE CLITESE
JUASONUÉETSTARTI
ACRAL
YAFEHCRLOSIU
LJ7CW
Epoch 13/100
URDOTMICOEE
UIL OORITEF
LEMOCBBECCCITA
KNOMYDOSI 
OZARAPMUENE
Epoch 14/100
FIBOFPRERIE

TROXIXOSTABINE HYDROGROLIGLINE
TAFARCATINE
ITEFASCORINE
ICETIFLORAC HLOAME
DACPOTASOLU DIZACIB
Epoch 52/100
FOLUMASTRINE HYDRODRIONE
NROMEN AZIFE AFOC ACTE
PATROPAZDICEL
ETHOE SENATE
LOMRETAMIDE
Epoch 53/100
PEACHYTRYPROSIDE
BERBINCAMOL
YOBINCOLBONE
METOSLAZIXE
CAMOGSIDIN
Epoch 54/100
METHLONYDEPHLORUMAC
CLOOFANVIR
CALBENORIDE
LILFONAIDE
CYNCTOCIL
Epoch 55/100
XIAVIDROLE
ESTURAZOL ACDIDE
RANBIPAMI
PARISEID WITH COLMACATOC ACIGL WITH PRETTOMYCYL NE
ESTISOTER MIDROSUVIR
Epoch 56/100
CASONFULOC LABATIDE
UVINATE-OTRIDOL WITH GLOMEUN HYDROCHLORESITID CAICATI
ULMYDICOLOL WITH ULUMLATHPAME
ACLENMINE HYDROCHLORIDE
DOSENEONITEXYSOC DIUM CHLORID WITH SEPARAFN
Epoch 57/100
POCALDIORI
SALITRARIDONE CEDITIN
REBOTHAMIB
TAMXETOLID
CAFERABINE
Epoch 58/100
VIRETAMATETE
DPEDAPNOMILE
CACCOFERAM
PORINOMEROL
THODOAZINE
Epoch 59/100
PIVOFLORTENAL
BITALSTAT
RWATOMILEAFENE
DOPAZOLINE TYRPOHYDROLE
NOFYLERONONES
Epoch 60/100
ISDENAXIN HYDRACHLORIDE
CIRCOLOXIN
PREPHADIBETE
POELUFTRAZOL
IDOSTAMILE WITH COLIQULOL


TRENTASTENE
VIRAZENAS
DISOTIUM BROPATE
HESTROPTODONE WITH ALECOID BANCATE
TANSIOL
Epoch 99/100
FLOXYPRAZONE DEOSIUM
MEVERTLIBANES
SEDICOLCATE
SODIUM WITH TPONOSUFAR ARTFONID
LIGELZIDE
Epoch 100/100
TROAMILONE DIXOMUCYL
LACLOMYCTROL YDRECATRELANE AND CO-TIVOCIUM AL HYDRSTHENTAG
IBUCALPIDINE WITH BOFESIN XACTETA WITH BIMAQUINE
ETRAMLITIM
METARBOTUMIB


<keras.callbacks.History at 0x7f532e824a20>

In [127]:
# Back of napkin acceptance tests
# -------------------------------

examples = [generate() for i in range(1000)]

# eyeball some examples
for e in examples[:25]:
    print(e)
    
# 1. Don't generate names that are in the training set.
# 2. Don't start or end with 'AND' or 'WITH'

# Accept 0.005% regurgitation of training set
# Accept 0.005% grammatically incorrect examples - that start with 'AND' or 'WITH'

# Empirically these thresholds reult in: 
# - generatation of strong plausible names
# - captures interesting single-word patters i.e: class suffixes like: `ate`, `lol` and `one`
# - captures interesting multi-word patterns i.e: 'x with y and z' and salts like 'X hydrochloride'

real = set(drugs).intersection(set(examples))
if len(real) > 0:
    print('----')
    print('training set regurgitation:')
    print(real)
assert(len(real) < 5)

example_series = pd.Series(examples)
a = example_series.str.startswith('WITH').sum()
b = example_series.str.endswith('WITH').sum()
c = example_series.str.endswith('AND').sum()
d = example_series.str.startswith('AND').sum()
tot = a + b + c + d
if tot > 0:
    print('----')
    print('number of and/with start/ends:')
    print(tot)
assert(tot < 5)


ACLIMOTOL WITH FORMENTROL WITH PINTRIGUNIC ACID
DACLOTEROAL WITH TEVICINZOLAB
METRAMEZEL
TAFLUOTASTE
VALLIGUXTRIN
FALRGENAZOLE
GELPOED IVARTAN ENSOFEND
ATIRANIN
ILINUDINE
COBIAM E DICLOTERDATE 
ILFURATIDE
SOPRENIUM HLOPIRATE AND DROMICLIRONE HYDYCOLLOLOBE
RACITHIUM BROCETAL
ELTEXTRABLONE
CETICARER
LONHEXETIN MYDETANE
COLLIDESOLINITETANGE
ETHIZUDEPATIMUBNIN WITH PRICOLAMINE HYDROCHLORIDE
FOLUTAMIDE
SOLCITA CITA
CEPROXIMOL
TEGOPROCIN
LIDOPRAMIDE WITH ALEBETHYRCENE CLLORIDE
DAPROSBINE
LUCOSIDINE MODEOTE
----
number of and/with start/ends:
2


In [128]:
train_model.save('./models/train_model/model.h5')
inf_model.save('./models/inf_model/model.h5')

OSError: Unable to create file (unable to open file: name = './models/train_model/model.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)