In [119]:
from html.parser import HTMLParser
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import LSTM, Dense, Input, TimeDistributed
import pandas as pd
import numpy as np
import keras

In [131]:
# hacks
# - scraped from: https://bnf.nice.org.uk/drug/
# - To obtain links: use .html suffic on hrefs
# - To obtain drugs: use all caps.
# - Last 3 drugs no good
# - First link no good
# - Amended ANTI-D (RH0) IMMUNOGLOBULIN entry in the html to remove <sub> tag

class RxParser(HTMLParser):
    
    def __init__(self):
        super().__init__()
        self.links = []
        self.drugs = []
    
    def handle_starttag(self, tag, attrs):
        if len(attrs) > 0 and attrs[0][0] == 'href' and attrs[0][1][-5:] == '.html':
            self.links.append(attrs[0][1])
            
    def handle_data(self, data):
        if data.isupper() and len(data) > 1:
            self.drugs.append(data)
    
    def feed(self, f):
        super().feed(f)
        return (self.drugs[:-3], self.links[1:])
        
f = open('./data/drugs.html').read()
p = RxParser()
drugs, links = p.feed(f)
assert(len(drugs) == len(links))
drugs, links = pd.Series(drugs), pd.Series(links)

In [137]:
drugs[drugs.str.contains('É')]
drugs[drugs.str.contains('\'')]
drugs[drugs.str.contains('/')]

27            ADRENALINE/EPINEPHRINE
1081    NORADRENALINE/NOREPINEPHRINE
dtype: object

In [138]:
drop_idxs = drugs.str.contains('WITH')
drugs, links = drugs[~drop_idxs], links[~drop_idxs]
assert(len(drugs) == len(links))

In [139]:
def to_categorical(batch, num_classes):
    b, l = batch.shape
    out = np.zeros((b, l, num_classes))
    for i in range(b):
        seq = batch[i, :]
        out[i, :, :] = keras.utils.to_categorical(seq, num_classes=num_classes)
    return out

def rx_data(words):
    chars = sorted(set(''.join(words)))
    chars = ['START', 'END'] + chars
    char_idx = { ch:i for i,ch in enumerate(chars) }
    idx_char = { i:ch for i,ch in enumerate(chars) }
    print(len(chars))
    x = ([[char_idx[c] for c in w] for w in words])
    max_len = max([len(s) for s in x])
    print(max_len)
    x = sequence.pad_sequences(x, max_len, padding='post', value=1)
    n = x.shape[0]
    x_in = np.concatenate([np.zeros([n, 1]), x[:, :-1]], axis=1)
    x_out = x
    assert x_in.shape == x_out.shape
    x_in = to_categorical(x_in, len(chars))
    x_out = to_categorical(x_out, len(chars))
    return idx_char, max_len, chars, x_in, x_out

idx_char, max_len, chars, x_in, x_out = rx_data(drugs)
num_chars = len(chars)

40
68


In [140]:
hidden_size = 10
ins = Input(shape=[None, num_chars])
lstm = LSTM(10, return_sequences=True, return_state=True)
hs, _, _ = lstm(ins)
dense = Dense(num_chars, activation='softmax')
outs = dense(hs)
train_model = Model(inputs=ins, outputs=outs)

inf_model_h = Input(shape=(hidden_size,))
inf_model_c = Input(shape=(hidden_size,))
hs, h, c = lstm(ins, initial_state=[inf_model_h, inf_model_c])
outs = dense(hs) 
inf_model = Model(inputs=[ins, inf_model_h, inf_model_c],
                  outputs=[outs, h, c])


def generate():
    h0 = np.zeros([1, hidden_size])
    c0 = np.zeros([1, hidden_size])
    x0 = np.zeros([1, 1, num_chars])
    x0[0, 0, 0] = 1
    word = ''
    stop_condition = False
    while not stop_condition:
        out, h, c = inf_model.predict([x0, h0, c0])
        #idx = np.argmax(out[0, -1, :])
        idx = np.random.choice(range(num_chars), p=(out[0, -1, :]))
        char = idx_char[idx]        
        if char == 'END' or len(word) > max_len:
            stop_condition = True
            break
        
        word += char
        x0 = np.zeros([1, 1, num_chars])
        x0[0,0,idx] = 1
        h0 = h
        c0 = c
        
    print(word)
    return word


def generate_n(n):
    for _ in range(n):
        generate()
        
generate_n(5)

GRMSTARTÉ/-SI(IJ5'X0(PYMAS(QWI
OSTARTX
' A5ZL5KIFDBA1STARTO X5XYY1YG,Q-I5J3XUPJSTARTGD-FQNIV-CLW/FQR)WCRL1J
RCXYVSTART,0EJJC5UEJLQBPU3UWXN1 HV-TZK(S)OSTARTLSTARTNGG,V5EA-HOIHZDD
50I1ZTT,E


In [171]:
generate_n(5)

TAZRINEUM
ENSOSS
VALLOTON CALBOGLULE
CROASGIG, (HLYLORHER
ABETOSE HACLICA


In [142]:
# Train the model
opt = keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9)
train_model.compile(opt, 'categorical_crossentropy')
generate_stuff = keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: generate_n(5))
train_model.fit(x_in,
                x_out,
                #validation_split=1/10,
                epochs=20,
                batch_size=1,
                callbacks=[generate_stuff])

Epoch 1/20
ALOTATIE
IGARKITI  DULIU
PUEFEVLINE
LACPODIMINEMATE
ROCAMEIME
Epoch 2/20
CHZLATIBE BLOL
APIFADILASENEHDERDARIONAMIASAIEMATRYDESACIROBANM POVTADEMETTALIN OLRI
MURIUCTIM(ORAN ENYYDOMOTOMEMHIDE
LOALMUGCARLORUREDE
VOMILINABAMONE ,INONORATITATIE
Epoch 3/20
CALRAYCROROLDAN )ACTOL
BIBUUCATEANE
EZEPOSNIUS
MECREXANS
MUPEHUCALM
Epoch 4/20
ZESOLMOBUET)
FELHAZINE PAVATE
CIXARIN PHOTOREST ALIPIN
VONTRAMISE SOMONETHINUTA
EPGENMUCUT ACSM
Epoch 5/20
IFUMOCRENG
OUXPEDINE
OLONIDIDIN
HIGRORONE
KINNINEVAT (ADIMISUBTODE
Epoch 6/20
TODAZESTE PYRMIDE
TRAPEVOLIS
TRIDINBOCTATE
PASCRIMETIN
APAPIL
Epoch 7/20
CALCSIC ACTAM
BEZROLAZTRIDE
BINENINFGLYNIUTEGIDED
CHOLISOAL DING
PARE
Epoch 8/20
TREASTAZIME
PTIMONZOXEDPHYNELIFE
UVISACINT DOPROE
SITIPRAM EXICHRACA SACMLETENTAMILETTROSINE INEC ROMICACIPE
MYVECHCHLODINE
Epoch 9/20

KeyboardInterrupt: 

In [3]:
#train_model.save('.models/train_model/model.h5')
#inf_model.save('.models/inf_model/model.h5')

In [157]:
generate_n(5)

SOISAMINE BODRODE
TATININAZ FLORINE HAALB
ADIPTIDE
NGRLOTERER TRIONI-IDE
FROTINUZONINOPE
