In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [4]:
# Characters treated as separators: each match is replaced by a single space.
# Raw string literal: `\[`, `\]` and `\|` are regex escapes, not Python string
# escapes, and a non-raw literal triggers an invalid-escape-sequence warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase a-z letter, a space or one of `#.+_`
# is deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #.+_]')
# English stop-word list from the NLTK corpus, as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw string before tokenization.

    Steps, in order:
      1. lowercase ``text``;
      2. replace every match of ``REPLACE_BY_SPACE_RE`` with a space;
      3. delete every match of ``BAD_SYMBOLS_RE``;
      4. drop whitespace-separated tokens that are in ``STOPWORDS``.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    separated = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', separated)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [51]:
# Load the dataset from a CSV in the working directory; later cells read its
# `text` and `code` columns.
df = pd.read_csv("asanew.csv")
# NOTE(review): the training cells further down call np.array(labels), but in the
# visible cells the only assignment to `labels` is the commented-out line below.
# Presumably a fresh Restart & Run All fails with a NameError there -- TODO confirm
# how `labels` was actually built (and encoded) before relying on those results.
# Commented-out scratch exploration, kept as found:
#df.groupby('code').nunique()
#labels = df['code']
#text = df['text']
#labels.nunique()
#text
#text1 = text.apply(clean_text)
#text1

In [52]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,text,code
0,1 by mouth daily,1QD
1,1 by mouth every day,1QD
2,1 daily,1QD
3,1 (one) Tablet by mouth daily,1QD
4,"1 (one) Tablet, Oral, daily",1QD


In [53]:
# Vocabulary cap shared by the tokenizer, the embedding layers and the
# pretrained embedding matrix built later in the notebook.
vocabulary_size = 10000

# Fit the word index on the raw `text` column.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['text'])

# Encode each text as a list of word indices, then bring every sequence to a
# fixed length of 50 so it matches the models' input_length.
sequences = tokenizer.texts_to_sequences(df['text'])
data = pad_sequences(sequences=sequences, maxlen=50)

In [57]:
# Plain LSTM classifier over the padded index sequences (length 50).
model_lstm = Sequential()
# The embedding's input dimension follows vocabulary_size instead of a hard-coded
# 20000: the tokenizer is capped at vocabulary_size words, and the other models in
# this notebook size their embeddings the same way.
model_lstm.add(Embedding(vocabulary_size, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit + binary cross-entropy: this head expects 0/1 targets.
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [58]:
# Inspect the `code` column.
# NOTE(review): the recorded output shows more than two distinct codes (e.g. 1QD,
# UAD, 1QD30D), while every model here ends in a single sigmoid unit trained with
# binary_crossentropy -- confirm how these codes map to the training targets.
df['code']

0       1QD                 
1       1QD                 
2       1QD                 
3       1QD                 
4       1QD                 
5       1QD                 
6       1QD                 
7       1QD                 
8       1QD                 
9       1QD                 
10      1QD                 
11      1QD                 
12      1QD                 
13      1QD                 
14      1QD                 
15      1QD                 
16      1QD                 
17      1QD                 
18      1QD                 
19      1QD                 
20      1QD                 
21      1QD                 
22      1QD                 
23      1QD                 
24      1QD                 
25      1QD                 
26      1QD                 
27      1QD                 
28      1QD                 
29      1QD                 
                ...         
3970    UAD                 
3971    1QD30D              
3972    1QD                 
3973    1QD   

In [59]:
# Train the LSTM model for 3 epochs with 40% of the samples held out for validation.
# NOTE(review): `labels` is not assigned in any visible cell (its only assignment is
# commented out in the loading cell), and the recorded output reports
# 2400 + 1600 = 4000 samples while the displayed df['code'] ends at index 3973.
# This output presumably comes from stale kernel state -- TODO confirm and re-run.
model_lstm.fit(
    x=data,
    y=np.array(labels),
    validation_split=0.4,
    epochs=3,
)

Train on 2400 samples, validate on 1600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d9a8491e10>

In [60]:
def create_conv_model():
    """Build and compile the Conv1D + LSTM classifier.

    Layer stack, in order: a trainable 100-dimensional embedding over
    ``vocabulary_size`` indices (input length 50), dropout of 0.2, a 1-D
    convolution with 64 filters of width 5 and ReLU activation, max pooling
    with pool size 4, an LSTM with 100 units, and a single sigmoid output unit.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    layer_stack = [
        Embedding(vocabulary_size, 100, input_length=50),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [61]:
# Build a fresh Conv1D + LSTM model and train it for 3 epochs with 40% of the
# samples held out for validation.
# NOTE(review): `labels` is not assigned in any visible cell -- TODO confirm where
# it comes from before trusting the recorded output.
model_conv = create_conv_model()
model_conv.fit(
    x=data,
    y=np.array(labels),
    validation_split=0.4,
    epochs=3,
)

Train on 2400 samples, validate on 1600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d9a8698d68>

In [63]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
embeddings_index = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # First whitespace-separated field is the word; the rest are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [64]:
# Row i holds the pretrained 100-d vector for the word with tokenizer index i;
# rows for words missing from embeddings_index stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # Skip out-of-range indices instead of `break`-ing: stopping at the first
        # one would silently drop in-range words whenever word_index is not
        # iterated in ascending index order.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [65]:
# Conv1D + LSTM stack whose embedding layer is initialised from embedding_matrix
# and frozen (trainable=False), so only the layers after it are trained.
model_glove = Sequential([
    Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [66]:
# Train the GloVe-initialised model for 3 epochs with 40% of the samples held out
# for validation.
# NOTE(review): `labels` is not assigned in any visible cell -- TODO confirm where
# it comes from before trusting the recorded output.
model_glove.fit(
    x=data,
    y=np.array(labels),
    validation_split=0.4,
    epochs=3,
)

Train on 2400 samples, validate on 1600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d9a86647b8>