<a href="https://colab.research.google.com/github/danielsyahputra13/ml_capstone/blob/master/notebooks/MaxPooling1D_Modelling_Mardi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
# Download the Punkt tokenizer data; nltk.word_tokenize is called further down
# to count the words in each cleaned review.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Colab-only: mount Google Drive so the dataset and model paths under
# /content/drive used by the rest of the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test CSV files from the mounted shared drive.
data, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/Shareddrives/ML-Capstone/dataset/train.csv",
        "/content/drive/Shareddrives/ML-Capstone/dataset/test.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,clean_review
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,effect combin bystol mg fish oil
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,son halfway fourth week intuniv concern began ...
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,oral contracept pill cycl happi light period m...
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,time form birth control m glad went patch mont...
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,suboxon complet turn life feel healthier m exc...


In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160383 entries, 0 to 160382
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   drugName      160383 non-null  object 
 1   condition     160383 non-null  object 
 2   review        160383 non-null  object 
 3   rating        160383 non-null  float64
 4   date          160383 non-null  object 
 5   usefulCount   160383 non-null  int64  
 6   clean_review  160383 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 8.6+ MB


In [None]:
# Count the NLTK tokens in every cleaned review.
data['num_of_words'] = data['clean_review'].map(nltk.word_tokenize).map(len)

In [None]:
# Summary statistics of the per-review token counts.
data['num_of_words'].describe()

count    160383.000000
mean         36.348497
std          19.627082
min           1.000000
25%          21.000000
50%          36.000000
75%          53.000000
max         849.000000
Name: num_of_words, dtype: float64

In [None]:
# Remove rows labelled with either of the two junk condition values.
data = data.loc[data['condition'].ne('me')]
data = data.loc[data['condition'].ne('mist (')]

In [None]:
# Remove rows whose condition text contains '</span>'.
data = data.loc[
    ~data['condition'].str.contains('</span>')
]

In [None]:
# Apply the same condition clean-up to the test frame as to the training frame:
# drop the two junk labels, then drop anything containing '</span>'.
test = test.loc[test['condition'].ne('me')]
test = test.loc[test['condition'].ne('mist (')]
test = test.loc[
    ~test['condition'].str.contains('</span>')
]

In [None]:
# Keep only test rows whose condition also occurs in the training frame, so the
# label encoder fitted on the training labels can transform every test label.
test = test.loc[test['condition'].isin(data['condition'].unique())]

# Model


In [None]:
# Text preprocessing / embedding settings shared by the cells below.
vocab_size = 5000        # passed to Tokenizer(num_words=...) and the Embedding layer
embedding_dim = 64       # width of each embedding vector
max_length = 250         # every sequence is padded / truncated to this length
trunc_type = "post"      # pad_sequences `truncating` argument
padding_type = "post"    # pad_sequences `padding` argument
oov_tok = "<OOV>"        # token the Tokenizer uses for out-of-vocabulary words
training_portion = 0.8   # not referenced by any active cell; 0.9 was the alternative tried

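Split data train vs val (the split is currently disabled: the full training set is used for fitting and the test set serves as validation data)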

In [None]:
# No train/validation split happens here: the sklearn train_test_split call
# (test_size=0.2) that used to be in this cell is disabled, so the whole cleaned
# training frame is used for fitting. Both frames are copied so later cells
# work on independent objects.
train, test = data.copy(), test.copy()

In [None]:
# Row count of the training frame after cleaning.
print("Train size:", train.shape[0])

Train size: 159481


In [None]:
# Row count of the test frame after cleaning.
print("Testing size:", test.shape[0])

Testing size: 53156


In [None]:
# Plain Python lists of the cleaned review texts, as fed to the Tokenizer.
train_reviews = train['clean_review'].to_list()
test_reviews = test['clean_review'].to_list()

## Transformasi data train

In [None]:
# Fit the vocabulary on the training reviews only; the same tokenizer is reused
# for the test reviews further down.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_reviews)
word_index = tokenizer.word_index

# Show the first ten (word, index) entries of the vocabulary.
{word: idx for word, idx in list(word_index.items())[:10]}

{'<OOV>': 1,
 'day': 3,
 'effect': 6,
 'm': 10,
 'month': 4,
 'start': 9,
 't': 2,
 'week': 8,
 'work': 7,
 'year': 5}

In [None]:
# Convert each training review into a list of integer word indices using the
# fitted tokenizer, then print one sample sequence as a sanity check.
train_sequences = tokenizer.texts_to_sequences(train_reviews)
print(train_sequences[10])

[23, 8, 9, 17, 7, 125, 17, 362, 17, 60, 79, 49, 1496, 110, 8, 89, 85, 79, 889, 365, 525, 1290, 73, 68, 26, 179, 26, 165, 6, 63, 22, 6, 974, 1708, 124, 966, 645, 124, 146, 578, 1809, 14, 3758, 10, 122, 960, 44, 1817, 387, 891, 212, 46, 70]


In [None]:
# Pad / truncate every training sequence to max_length.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

# Raw length followed by padded length for sample reviews 0, 1 and 10,
# one value per line.
print(
    *(
        len(rows[sample_idx])
        for sample_idx in (0, 1, 10)
        for rows in (train_sequences, train_padded)
    ),
    sep="\n",
)

6
250
52
250
53
250


## Transformasi data validation

In [None]:
# Encode and pad the test reviews with the tokenizer fitted on the training
# reviews and the same padding settings.
test_sequences = tokenizer.texts_to_sequences(test_reviews)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

# Number of encoded reviews, then the shape of the padded matrix.
print(len(test_sequences), test_padded.shape, sep="\n")

53156
(53156, 250)


## Extract Label

In [None]:
from sklearn.preprocessing import LabelEncoder

# Maps condition strings to integer class ids; fitted on the training labels
# below and reused to transform the test labels.
encoder = LabelEncoder()

In [None]:
# Condition strings of the training rows as a NumPy array.
label_train = train['condition'].to_numpy()
label_train

array(['Left Ventricular Dysfunction', 'ADHD', 'Birth Control', ...,
       'Rheumatoid Arthritis', 'Underactive Thyroid',
       'Constipation, Chronic'], dtype=object)

In [None]:
# Number of training labels.
print(label_train.shape[0])

159481


In [None]:
# Fit the encoder on the training conditions and convert them to integer ids.
enc_label_train = encoder.fit_transform(label_train)
enc_label_train

array([393,   0,  92, ..., 628, 738, 160])

In [None]:
# Number of distinct conditions in the training frame, i.e. the number of
# classes the encoder was fitted on.
train['condition'].nunique()

808

In [None]:
# Condition strings of the test rows as a NumPy array.
label_test = test['condition'].to_numpy()
label_test

array(['Depression', "Crohn's Disease, Maintenance",
       'Urinary Tract Infection', ..., 'Birth Control', 'Pain',
       'Sciatica'], dtype=object)

In [None]:
# Encode the test conditions with the encoder fitted on the training labels
# (no refit), so both splits share one id mapping.
enc_label_test = encoder.transform(label_test)
enc_label_test

array([189, 169, 744, ...,  92, 521, 642])

In [None]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(857, activation='softmax')
# ])
# model.summary()

In [None]:
# Distinct class ids present in the training labels, then in the test labels.
print(len(set(enc_label_train)), len(set(enc_label_test)), sep="\n")


808
637


In [None]:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# num_epochs = 18
# history = model.fit(train_padded, 
#                     enc_label_train, 
#                     epochs=num_epochs,
#                     validation_data=(test_padded, enc_label_test), 
#                     verbose=1, 
#                     batch_size=256)

Invalid `condition` values found in the data (to be removed or corrected):

- `mist (`
- any value containing `</span>`
- `me`
- `min / rosiglitazone)` -> `Rosiglitazon`

## Using MaxPooling1D

In [None]:
# 1D-CNN text classifier: embedding -> Conv1D -> global max pooling -> dense head.
#
# The softmax width is taken from the fitted LabelEncoder instead of being
# hard-coded. The previous literal (857) did not match the number of classes
# the encoder was fitted on, leaving output units that could never be a target.
num_classes = len(encoder.classes_)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu'),
    # Collapse the sequence axis by taking the max of each filter's activations.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 conv1d (Conv1D)             (None, None, 256)         82176     
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 857)               110553    
                                                                 
Total params: 545,625
Trainable params: 545,625
Non-trainable params: 0
__________________________________________________

In [None]:
# Train the CNN. Labels are integer class ids, hence the sparse categorical
# cross-entropy loss.
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are measured on the test set — confirm this is intended.
num_epochs = 15
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    x=train_padded,
    y=enc_label_train,
    batch_size=256,
    epochs=num_epochs,
    verbose=1,
    validation_data=(test_padded, enc_label_test),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
116/623 [====>.........................] - ETA: 6:02 - loss: 0.3626 - accuracy: 0.9143

KeyboardInterrupt: ignored

In [None]:
# Persist the current model to the shared drive as an .h5 file.
model.save("/content/drive/Shareddrives/ML-Capstone/model/model_mardi1.h5")

## Using LSTM

In [None]:


# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim),
#     tf.keras.layers.LSTM(embedding_dim),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(808, activation='softmax')
# ])

# model.summary()

In [None]:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# num_epochs = 20
# history = model.fit(train_padded, 
#                     enc_label_train, 
#                     epochs=num_epochs,
#                     validation_data=(test_padded, enc_label_test), 
#                     verbose=1, 
#                     batch_size=256)