<a href="https://colab.research.google.com/github/danielsyahputra13/ml_capstone/blob/master/notebooks/GRU_Dense_with_count_100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Attach Google Drive inside the Colab VM; later cells read from paths below this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os

# Work from the shared-drive project root so the relative paths used below
# ("dataset/...", "model/...") resolve against it.
project_root = "/content/drive/Shareddrives/ML-Capstone/"
os.chdir(project_root)
os.getcwd()

'/content/drive/Shareddrives/ML-Capstone'

In [None]:
# Show which files are available in the project's dataset folder.
os.listdir(path="dataset")

['drugsComTrain_raw.tsv',
 'drugsComTest_raw.tsv',
 'train.csv',
 'test.csv',
 'data.csv',
 'train1.csv',
 'test1.csv',
 'train_cleaned.csv',
 'test_cleaned.csv',
 'drugsComTest_raw (1).gsheet',
 'drugsComTest_raw.gsheet',
 'condition.csv',
 'inquirerbasic.csv',
 'condition.xlsx',
 'description.pkl',
 'condition.gsheet']

In [None]:
# Load the two pre-cleaned splits (paths are relative to the project root).
train_path, test_path = "dataset/train_cleaned.csv", "dataset/test_cleaned.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# First three rows of the training split.
train.iloc[:3]

In [None]:
# First three rows of the test split.
test.iloc[:3]

In [None]:
# Stack both provided splits into one frame; a fresh stratified split is made
# further down with train_test_split.
# ignore_index=True gives the result a clean 0..n-1 index. Without it the row
# labels of `train` and `test` are both kept, so the combined frame carries
# duplicate index labels and any label-based lookup becomes ambiguous.
data = pd.concat([train, test], ignore_index=True)
data.info()

In [None]:
# Number of (non-null) reviews per condition, most frequent condition first.
count_df = (
    data.groupby('condition')['review']
    .count()
    .reset_index()
    .sort_values('review', ascending=False)
)
count_df.head()

In [None]:
# Keep only conditions that have at least 100 reviews.
has_enough_reviews = count_df['review'].ge(100)
target_conditions = count_df.loc[has_enough_reviews, 'condition'].values
target_conditions.shape

In [None]:
# Restrict the dataset to the selected conditions; .copy() detaches the result
# from the unfiltered frame.
in_target = data['condition'].isin(target_conditions)
data = data.loc[in_target].copy()

In [None]:
# Inspect the rows labelled with one example condition.
data.loc[data['condition'].eq('Depression')]

In [None]:
# pd.DataFrame(data.condition.unique(), columns=['Condition']).to_csv("dataset/condition.csv")

In [None]:
# How many distinct drugs remain after filtering (missing values are not counted).
data['drugName'].nunique(dropna=True)

In [None]:
# Features are the cleaned review texts; the target is the condition name.
X, y = data['clean_review'].values, data['condition'].values

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the label so both parts keep the same class
# proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2022,
)

In [None]:
# Preview the training reviews (the array repr abbreviates the middle with "...").
X_train

array(['agree rebecca far pregnancy hope continue pill month period mess ive week itll come week early couple day later moody its get well go switch birth control method relieved have problem',
       'start adipex day ago im year oldtall weighlb accord bmi doctor visit breaking point obesity struggle past get goal weight decide adipex shot obesity run family hope stop cycle havent effect far occasional dry mouth drink water good great experience far increase energy suppress appetite want exercisemin day eat well hopefully result end month want goal weight losslb month wish luck',
       'week contrave experience bad headache point take exedrine migraine couple hour dry mouth thats good thing hardly drink water drink lot notice craving go away sweet long go continue medication go call scale program free scale receive',
       ...,
       'advair help breathe line rate effect heartburn intolerable time find relieve discomfort dont appetite good thing maintain blood sugar level weight co

In [None]:
# Text-vectorisation and embedding settings used by the tokenizer,
# pad_sequences and the Embedding layer below.
vocab_size, embedding_dim, max_length = 5000, 100, 100
# Both truncation and padding are applied at the end of a sequence.
trunc_type = padding_type = 'post'
# Placeholder token passed to the Tokenizer as its out-of-vocabulary token.
oov_tok = '<OOV>'

In [None]:
# Report how many reviews ended up in each split.
for split_label, split_values in (("Train size:", X_train), ("Testing size:", X_test)):
    print(split_label, len(split_values))

Train size: 141344
Testing size: 60577


In [None]:
# Convert the arrays to plain Python lists of strings for the Keras tokenizer.
train_reviews, test_reviews = X_train.tolist(), X_test.tolist()

## Transform the Training Data

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_reviews)

# Full word -> index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Show the first ten entries of the mapping.
{word: idx for word, idx in list(word_index.items())[:10]}

{'<OOV>': 1,
 'day': 2,
 'effect': 10,
 'feel': 5,
 'month': 4,
 'start': 9,
 'take': 3,
 'week': 8,
 'work': 7,
 'year': 6}

In [None]:
# Map every training review to its list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(texts=train_reviews)

# Spot-check one encoded review.
print(train_sequences[10])

[16, 784, 28, 6, 4651, 2282, 1278, 24, 6, 128, 784, 35, 184, 909, 290, 99, 184, 660, 472, 844, 200, 146, 31, 54, 4, 2475, 895, 12, 21, 210, 3, 844, 99, 184, 10, 672, 4651, 1, 191, 660, 472, 485, 29, 239, 1633, 15, 23, 4, 1394, 742, 86, 415, 49, 172]


In [None]:
# Bring every training sequence to exactly max_length ids.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Compare the raw and the padded length for a few sample reviews.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))

31
100
61
100
54
100


## Convert Tokenizer into JSON

In [None]:
# import io
# import json

# tokenizer_json = tokenizer.to_json()
# with io.open('assets/tokenizer_with_counts_100.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(tokenizer_json, ensure_ascii=False))

## Transform the Test Data

In [None]:
# Encode the test reviews with the tokenizer that was fitted on the training
# reviews, then pad/truncate them exactly like the training sequences.
test_sequences = tokenizer.texts_to_sequences(texts=test_reviews)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Sanity check: number of sequences and shape of the padded matrix.
print(len(test_sequences))
print(test_padded.shape)

60577
(60577, 100)


## Extract Label

In [None]:
from sklearn.preprocessing import LabelEncoder

# Training labels as condition names.
label_train = y_train

# Learn the condition-name -> integer-id mapping from the training labels,
# then encode those same labels with it.
encoder = LabelEncoder()
enc_label_train = encoder.fit(label_train).transform(label_train)
enc_label_train

array([ 33, 181, 120, ...,  41, 120, 149])

In [None]:
# Encode the test labels with the mapping already fitted on the training
# labels (transform only, no refit).
label_test = y_test
enc_label_test = encoder.transform(y=label_test)
enc_label_test

array([181,  16,  15, ...,  71,   6,   1])

In [None]:
# np.save('assets/encoder_with_count_100.npy', encoder.classes_)

In [None]:
# Width of the softmax output layer = number of classes known to the fitted
# LabelEncoder. Reading it from the encoder (rather than recounting y_train)
# ties the layer size to the mapping that produced enc_label_train and
# enc_label_test, so the two cannot drift apart.
NUM_OUTPUTS = len(encoder.classes_)
NUM_OUTPUTS

185

In [None]:
# Review classifier: embedding -> bidirectional GRU -> GRU -> dense head with
# one softmax unit per condition.
model = tf.keras.Sequential([
    # mask_zero=True: id 0 is only ever produced by pad_sequences (tokenizer
    # ids start at 1), and the reviews are padded at the end. Masking lets the
    # recurrent layers skip those padding steps instead of processing them as
    # real tokens.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True),
    # return_sequences=True so the next GRU receives the whole sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, dropout = 0.1, recurrent_dropout = 0.5, return_sequences = True)),
    # The second GRU collapses the sequence to a single 64-dim vector.
    tf.keras.layers.GRU(64, activation = 'relu', dropout = 0.1, recurrent_dropout = 0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(NUM_OUTPUTS, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         500000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        63744     
 l)                                                              
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 185)               23865     
                                                                 
Total params: 633,177
Trainable params: 633,177
Non-trainable params: 0
__________________________________________________

In [None]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

num_epochs = 30

# NOTE(review): the held-out test split doubles as validation data here, so the
# reported validation metrics are not an independent estimate if they are used
# to choose epochs or hyperparameters.
history = model.fit(
    x=train_padded,
    y=enc_label_train,
    batch_size=512,
    epochs=num_epochs,
    verbose=1,
    validation_data=(test_padded, enc_label_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Persist the trained network as a single HDF5 file under the project's model folder.
model.save(filepath="model/GRU_dense_with_count_100.h5")