### Import Library

In [253]:
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM, Input, Dropout, Bidirectional
from tensorflow.keras.metrics import Precision, Recall 


### Data Loading

In [254]:
df = pd.read_csv("./output/data_clean.csv")

df.sample()

Unnamed: 0,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_final,polarity_score,polarity
3871,cepat mudah dan bermanfaat,5,cepat mudah dan bermanfaat,cepat mudah dan bermanfaat,cepat mudah dan bermanfaat,"['cepat', 'mudah', 'dan', 'bermanfaat']","['cepat', 'mudah', 'bermanfaat']",cepat mudah bermanfaat,12,positive


### Data Preprocessing

Label Encoder to convert categorical label to numeric

In [255]:
label_encoder = preprocessing.LabelEncoder() 
df['polarity_encode'] = label_encoder.fit_transform(df['polarity']) 

In [256]:
df.sample()

Unnamed: 0,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_final,polarity_score,polarity,polarity_encode
10340,Selalu upgrade tentang promosi atau info menar...,4,Selalu upgrade tentang promosi atau info menar...,selalu upgrade tentang promosi atau info menar...,selalu upgrade tentang promosi atau info menar...,"['selalu', 'upgrade', 'tentang', 'promosi', 'a...","['upgrade', 'promosi', 'info', 'menarik', 'apl...",upgrade promosi info menarik aplikasi mudah fr...,4,positive,2


Tokenization 

In [257]:
sentences = df['text_final'].astype(str).tolist()

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index 

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [258]:
with open('./assets/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [259]:
df.sample()

Unnamed: 0,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_final,polarity_score,polarity,polarity_encode
2688,mempermudah kita yang malas belanja keluar rum...,5,mempermudah kita yang malas belanja keluar rum...,mempermudah kita yang malas belanja keluar rum...,mempermudah kita yang malas belanja keluar rum...,"['mempermudah', 'kita', 'yang', 'malas', 'bela...","['mempermudah', 'malas', 'belanja', 'rumah', '...",mempermudah malas belanja rumah keren,1,positive,2


In [260]:
dataset = df.values

In [261]:
X_input = np.array(padded).astype('float32')
y_input = np.array(dataset[:,10]).astype('float32')

Feature Engineering: Oversampling

In [262]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_input, y_input)

print("Jumlah Data Setelah ROS:", len(X_resampled))

Jumlah Data Setelah ROS: 20118


In [263]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, stratify=y_resampled, test_size=0.3)

### Modelling

LSTM model

In [264]:
model = Sequential([
    Input(shape=(100,)),
    Embedding(input_dim=5000, output_dim=128),

    Bidirectional(LSTM(units=64, dropout=0.4, recurrent_dropout=0.2)),

    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

In [265]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

Callback checkpoint to save best model accuracy

In [266]:
checkpoint = ModelCheckpoint(
    './assets/best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

In [267]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    callbacks=[checkpoint, early_stop]
)

Epoch 1/20


[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 337ms/step - accuracy: 0.6524 - loss: 0.7680 - val_accuracy: 0.8644 - val_loss: 0.4300
Epoch 2/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 328ms/step - accuracy: 0.8910 - loss: 0.3301 - val_accuracy: 0.9088 - val_loss: 0.2677
Epoch 3/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 328ms/step - accuracy: 0.9389 - loss: 0.1891 - val_accuracy: 0.9414 - val_loss: 0.1888
Epoch 4/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 326ms/step - accuracy: 0.9604 - loss: 0.1293 - val_accuracy: 0.9471 - val_loss: 0.1691
Epoch 5/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 326ms/step - accuracy: 0.9698 - loss: 0.0991 - val_accuracy: 0.9368 - val_loss: 0.2162
Epoch 6/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 327ms/step - accuracy: 0.9791 - loss: 0.0700 - val_accuracy: 0.9489 - val_loss: 0.1988
Epoch 7/20
[1m89/89[0m [32m━━━

### Evaluation

Model evaluation

In [268]:
model.evaluate(X_test, y_test, batch_size=1)

[1m6036/6036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 16ms/step - accuracy: 0.9486 - loss: 0.1729


[0.1728813648223877, 0.9486414790153503]