In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModel, TFBertModel
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
import IPython

In [None]:
# GET the data
## Memuat Data

dataset = pd.read_csv("/content/Capres2014-1.1.csv", usecols=["Isi_Tweet", "Sentimen"])

In [None]:
# EXPLORE the data
## Menampilkan lima data terakhir
dataset.tail()

Unnamed: 0,Isi_Tweet,Sentimen
1880,Jangan kabur dari tanggung jawab dengan kemasa...,1
1881,@echo_hadiwibowo mana berani pmrntah..m G da y...,1
1882,@IndonesiaCapres ANAK MEDAN DUKUNG CAPRES JK D...,1
1883,"RT @idoidonajib: Jelek! ""@fallenokta: Apa komp...",1
1884,"Langsung deh ngadu ke capres Hatta Rajasa, bia...",1


In [None]:
## Mengecek Imbalanced Data
dataset['Sentimen'].value_counts()

 1    1117
-1     768
Name: Sentimen, dtype: int64

In [None]:
## Mengganti {-1,1} menjadi {0,1}
dataset['Sentimen'] = dataset['Sentimen'].replace(-1,0)

dataset['Sentimen'].value_counts()

1    1117
0     768
Name: Sentimen, dtype: int64

In [None]:
# MODEL the data
## Pra Pengolahan - Cleaning

def clean_text(tweet):
    
    # Convert to lower case
    tweet = tweet.lower()
    # remove unicode characters
    tweet = tweet.encode('ascii', 'ignore').decode()
    # Clean www.* or https?://*
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    # Clean @username
    tweet = re.sub(r'@[^\s]+','',tweet)
    #Remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    return tweet

dataset["Isi_Tweet"] = dataset['Isi_Tweet'].map(lambda x: clean_text(x))
dataset = dataset[dataset['Isi_Tweet'].apply(lambda x: len(x.split()) >=1)]
dataset.shape

(1885, 2)

In [None]:
## Pra Pengolahan - Splitting

train_data, test_data, train_labels, test_labels = train_test_split(
    dataset['Isi_Tweet'], dataset['Sentimen'], test_size=0.2, random_state=42)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.26.1


In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
def tokenisasi(teks):
    encode_dict = bert_tokenizer(teks,
                                   add_special_tokens = True,
                                   max_length = 128, 
                                   padding = 'max_length',
                                   truncation = True,
                                   return_attention_mask = True,
                                   return_tensors = 'tf',)

    tokenID = encode_dict['input_ids']
    attention_mask = encode_dict['attention_mask']

    return tokenID, attention_mask

def create_input(data):
    tokenID, input_mask = [], []
    for teks in data:
        token, mask = tokenisasi(teks)
        tokenID.append(token)
        input_mask.append(mask)
    
    return [np.asarray(tokenID, dtype=np.int32).reshape(-1, 128), 
            np.asarray(input_mask, dtype=np.int32).reshape(-1, 128)]

bert_model = TFAutoModel.from_pretrained("indobenchmark/indobert-base-p2", trainable=False)
def bert(hp):
    
    #Input layer
    input_token = keras.layers.Input(shape=(128,), dtype=np.int32,
                                        name="input_token")
    input_mask = keras.layers.Input(shape=(128,), dtype=np.int32,
                                   name="input_mask")

    #hidden layer
    hid_layer = hp.Choice('hid_layer_type', ['lstm', 'dense', 'both', 'none'])

    # hidden layer dense (bert embedding -> dense -> output)
    if hid_layer == 'dense':
        bert_embedding = bert_model([input_token, input_mask])[1]
        n_hid_layer = hp.Choice('n_layer', [1, 2])

        if n_hid_layer == 1:
            hidden_0 = keras.layers.Dense(hp.Int('neuron_0', 32, 128), activation='relu',
                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_hidden_0', .001, .1)))(bert_embedding) 
            output = keras.layers.Dense(1, activation='sigmoid',
                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_output', .001, .1)))(hidden_0)  
        else:
            hidden_0 = keras.layers.Dense(hp.Int('neuron_0', 32, 128), activation='relu',
                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_hidden_0', .001, .1)))(bert_embedding)
            hidden_1 = keras.layers.Dense(hp.Int('neuron_1', 32, 128), activation='relu',
                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_hidden_1', .001, .1)))(hidden_0) 
            output = keras.layers.Dense(1, activation='sigmoid',
                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_output', .001, .1)))(hidden_1)
    
    # hidden layer lstm (bert embedding -> lstm -> output)
    elif hid_layer == 'lstm':
        bert_embedding = bert_model([input_token, input_mask])[0]
        bidirectional = hp.Choice('lstm_bidirectional', [True, False])

        if bidirectional:
            lstm = keras.layers.Bidirectional(CuDNNLSTM(hp.Int('lstm_neuron', 32, 128),
                                                        kernel_regularizer=keras.regularizers.l2(hp.Float('l2_lstm', .001, .1))))(bert_embedding)
        else:
            lstm = CuDNNLSTM(hp.Int('lstm_neuron', 32, 128),
                             kernel_regularizer=keras.regularizers.l2(hp.Float('l2_lstm', .001, .1)))(bert_embedding)
        output = keras.layers.Dense(1, activation='sigmoid',
                                    kernel_regularizer=keras.regularizers.l2(hp.Float('l2_output', .001, .1)))(lstm)

    # hidden layer lstm + dense (bert embedding -> lstm -> dense -> output)
    elif hid_layer == 'both':
        bert_embedding = bert_model([input_token, input_mask])[0]
        lstm = keras.layers.Bidirectional(CuDNNLSTM(hp.Int('lstm_neuron', 32, 128),
                                                    kernel_regularizer=keras.regularizers.l2(hp.Float('l2_lstm', .001, .1))))(bert_embedding)
        hidden_0 = keras.layers.Dense(hp.Int('neuron_0', 32, 128), activation='relu',
                                      kernel_regularizer=keras.regularizers.l2(hp.Float('l2_hidden_0', .001, .1)))(lstm)
        output = keras.layers.Dense(1, activation='sigmoid',
                                    kernel_regularizer=keras.regularizers.l2(hp.Float('l2_output', .001, .1)))(hidden_0)

    # no hiddden layer (bert embedding -> output)
    else:
        bert_embedding = bert_model([input_token, input_mask])[1]
        output = keras.layers.Dense(1, activation='sigmoid',
                                    kernel_regularizer=keras.regularizers.l2(hp.Float('l2_output', .001, .1)))(bert_embedding)

    
    model = keras.models.Model(inputs=[input_token, input_mask], outputs=output)

    model.compile(optimizer=keras.optimizers.Adam(hp.Float('lr', 1e-3, 5e-2)), loss='binary_crossentropy', metrics=['accuracy'])
   
    return model

class ClearTrainingOutput(keras.callbacks.Callback):
    def on_train_end(*args, **kwargs):
        IPython.display.clear_output(wait = True)

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                           restore_best_weights=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

Some layers from the model checkpoint at indobenchmark/indobert-base-p2 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.0-py3-none-any.whl (167 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.3/167.3 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: kt-legacy, jedi, keras-tuner
Successfully installed jedi-0.18.2 keras-tuner-1.3.0 kt-legacy-1.0.4


In [None]:
from keras_tuner.tuners import BayesianOptimization

bert_train_data = create_input(train_data)
bert_test_data = create_input(test_data)

tuner = BayesianOptimization(bert,
                             objective = 'val_accuracy', 
                             max_trials = 25,
                             directory = 'tune',
                             project_name = 'Sentiment-BERT',
                             overwrite = True)

tuner.search(bert_train_data, train_labels,
             batch_size=256, epochs=50,
             validation_data=(bert_test_data, test_labels),
             callbacks=[early_stop, ClearTrainingOutput()])

Trial 25 Complete [00h 07m 09s]
val_accuracy: 0.8275862336158752

Best val_accuracy So Far: 0.8514589071273804
Total elapsed time: 03h 55m 51s


In [None]:
tuner.results_summary()

Results summary
Results in tune/Sentiment-BERT
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x7fb6f91da160>
Trial summary
Hyperparameters:
hid_layer_type: lstm
lstm_bidirectional: 1
lstm_neuron: 127
l2_lstm: 0.0032251716258970705
l2_output: 0.06278849212607956
lr: 0.005534638759858194
Score: 0.8514589071273804
Trial summary
Hyperparameters:
hid_layer_type: lstm
lstm_bidirectional: 1
lstm_neuron: 50
l2_lstm: 0.021584928051188775
l2_output: 0.09196880065556831
lr: 0.011835662468733413
neuron_0: 77
l2_hidden_0: 0.045830864494719986
Score: 0.8381962776184082
Trial summary
Hyperparameters:
hid_layer_type: both
lstm_bidirectional: 1
lstm_neuron: 44
l2_lstm: 0.06044418802496354
l2_output: 0.015050653620328022
lr: 0.007628435985180532
neuron_0: 103
l2_hidden_0: 0.005753463232814367
n_layer: 2
Score: 0.8328912258148193
Trial summary
Hyperparameters:
hid_layer_type: lstm
lstm_bidirectional: 1
lstm_neuron: 127
l2_lstm: 0.01795837346561108
l2_output: 0.06991510565543062

In [None]:
# Train with higher epochs
input_token = keras.layers.Input(shape=(128,), dtype=np.int32,
                                 name="input_token")
input_mask = keras.layers.Input(shape=(128,), dtype=np.int32,
                                name="input_mask")

bert_model = TFAutoModel.from_pretrained("indobenchmark/indobert-base-p2", trainable=False)
bert_embedding = bert_model([input_token, input_mask])[0]

lstm = keras.layers.Bidirectional(CuDNNLSTM(127, kernel_regularizer=keras.regularizers.l2(0.00322)))(bert_embedding)

output = keras.layers.Dense(1, activation='sigmoid', 
                            kernel_regularizer=keras.regularizers.l2(0.06278))(lstm)

model = keras.models.Model(inputs=[input_token, input_mask], outputs=output)

model.compile(optimizer=keras.optimizers.Adam(0.00553),
               loss='binary_crossentropy',
               metrics=['accuracy'])

model.summary()

Some layers from the model checkpoint at indobenchmark/indobert-base-p2 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_7 (TFBertModel)  TFBaseModelOutputWi  124441344   ['input_token[0][0]',            
                                thPoolingAndCrossAt               'input_mask[0][0]']             
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [None]:
bert_train_data = create_input(train_data)
bert_test_data = create_input(test_data)

early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20,
                                           restore_best_weights=True)

model.fit(bert_train_data, train_labels, epochs=100, batch_size=256,
           validation_data=(bert_test_data, test_labels),
           callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


<keras.callbacks.History at 0x7fc214c237c0>

In [None]:
model.save('/content/drive/MyDrive/Colab Notebooks/model_mlp_sentiment_8806.h5')

In [None]:
model = keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/model_mlp_sentiment_8806.h5',
                                 custom_objects={"TFBertModel": TFBertModel})

model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_10 (TFBertModel)  TFBaseModelOutputWi  124441344  ['input_token[0][0]',            
                                thPoolingAndCrossAt               'input_mask[0][0]']             
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [None]:
test_loss, test_acc = model.evaluate(bert_test_data, test_labels)
print('Test accuracy:', test_acc)

Test accuracy: 0.8806366324424744
