In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# load data

In [None]:
import pandas as pd

df = pd.read_excel('/content/drive/MyDrive/dataset_no_stemming_no_NA.xlsx')
df

Unnamed: 0,label,string
0,Sumber Daya Alam,kunjung prabowo resmi serah proyek bantu air b...
1,Politik,anies tepuk tangan riah rektor wajib mata kuli...
2,Demografi,sih dukung dukung ridwan kamil skema balik kal...
3,Politik,anies sikap kritis kerja prabowo anggap sopan ...
4,Politik,anies baswedan harap asn tni polri pegang sump...
...,...,...
4569,Politik,debat kemarin pas prabowo diam keluarga laku a...
4570,Politik,masyarakat prabowo gibran milik visi jalan asp...
4571,Ekonomi,both are irrational but irrational tbh but nev...
4572,Pertahanan dan Keamanan,look at that ganjar kecimpung legislatif eksek...


In [None]:
text = df.string
label = df.label

# encode label

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

label_encoder = LabelEncoder()
# mengubah column label yang sudaah diencoded (0 1 2 3 ...) menjadi array
encoded_labels = np.array(label_encoder.fit_transform(label))

In [None]:
encoded_labels

array([7, 5, 0, ..., 1, 4, 7])

# load fast text model

In [None]:
from gensim.models import FastText

fasttext_model = FastText.load('/content/drive/MyDrive/fast_text').wv

In [None]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

def norm_sentence_vector(sentence, w2v_model):
    list_of_words = [word for word in word_tokenize(sentence)]
    if len(list_of_words) == 0:
        return np.zeros(w2v_model.vector_size)
    vectors = [w2v_model[word] for word in list_of_words]
    vectors = [vec / np.linalg.norm(vec) for vec in vectors if np.linalg.norm(vec) > 0]
    sentence_vectors = np.mean(vectors, axis=0)
    return sentence_vectors

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
vectors = [norm_sentence_vector(sentence, fasttext_model) for sentence in text]
vectors = np.array(vectors)
vectors

array([[-0.02507637,  0.04334563,  0.05712339, ...,  0.07183099,
         0.11890034,  0.02410801],
       [-0.02256739, -0.02723916,  0.01314892, ..., -0.02276447,
         0.04404201,  0.01067955],
       [ 0.03513144,  0.03355055, -0.01566434, ...,  0.00856539,
         0.00327201,  0.02849905],
       ...,
       [ 0.02666968, -0.04206425, -0.00456647, ...,  0.03685997,
         0.0463332 ,  0.00532845],
       [-0.00696157, -0.00982579, -0.02834507, ...,  0.0186976 ,
         0.00264118,  0.02309118],
       [ 0.05915273, -0.02495828, -0.01727284, ...,  0.02846403,
         0.03599015,  0.00625555]], dtype=float32)

# oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)
data_resampled, labels_resampled = oversampler.fit_resample(vectors, encoded_labels)

In [None]:
from tensorflow.keras.utils import to_categorical

# jumlah kelas
num_classes = len(label_encoder.classes_)

# convert oversampled labels to categorical (one-hot encoded)
categorical_labels_resampled = to_categorical(labels_resampled, num_classes)

In [None]:
categorical_labels_resampled

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

# split train test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_resampled, categorical_labels_resampled, test_size=0.1, random_state=42)

In [None]:
y_train

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
X_train

array([[ 0.0148256 ,  0.01823165,  0.02949005, ..., -0.01721282,
         0.00295838, -0.01668224],
       [ 0.05834955,  0.02690668,  0.01819873, ...,  0.01765052,
        -0.01533098, -0.00969144],
       [ 0.00380409, -0.00991156, -0.00049538, ...,  0.0157887 ,
         0.02287425,  0.00825186],
       ...,
       [-0.01536444,  0.01259366, -0.00467653, ..., -0.01527123,
        -0.00394591,  0.01479772],
       [ 0.02873073, -0.008885  , -0.00479184, ...,  0.04052548,
         0.01697306,  0.02949303],
       [ 0.02554273,  0.01689676,  0.04354575, ...,  0.02797827,
        -0.00683479,  0.02490154]], dtype=float32)

# build model

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [None]:
len(word_index)

6880

In [None]:
len(vectors)

4574

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(input_dim=len(vectors),
                    output_dim=128,
                    weights=[vectors],
                    trainable=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.01),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 128)         585472    
                                                                 
 lstm_4 (LSTM)               (None, None, 128)         131584    
                                                                 
 dropout_4 (Dropout)         (None, None, 128)         0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 8)                 520       
                                                                 
Total params: 766984 (2.93 MB)
Trainable params: 76698

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
history = model.fit(X_train, y_train, epochs=40, validation_split=0.1, callbacks=[early_stopping])

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Test Accuracy: 11.82%


# balanced accuracy

In [None]:
from tensorflow.keras import backend as K

def balanced_accuracy(y_true, y_pred):
    # convert hasil prediksi dari one hot encoded ke multi-class
    y_pred = K.argmax(y_pred, axis=-1)
    y_true = K.argmax(y_true, axis=-1)

    # true positive, true negative, false positive, false negative
    true_pos = K.sum(K.cast(y_true == y_pred, 'float32') * K.cast(y_true == 1, 'float32'))
    true_neg = K.sum(K.cast(y_true == y_pred, 'float32') * K.cast(y_true == 0, 'float32'))
    false_pos = K.sum(K.cast(y_true != y_pred, 'float32') * K.cast(y_true == 0, 'float32'))
    false_neg = K.sum(K.cast(y_true != y_pred, 'float32') * K.cast(y_true == 1, 'float32'))

    # recall
    recall_pos = true_pos / (true_pos + false_neg + K.epsilon())
    recall_neg = true_neg / (true_neg + false_pos + K.epsilon())

    balanced_acc = (recall_pos + recall_neg) / 2
    return balanced_acc

In [None]:
prediction = model.predict(X_test)

acc = balanced_accuracy(y_test, prediction)

print(f'Balanced accuracy: {acc * 100: .2f}%')

Balanced accuracy:  0.00%
