In [None]:
import json
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime; the data
# files opened later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the dataset splits.
DATA_DIR = '/content/drive/MyDrive/capstone_data'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale.
with open(DATA_DIR + '/train.json', encoding='utf-8') as file:
    train_data = json.load(file)

with open(DATA_DIR + '/validation.json', encoding='utf-8') as file:
    validation_data = json.load(file)

In [None]:
def labelling(json_data):
    """Flatten news records into parallel keyword/theme/recommendation lists.

    Args:
        json_data: Mapping with a 'berita' entry holding an iterable of news
            records. Each record must provide 'keywords' (an iterable),
            'tema_berita' (the theme, used as a dict key) and
            'recommendation'.

    Returns:
        A tuple ``(keywords, theme, recom, theme_label)``. The first three are
        equal-length lists with one entry per keyword: the keyword itself, the
        theme of its record and the recommendation of its record.
        ``theme_label`` maps each theme to the recommendation of the first
        record that carries that theme.

    Raises:
        KeyError: If 'berita' or any of the per-record keys is missing.
    """
    keywords, theme, recom = [], [], []
    theme_label = {}

    for news in json_data['berita']:
        news_theme = news['tema_berita']
        news_recom = news['recommendation']

        # One row per keyword, each tagged with its record's theme/recommendation.
        for keyword in news['keywords']:
            keywords.append(keyword)
            theme.append(news_theme)
            recom.append(news_recom)

        # Test the theme as a whole. Looping over it walked a string theme
        # character by character, so the membership check never matched and
        # later records overwrote the stored recommendation.
        if news_theme not in theme_label:
            theme_label[news_theme] = news_recom

    return keywords, theme, recom, theme_label

In [None]:
# Flatten the training split into per-keyword lists plus its theme lookup.
(
    train_keywords,
    train_theme,
    train_recom,
    train_theme_label,
) = labelling(train_data)

# Same flattening for the validation split.
(
    val_keywords,
    val_theme,
    val_recom,
    val_theme_label,
) = labelling(validation_data)

In [None]:
# Collect the flattened training lists into one table, one row per keyword.
data = pd.DataFrame(
    dict(
        keywords=train_keywords,
        theme=train_theme,
        recommendation=train_recom,
    )
)

# Spot-check ten randomly drawn rows.
data.sample(10)

Unnamed: 0,keywords,theme,recommendation
57,polisi,polisi,"[Kejahatan, Polisi]"
122,rumah sakit,rumah sakit,"[Rumah Sakit, Penyakit, Kesehatan]"
8,bocor,kebakaran,"[Kebakaran, Pemadam Kebakaran]"
42,pertamina,kebakaran,"[Kebakaran, Pemadam Kebakaran]"
151,jantung,rumah sakit,"[Rumah Sakit, Penyakit, Kesehatan]"
100,keadilan,polisi,"[Kejahatan, Polisi]"
37,demo,kebakaran,"[Kebakaran, Pemadam Kebakaran]"
70,tindak pidana,polisi,"[Kejahatan, Polisi]"
87,keamanan,polisi,"[Kejahatan, Polisi]"
20,percikan,kebakaran,"[Kebakaran, Pemadam Kebakaran]"


In [None]:
# Build the vocabulary from the training keywords only. Refitting on the
# validation keywords leaks validation vocabulary into the model and can
# re-rank word_index, so the indices already stored in train_sequences would
# no longer agree with the tokenizer used for validation and inference.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_keywords)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_keywords)

# A single padding length, taken from the training data, is shared by every
# pad_sequences call so training, validation and inference inputs line up.
max_sequence_length = max(len(seq) for seq in train_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)

# Validation keywords are encoded with the training vocabulary; words that
# never appeared in training are dropped by texts_to_sequences.
val_sequences = tokenizer.texts_to_sequences(val_keywords)
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_sequence_length)

In [None]:
# Theme name -> class index used for the one-hot targets.
label_mapping = {'kebakaran': 0, 'polisi': 1, 'rumah sakit': 2}
# Size the one-hot vectors from the mapping itself. Counting the themes that
# happen to occur in the validation split undersizes the targets (and the
# softmax layer) whenever a theme is missing from that split.
num_classes = len(label_mapping)

X_train = train_padded_sequences
y_train = tf.keras.utils.to_categorical(
    [label_mapping[label] for label in train_theme],
    num_classes=num_classes,
)

X_test = val_padded_sequences
y_test = tf.keras.utils.to_categorical(
    [label_mapping[label] for label in val_theme],
    num_classes=num_classes,
)

In [None]:
# Preview only the first rows; printing the whole one-hot matrix floods the
# cell output without adding information.
y_test[:5]


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [None]:
# Keyword classifier: embedding -> LSTM -> softmax over the theme classes.
model = Sequential()
# +1 because Tokenizer indices start at 1; index 0 is the padding value.
model.add(Embedding(len(word_index) + 1, 100))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Validate on the held-out split. Passing the training data here made
# val_loss/val_accuracy mirror the training metrics, so they said nothing
# about generalisation. The History object is kept so the per-epoch metrics
# can be inspected afterwards.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f1f8d277550>

In [None]:
# Free-text query to classify.
test_sentence = """
rumah hangus terbakar

"""

# Encode and pad the query with the fitted tokenizer and the shared length.
test_sequence = tokenizer.texts_to_sequences([test_sentence])
padded_test_sequence = pad_sequences(test_sequence, maxlen=max_sequence_length)

# predict returns one row of class scores per input; a single sentence was
# passed in, so take the first row.
class_scores = model.predict(padded_test_sequence)
prediction = class_scores[0]

# Translate the highest-scoring class index back into its theme name.
recommended_hobby_index = tf.argmax(prediction).numpy()
recommended_hobby = next(
    label
    for label, index in label_mapping.items()
    if index == recommended_hobby_index
)

print("Rekomendasi: " + recommended_hobby)

Rekomendasi: kebakaran
