In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Sample receipt dataset: each sentence is "<item name ...> <quantity> <price>".
sentences = [
    "Sabun 1 12000",
    "Minuman Soda 2 15000",
    "Snack Coklat 3 10000"
]
# BIO tags, exactly one per whitespace-separated token of the matching sentence.
labels = [
    ["B-namaBarang", "B-quantity", "B-price"],
    ["B-namaBarang", "I-namaBarang", "B-quantity", "B-price"],
    ["B-namaBarang", "I-namaBarang", "B-quantity", "B-price"]
]

# Tokenize the text. An explicit OOV token keeps unseen words in the sequence
# (instead of silently dropping them), so token positions stay aligned with tags.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Convert sentences and labels to integer sequences.
X = tokenizer.texts_to_sequences(sentences)

# Fail fast on misaligned annotations: pad_sequences would otherwise silently
# truncate over-long tag lists and shift every label onto the wrong token.
for sentence, token_ids, tags in zip(sentences, X, labels):
    assert len(token_ids) == len(tags), (
        f"{sentence!r}: {len(token_ids)} tokens but {len(tags)} labels"
    )

# Id 0 is reserved for padding so it never collides with a real tag; the real
# tags are sorted so the mapping is identical on every kernel restart
# (enumerating a bare set gives a run-dependent order).
PAD_LABEL = "PAD"
label_map = {PAD_LABEL: 0}
unique_tags = sorted({tag for tags in labels for tag in tags})
label_map.update({tag: i for i, tag in enumerate(unique_tags, start=1)})
y = [[label_map[tag] for tag in tags] for tags in labels]

# Pad tokens and tags to the same length (post-padding, pad id 0 in both).
max_len = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post', value=label_map[PAD_LABEL])

# Train/test split (fixed random_state for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional

# Parameters derived from the prepared data.
# +1 because index 0 is not assigned to any word; it is the padding id.
num_words = len(word_index) + 1
num_labels = len(label_map)

# Seed Python, NumPy and TensorFlow so weight init and training are repeatable.
tf.keras.utils.set_random_seed(42)

# Model: embedding -> BiLSTM -> per-timestep softmax over the tag set.
# mask_zero=True makes downstream layers and the loss skip padded timesteps
# (token id 0). The deprecated `input_length` argument is dropped; the
# sequence length is inferred from the input data.
model = Sequential([
    Embedding(input_dim=num_words, output_dim=64, mask_zero=True),
    Bidirectional(LSTM(128, return_sequences=True)),
    TimeDistributed(Dense(num_labels, activation='softmax'))
])

# Sparse loss: targets are integer tag ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Convert labels to tensors.
y_train = tf.convert_to_tensor(y_train)
y_test = tf.convert_to_tensor(y_test)

# Train
model.fit(X_train, y_train, epochs=10, batch_size=16)




Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.2500 - loss: 1.3888
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.3750 - loss: 1.3798
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - accuracy: 0.5000 - loss: 1.3710
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.6250 - loss: 1.3621
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.6250 - loss: 1.3529
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.7500 - loss: 1.3433
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.6250 - loss: 1.3332
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.6250 - loss: 1.3223
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x79b43fd25960>

In [None]:
# Example prediction
test_sentence = ["Coklat Snack 3 15000"]
test_token_ids = tokenizer.texts_to_sequences(test_sentence)
test_seq = pad_sequences(test_token_ids, maxlen=max_len, padding='post')

pred = model.predict(test_seq)

# Invert the mapping once (id -> tag) instead of rebuilding a key list per token
# and relying on dict insertion order matching the ids.
index_to_label = {index: label for label, index in label_map.items()}

# Keep only positions that hold real tokens; post-padding puts filler at the
# end, and predictions there are meaningless.
n_tokens = min(len(test_token_ids[0]), max_len)
pred_ids = tf.argmax(pred, axis=-1).numpy()[0][:n_tokens]
pred_labels = [index_to_label[int(label_id)] for label_id in pred_ids]
print(pred_labels)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 719ms/step
['I-namaBarang', 'I-namaBarang', 'I-namaBarang', 'I-namaBarang']
