In [1]:
import numpy as np
import os
import time

In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://www.sastra.org/bahasa-dan-budaya/kagunan/1399-kawruh-ambathik-nyerat-kajawen-1938-1671"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")


In [2]:
content = []
paragraphs = soup.find_all("p")
for p in paragraphs:
    content.append(p.get_text())

In [9]:
content[:10]

['Lingkup pencarian: teks dan catatan-kakinya. Teks pencarian: 2-24 karakter. Filter pencarian: huruf besar/kecil, diakritik serta pungtuasi diabaikan; karakter [?] dapat digunakan sebagai pengganti zero atau satu huruf sembarang; simbol wildcard [*] dapat digunakan sebagai pengganti zero atau sejumlah karakter termasuk spasi; mengakomodasi variasi ejaan, antara lain [dj : j, tj : c, j : y, oe : u, d : dh, t : th].',
 '--- 390 ---',
 'Dumuginipun ing măngsa punika, ingkang nama taksih têtêp migunakakên sinjang, namung tiyang èstri, tuwin saprika-sapriki, ingkang nama sinjang sae, punika botên ewah, inggih punika sinjang bathik (sêratan tangan). Ingkang nama sinjang bathik punika manawi dipun tandhing saenipun tinimbang sinjang cap, kathah sangêt bedanipun, makatên ugi awèting panganggenipun, mila rêganipun inggih tikêlan.',
 'Bakuning sinjang bathik ingkang nama sae, punika kêdah mrêtamèni, kados ta mori kêdah sae. Sêratanipun ingkang alus, nêtês, têgêsipun nêtês, canthing-canthing ing

In [4]:
clean_content = []
for item in content:
    if not item.strip().startswith("---") and not item.strip().endswith("---"):
        clean_content.append(item)


In [10]:
clean_content[:10]

['Lingkup pencarian: teks dan catatan-kakinya. Teks pencarian: 2-24 karakter. Filter pencarian: huruf besar/kecil, diakritik serta pungtuasi diabaikan; karakter [?] dapat digunakan sebagai pengganti zero atau satu huruf sembarang; simbol wildcard [*] dapat digunakan sebagai pengganti zero atau sejumlah karakter termasuk spasi; mengakomodasi variasi ejaan, antara lain [dj : j, tj : c, j : y, oe : u, d : dh, t : th].',
 'Dumuginipun ing măngsa punika, ingkang nama taksih têtêp migunakakên sinjang, namung tiyang èstri, tuwin saprika-sapriki, ingkang nama sinjang sae, punika botên ewah, inggih punika sinjang bathik (sêratan tangan). Ingkang nama sinjang bathik punika manawi dipun tandhing saenipun tinimbang sinjang cap, kathah sangêt bedanipun, makatên ugi awèting panganggenipun, mila rêganipun inggih tikêlan.',
 'Bakuning sinjang bathik ingkang nama sae, punika kêdah mrêtamèni, kados ta mori kêdah sae. Sêratanipun ingkang alus, nêtês, têgêsipun nêtês, canthing-canthing ingkang cêtha awija

In [13]:
output_file = "scrapped.txt"
with open(output_file, "w") as file:
    for item in clean_content:
        file.write(item + "\n")
        
    file.close()

print(f"Scraping done {output_file}")

Scraping done scrapped.txt


In [3]:
# utf 8
import re

standardize_map = {
    'ê': 'e', 'è': 'e', 'é': 'e', 'ë': 'e', 'Ê' : 'E',
    'à': 'a', 'á': 'a', 'â': 'a', 'ä' : 'a', 'ă' :'a',
    'î': 'i', 'ì': 'i', 'í': 'i', 'ï': 'i',
    'ô': 'o', 'ò': 'o', 'ó': 'o', 'ö': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u'
}

def standardize_text(text):
    for diacritic_char, standard_char in standardize_map.items():
        text = text.replace(diacritic_char, standard_char)
    return text

with open('scrapped.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    file.close()

standardized_text = standardize_text(text).lower()

with open('standardized.txt', 'w', encoding='utf-8') as file:
    file.write(standardized_text)
    file.close()



In [4]:
# remove punc
import re
import string

def remove_punctuation(text):

    pattern = f"[{re.escape(string.punctuation)}]"
    text_no_punctuation = re.sub(pattern, "", text)
    
    return text_no_punctuation

v2_standardized_text = remove_punctuation(standardized_text)

with open('standardized-2.txt', 'w', encoding='utf-8') as file:
    file.write(v2_standardized_text)
    file.close()


In [5]:
# remove empty lines
def remove_empty_lines(text):
    lines = text.split("\n")
    
    non_empty_lines = [line for line in lines if line.strip()]
    
    cleaned_text = "\n".join(non_empty_lines)
    
    return cleaned_text    


v3_standardized_text = remove_empty_lines(v2_standardized_text)

with open('standardized-3.txt', 'w', encoding='utf-8') as file:
    file.write(v3_standardized_text)
    file.close()

In [10]:
# count
def count_words(text):
    words = text.split()
    return len(words)

word_count = count_words(v2_standardized_text)

print("total :", word_count)


total : 9365


In [11]:
# unique
def get_unique_words(text):
    words = text.split()
    
    unique_words = set(word.lower() for word in words)
    
    return list(unique_words)

unique_words = get_unique_words(v3_standardized_text)
length_of_unique_words = len(unique_words)

print("Jumlah Kata Unik:", len(unique_words))


Jumlah Kata Unik: 2885


In [12]:
# [[],[]]
def text_to_2d_list(text):

    lines = text.split("\n")
    
    words_2d_list = [line.split() for line in lines if line.strip()]
    
    return words_2d_list


words_2d_list = text_to_2d_list(v3_standardized_text)

print(words_2d_list[:5])
# v3_standardized_text


[['lingkup', 'pencarian', 'teks', 'dan', 'catatankakinya', 'teks', 'pencarian', '224', 'karakter', 'filter', 'pencarian', 'huruf', 'besarkecil', 'diakritik', 'serta', 'pungtuasi', 'diabaikan', 'karakter', 'dapat', 'digunakan', 'sebagai', 'pengganti', 'zero', 'atau', 'satu', 'huruf', 'sembarang', 'simbol', 'wildcard', 'dapat', 'digunakan', 'sebagai', 'pengganti', 'zero', 'atau', 'sejumlah', 'karakter', 'termasuk', 'spasi', 'mengakomodasi', 'variasi', 'ejaan', 'antara', 'lain', 'dj', 'j', 'tj', 'c', 'j', 'y', 'oe', 'u', 'd', 'dh', 't', 'th'], ['anggitanipun', 'dawud', 'magang', 'guru', 'ing', 'masaran'], ['kacariyos', 'ing', 'jaman', 'kina', 'wonten', 'satunggiling', 'warandha', 'sampun', 'sepuh', 'anama', 'bok', 'randha', 'sambega', 'gegriya', 'wonten', 'ing', 'padhekahan', 'alit', 'anama', 'padhekahan', 'sidhangmiring', 'bawah', 'ing', 'sukawati'], ['bok', 'randha', 'sambega', 'gadhah', 'anak', 'estri', 'satunggil', 'nama', 'pun', 'suwidak', 'loro', 'mila', 'nama', 'makaten', 'awit', '

In [13]:
def build_indices(unique_words):
    word_to_idx = {}
    idx_to_word = {}
    for i, word in enumerate(unique_words):
        word_to_idx[word] = i
        idx_to_word[i] = word
    return word_to_idx, idx_to_word

word_to_idx, idx_to_word = build_indices(unique_words)

# word_to_idx: Memetakan kata ke indeks.
# idx_to_word: Memetakan indeks ke kata.

In [14]:
# word_to_idx
for i, (word, idx) in enumerate(word_to_idx.items()):
    print(f"{word}: {idx}")
    if i == 9:
        break


adigang: 0
kalebu: 1
malongo: 2
pados: 3
ntenpinten: 4
tetiga: 5
keceran: 6
kadhawuhan: 7
andadosaken: 8
ngarani: 9


In [15]:
def prepare_corpus(corpus, word_to_idx):
    sequences = []
    
    for line in corpus:
        tokens = line
        missing_tokens = [token for token in tokens if token not in word_to_idx]
        
        for token in missing_tokens:
            word_to_idx[token] = len(word_to_idx)  
            print(f"Token '{token}' ditambahkan ke word_to_idx.")
        
        for i in range(1, len(tokens)):
            i_gram_sequence = tokens[:i+1]
            i_gram_sequence_ids = [
                word_to_idx[token] for token in i_gram_sequence
            ]
            sequences.append(i_gram_sequence_ids)
    
    return sequences

sequences = prepare_corpus(words_2d_list, word_to_idx)
max_sequence_len = max([len(x) for x in sequences])

# Membuat urutan token (n-gram).
# Menambahkan kata baru ke word_to_idx jika belum ada.
# Menghasilkan daftar urutan numerik yang siap digunakan oleh model.

In [16]:
sequences[0]

[1972, 879]

In [17]:
print(idx_to_word[1647])
print(idx_to_word[867])
print(idx_to_word[1452])

lebokake
wekasan
berbudi


In [18]:
len(sequences)

9210

In [19]:
max_sequence_len

146

2024-12-05 11:02:29.979275: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 11:02:30.110162: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 11:02:30.230480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733371350.335565  373066 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733371350.367388  373066 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 11:02:30.608833: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [23]:
X

array([[   0,    0,    0, ...,    0,    0, 1972],
       [   0,    0,    0, ...,    0, 1972,  879],
       [   0,    0,    0, ..., 1972,  879,  480],
       ...,
       [   0,    0,    0, ..., 1135, 2103, 1058],
       [   0,    0,    0, ..., 2103, 1058,  825],
       [   0,    0,    0, ..., 1058,  825, 2595]], dtype=int32)

In [24]:
from keras.layers import Dense, LSTM, Dropout, Embedding, BatchNormalization, Bidirectional
from keras.models import Sequential

def create_model(max_sequence_len, length_of_unique_words):
    model = Sequential([
        Embedding(length_of_unique_words, 64, input_length= max_sequence_len-1),
        Bidirectional(LSTM(64,return_sequences=True)),
        Bidirectional(LSTM(64)),
        BatchNormalization(),
        Dense(128, activation="relu"),
        Dense(length_of_unique_words, activation="softmax")
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'] 
    )
    
    return model


In [25]:
model = create_model(max_sequence_len, length_of_unique_words)
model.summary()

W0000 00:00:1733371414.798208  373066 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [26]:
len(X)

9210

In [27]:
model.fit(X, y, batch_size = 512, epochs=75)

Epoch 1/75


2024-12-05 11:04:33.858285: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 106283400 exceeds 10% of free system memory.
2024-12-05 11:04:41.515116: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-05 11:04:41.527986: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-05 11:04:41.538020: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-05 11:04:42.013312: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 959ms/step - accuracy: 0.0048 - loss: 7.9039
Epoch 2/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 779ms/step - accuracy: 0.0275 - loss: 7.0669
Epoch 3/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 792ms/step - accuracy: 0.0357 - loss: 6.7405
Epoch 4/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 760ms/step - accuracy: 0.0317 - loss: 6.6345
Epoch 5/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 795ms/step - accuracy: 0.0342 - loss: 6.5229
Epoch 6/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 783ms/step - accuracy: 0.0356 - loss: 6.4393
Epoch 7/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 735ms/step - accuracy: 0.0336 - loss: 6.3145
Epoch 8/75
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 734ms/step - accuracy: 0.0366 - loss: 6.2035
Epoch 9/75
[1m18/18[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fd26013ae50>

In [28]:
# def generate_text(seed_text, next_words, model, max_seq_len):
#     for _ in range(next_words):
#         sequences= prepare_corpus(words_2d_list[2], word_to_idx)
#         sequences = pad_sequences([sequences[-1]], maxlen=max_seq_len-1, padding='pre')
#         predicted = model.predict_classes(sequences, verbose=0)
#         output_word = ''
#         output_word = idx_to_word[predicted[0]]            
#         seed_text = seed_text + " " + output_word
        
#     return seed_text.title()

from keras.preprocessing.sequence import pad_sequences

def generate_text(seed_text, next_words, model, max_seq_len, word_to_idx, idx_to_word):
    for _ in range(next_words):
        tokens = seed_text.split()
        token_sequence = [word_to_idx.get(token, 0) for token in tokens] 
        
        padded_sequence = pad_sequences([token_sequence], maxlen=max_seq_len - 1, padding='pre')
        
        predicted_index = np.argmax(model.predict(padded_sequence, verbose=0))
        predicted_word = idx_to_word[predicted_index]
        
        seed_text += f" {predicted_word}"
    
    return seed_text


In [34]:
print(generate_text("ing wanci dalu", 30, model, max_sequence_len, word_to_idx, idx_to_word))

ing wanci dalu panuju sang nata sare suwidak loro amanggihi embokipun abusana sarwa awon nunten anyariyosi embokipun anyariyosaken lelampahanipun nalika kabekta dhateng ing nagari embokipun saklangkung ngungun ing manah mireng wicantening anakipun wangsulanipun


In [22]:
model_structure = model.to_json()
with open("v1_text_generation.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("v1_text_generation.weights.h5")