In [1]:
import numpy as np
import os
import time

In [2]:
# utf 8
import re

standardize_map = {
    'ê': 'e', 'è': 'e', 'é': 'e', 'ë': 'e', 'Ê' : 'E',
    'à': 'a', 'á': 'a', 'â': 'a', 'ä' : 'a', 'ă' :'a',
    'î': 'i', 'ì': 'i', 'í': 'i', 'ï': 'i',
    'ô': 'o', 'ò': 'o', 'ó': 'o', 'ö': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u'
}

def standardize_text(text):
    for diacritic_char, standard_char in standardize_map.items():
        text = text.replace(diacritic_char, standard_char)
    return text

with open('scrapped.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    file.close()

standardized_text = standardize_text(text).lower()

with open('standardized.txt', 'w', encoding='utf-8') as file:
    file.write(standardized_text)
    file.close()



In [3]:
# remove punc
import re
import string

def remove_punctuation(text):

    pattern = f"[{re.escape(string.punctuation)}]"
    text_no_punctuation = re.sub(pattern, "", text)
    
    return text_no_punctuation

v2_standardized_text = remove_punctuation(standardized_text)

with open('standardized-2.txt', 'w', encoding='utf-8') as file:
    file.write(v2_standardized_text)
    file.close()


In [4]:
# remove empty lines
def remove_empty_lines(text):
    lines = text.split("\n")
    
    non_empty_lines = [line for line in lines if line.strip()]
    
    cleaned_text = "\n".join(non_empty_lines)
    
    return cleaned_text    


v3_standardized_text = remove_empty_lines(v2_standardized_text)

with open('standardized-3.txt', 'w', encoding='utf-8') as file:
    file.write(v3_standardized_text)
    file.close()

In [5]:
# count
def count_words(text):
    words = text.split()
    return len(words)

word_count = count_words(v2_standardized_text)

print("total :", word_count)


total : 9365


In [6]:
# unique
def get_unique_words(text):
    words = text.split()
    
    unique_words = set(word.lower() for word in words)
    
    return list(unique_words)

unique_words = get_unique_words(v3_standardized_text)
length_of_unique_words = len(unique_words)

print("Jumlah Kata Unik:", len(unique_words))


Jumlah Kata Unik: 2885


In [7]:
# [[],[]]
def text_to_2d_list(text):

    lines = text.split("\n")
    
    words_2d_list = [line.split() for line in lines if line.strip()]
    
    return words_2d_list


words_2d_list = text_to_2d_list(v3_standardized_text)

print(words_2d_list[:2])
# v3_standardized_text


[['lingkup', 'pencarian', 'teks', 'dan', 'catatankakinya', 'teks', 'pencarian', '224', 'karakter', 'filter', 'pencarian', 'huruf', 'besarkecil', 'diakritik', 'serta', 'pungtuasi', 'diabaikan', 'karakter', 'dapat', 'digunakan', 'sebagai', 'pengganti', 'zero', 'atau', 'satu', 'huruf', 'sembarang', 'simbol', 'wildcard', 'dapat', 'digunakan', 'sebagai', 'pengganti', 'zero', 'atau', 'sejumlah', 'karakter', 'termasuk', 'spasi', 'mengakomodasi', 'variasi', 'ejaan', 'antara', 'lain', 'dj', 'j', 'tj', 'c', 'j', 'y', 'oe', 'u', 'd', 'dh', 't', 'th'], ['anggitanipun', 'dawud', 'magang', 'guru', 'ing', 'masaran']]


In [8]:
def build_indices(unique_words):
    word_to_idx = {}
    idx_to_word = {}
    for i, word in enumerate(unique_words):
        word_to_idx[word] = i
        idx_to_word[i] = word
    return word_to_idx, idx_to_word

word_to_idx, idx_to_word = build_indices(unique_words)

In [9]:
# word_to_idx

for i, (word, idx) in enumerate(word_to_idx.items()):
    print(f"{word}: {idx}")
    if i == 9:
        break


gulu: 0
mbat: 1
kathah: 2
sadintenipun: 3
wulu: 4
sasampuning: 5
lawas: 6
dedamel: 7
anggigat: 8
mapan: 9


In [10]:
def prepare_corpus(corpus, word_to_idx):
    sequences = []
    
    for line in corpus:
        tokens = line
        missing_tokens = [token for token in tokens if token not in word_to_idx]
        
        for token in missing_tokens:
            word_to_idx[token] = len(word_to_idx)  
            print(f"Token '{token}' ditambahkan ke word_to_idx.")
        
        for i in range(1, len(tokens)):
            i_gram_sequence = tokens[:i+1]
            i_gram_sequence_ids = [
                word_to_idx[token] for token in i_gram_sequence
            ]
            sequences.append(i_gram_sequence_ids)
    
    return sequences

sequences = prepare_corpus(words_2d_list, word_to_idx)
max_sequence_len = max([len(x) for x in sequences])

In [11]:
sequences[0]

[1579, 2550]

In [12]:
print(idx_to_word[1647])
print(idx_to_word[867])
print(idx_to_word[1452])

angengeti
brangta
tumuntena


In [13]:
len(sequences)

9210

In [14]:
max_sequence_len

146

In [15]:
from keras.preprocessing.sequence import pad_sequences
from keras import utils
import numpy as np

def build_input_data(sequences, max_sequence_len, length_of_unique_words):
    sequences = np.array(pad_sequences(sequences, maxlen = max_sequence_len, padding = 'pre'))
    X = sequences[:,:-1]
    y = sequences[:,-1]
    y = utils.to_categorical(y, length_of_unique_words)
    return X, y

X, y = build_input_data(sequences, max_sequence_len, length_of_unique_words)


2024-12-07 16:21:24.453779: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-07 16:21:26.773306: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-07 16:21:28.102634: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733563289.398829   37746 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733563289.596037   37746 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-07 16:21:31.229842: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [17]:
from keras.layers import Dense, LSTM, Dropout, Embedding, BatchNormalization, Bidirectional
from keras.models import Sequential

def create_model(max_sequence_len, length_of_unique_words):
    model = Sequential([
        Embedding(length_of_unique_words, 64, input_length= max_sequence_len-1),
        Bidirectional(LSTM(64,return_sequences=True)),
        Bidirectional(LSTM(64)),
        BatchNormalization(),
        Dense(128, activation="relu"),
        Dense(length_of_unique_words, activation="softmax")
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'] 
    )
    
    return model


In [18]:
model = create_model(max_sequence_len, length_of_unique_words)
model.summary()

W0000 00:00:1733563305.050032   37746 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [19]:
len(X)

9210

In [20]:
model.fit(X_train, y_train, validation_data=(X_test, y_test) ,batch_size = 512, epochs=75)

Epoch 1/75


2024-12-07 16:21:45.905735: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 85026720 exceeds 10% of free system memory.
2024-12-07 16:21:55.643494: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-07 16:21:55.659677: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-07 16:21:55.686112: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.
2024-12-07 16:21:56.982794: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 38010880 exceeds 10% of free system memory.


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.0031 - loss: 7.9144 - val_accuracy: 0.0206 - val_loss: 7.7842
Epoch 2/75


KeyboardInterrupt: 

In [None]:
# def generate_text(seed_text, next_words, model, max_seq_len):
#     for _ in range(next_words):
#         sequences= prepare_corpus(words_2d_list[2], word_to_idx)
#         sequences = pad_sequences([sequences[-1]], maxlen=max_seq_len-1, padding='pre')
#         predicted = model.predict_classes(sequences, verbose=0)
#         output_word = ''
#         output_word = idx_to_word[predicted[0]]            
#         seed_text = seed_text + " " + output_word
        
#     return seed_text.title()

from keras.preprocessing.sequence import pad_sequences

def generate_text(seed_text, next_words, model, max_seq_len, word_to_idx, idx_to_word):
    for _ in range(next_words):
        tokens = seed_text.split()
        token_sequence = [word_to_idx.get(token, 0) for token in tokens] 
        
        padded_sequence = pad_sequences([token_sequence], maxlen=max_seq_len - 1, padding='pre')
        
        predicted_index = np.argmax(model.predict(padded_sequence, verbose=0))
        predicted_word = idx_to_word[predicted_index]
        
        seed_text += f" {predicted_word}"
    
    return seed_text


In [None]:
print(generate_text("kacariyos ing jaman", 30, model, max_sequence_len, word_to_idx, idx_to_word))

kacariyos ing jaman kina wonten satunggiling warandha sampun sepuh anama bok randha sambega gegriya wonten ing padhekahan alit anama padhekahan sidhangmiring bawah ing sukawati sang nata utusan anglamar patih surasaning kapundhut ing dina


In [None]:
model_structure = model.to_json()
with open("v1_text_generation.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("v1_text_generation.weights.h5")