<a href="https://colab.research.google.com/github/ethanrom/LSTM-Sinhala-Text-Generation/blob/main/sinhala_news_titles_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow
!pip install keras

In [None]:
import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [None]:
!gdown 1R8ZRTPzlqjxfrWdqA6Me3gsJwf2AjHnp

Downloading...
From: https://drive.google.com/uc?id=1R8ZRTPzlqjxfrWdqA6Me3gsJwf2AjHnp
To: /content/input.txt
  0% 0.00/3.07M [00:00<?, ?B/s]100% 3.07M/3.07M [00:00<00:00, 152MB/s]


In [None]:
# read the input file and store every line of text as one list entry
# (explicit UTF-8 so the non-ASCII text decodes identically on every platform)
with open("input.txt", "r", encoding="utf-8") as file:
    text = file.read().split("\n")

In [None]:
# collect the individual lines of text, skipping blank ones
# (`text` was already split on "\n", so every entry is a single line and a
# second split is a no-op; blank entries would only add empty training rows)
lines = [line for line in text if line.strip()]

In [None]:
# build the tokenizer on the lines
# module-level tokenizer instance; tokenize() fits this same object in place
tokenizer = Tokenizer()

def tokenize(lines):
    """Fit the module-level ``tokenizer`` on ``lines`` and return it.

    Args:
        lines: texts handed unchanged to ``tokenizer.fit_on_texts``.

    Returns:
        The module-level ``tokenizer`` object itself (not a new instance),
        so every call keeps fitting the same object.
    """
    # fit the tokenizer on the lines
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [None]:
# encode the lines as fixed-length integer rows
def get_sequence(tokenizer, max_length, lines):
    """Encode ``lines`` with ``tokenizer`` and pad each row to ``max_length``.

    Args:
        tokenizer: object providing ``texts_to_sequences``.
        max_length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts to encode.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded lines, using
        ``padding='pre'``.
    """
    encoded_lines = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded_lines, maxlen=max_length, padding='pre')

In [None]:
# size of the vocabulary known to the tokenizer
def get_vocabulary_size(tokenizer):
    """Return the number of entries in ``tokenizer.word_index`` plus one."""
    known_words = tokenizer.word_index
    return 1 + len(known_words)

In [None]:
#generate lines (stops at the end of the first line)
def generate_text(model, tokenizer, max_length, seed_text, num_gen_words, temperature=1.5):
    """Extend ``seed_text`` with sampled words, stopping after the first "." token.

    At most ``num_gen_words`` words are appended. Generation ends early as soon
    as the sampled word is ".", i.e. once one complete line has been produced.

    Note: a later cell in this notebook defines another function with this same
    name; whichever definition ran last is the one that gets called.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns one row of
            word probabilities per input row.
        tokenizer: object providing ``word_index`` (word -> integer index) and
            ``texts_to_sequences``.
        max_length: number of most recent tokens fed to the model; shorter
            contexts are left-padded by ``pad_sequences``.
        seed_text: starting text. Words the tokenizer does not know are left
            out of the model input instead of raising ``KeyError``.
        num_gen_words: maximum number of words to append; values below 1
            append nothing.
        temperature: sampling temperature, must be positive. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: if ``temperature`` is not positive, or the model assigns
            no probability to any real word.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Reverse vocabulary built once per call instead of a linear search through
    # word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Encode the seed through the tokenizer so it gets the same preprocessing
    # as the training lines; afterwards only sampled indices are appended.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_text = seed_text

    for _ in range(num_gen_words):
        context = pad_sequences([token_ids[-max_length:]], maxlen=max_length, padding='pre')
        preds = np.asarray(model.predict(context, verbose=0)[0]).astype('float64')
        # Index 0 has no entry in word_index, so it must never be sampled.
        preds[0] = 0.0
        if not np.any(preds > 0):
            raise ValueError("model assigned no probability to any word")

        # Temperature scaling in log space. Zero probabilities map to -inf and
        # stay zero after exp, so the divide warning is silenced on purpose.
        with np.errstate(divide='ignore'):
            logits = np.log(preds) / temperature
        logits -= np.max(logits)  # keeps exp() from overflowing at low temperatures
        weights = np.exp(logits)
        probs = weights / np.sum(weights)

        next_index = int(np.argmax(np.random.multinomial(1, probs, 1)))
        next_word = index_to_word[next_index]
        token_ids.append(next_index)
        generated_text += " " + next_word

        # One complete line has been generated: stop here.
        if next_word == ".":
            break
    return generated_text

In [None]:
# Generate lines no line limit
def generate_text(model, tokenizer, max_length, seed_text, num_gen_words, temperature=1.5):
    """Extend ``seed_text`` with ``num_gen_words`` words sampled from ``model``.

    Each step feeds the last ``max_length`` token indices to the model,
    rescales the predicted distribution by ``temperature`` and samples the
    next word from it.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns one row of
            word probabilities per input row.
        tokenizer: object providing ``word_index`` (word -> integer index) and
            ``texts_to_sequences``.
        max_length: number of most recent tokens fed to the model; shorter
            contexts are left-padded by ``pad_sequences``.
        seed_text: starting text. Words the tokenizer does not know are left
            out of the model input instead of raising ``KeyError``.
        num_gen_words: number of words to append; values below 1 append
            nothing.
        temperature: sampling temperature, must be positive. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: if ``temperature`` is not positive, or the model assigns
            no probability to any real word.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Reverse vocabulary built once per call instead of a linear search through
    # word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Encode the seed through the tokenizer so it gets the same preprocessing
    # as the training lines; afterwards only sampled indices are appended.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_text = seed_text

    for _ in range(num_gen_words):
        context = pad_sequences([token_ids[-max_length:]], maxlen=max_length, padding='pre')
        preds = np.asarray(model.predict(context, verbose=0)[0]).astype('float64')
        # Index 0 has no entry in word_index, so it must never be sampled.
        preds[0] = 0.0
        if not np.any(preds > 0):
            raise ValueError("model assigned no probability to any word")

        # Temperature scaling in log space. Zero probabilities map to -inf and
        # stay zero after exp, so the divide warning is silenced on purpose.
        with np.errstate(divide='ignore'):
            logits = np.log(preds) / temperature
        logits -= np.max(logits)  # keeps exp() from overflowing at low temperatures
        weights = np.exp(logits)
        probs = weights / np.sum(weights)

        next_index = int(np.argmax(np.random.multinomial(1, probs, 1)))
        token_ids.append(next_index)
        generated_text += " " + index_to_word[next_index]
    return generated_text


In [None]:
# set the hyperparameters
# length every encoded line is padded to; the last position of each row is
# used as the target, so the model input width is max_length - 1
max_length = 4 
# number of words requested from generate_text in the generation cells
num_gen_words = 4 

In [None]:
# fit the tokenizer on the collected lines
tokenizer = tokenize(lines=lines)

In [None]:
# vocabulary size as reported by get_vocabulary_size for the fitted tokenizer
vocabulary_size = get_vocabulary_size(tokenizer=tokenizer)

In [None]:
# encode every line as a padded integer row of length max_length
sequences = get_sequence(tokenizer=tokenizer, max_length=max_length, lines=lines)

In [None]:
# separate each padded row into model input (all but the last token)
# and target (the last token)
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [None]:
# one hot encode the target
# (derived from `sequences` instead of from `y` itself, so re-running this
# cell does not try to encode an array that is already one-hot)
y = tf.keras.utils.to_categorical(sequences[:, -1], num_classes=vocabulary_size)

In [None]:
# hold out the final 20% of the rows as the test set (rows stay in file order)
total_data = len(X)
split_point = int(total_data * 0.8)
X_train, X_test = np.split(X, [split_point])
y_train, y_test = np.split(y, [split_point])

In [None]:
#define the model option 1
# (small embedding + single LSTM with dropout; this cell rebinds `model`)
model = Sequential()
model.add(Embedding(vocabulary_size, 10, input_length=max_length - 1))
model.add(LSTM(50))
model.add(Dropout(0.1))
model.add(Dense(vocabulary_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
#define the model option 2
# (larger embedding + bidirectional LSTM + dense hidden layer; this cell rebinds `model`)
# Bidirectional comes from the import cell at the top of the notebook, so this
# cell also runs on a fresh kernel.
model = Sequential()
model.add(Embedding(vocabulary_size, 200, input_length=max_length - 1))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 4, 200)            4447600   
                                                                 
 bidirectional_4 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 100)               20100     
                                                                 
 dense_9 (Dense)             (None, 22238)             2246038   
                                                                 
Total params: 6,954,538
Trainable params: 6,954,538
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
#define the model option 3
# (embedding + single LSTM, no dropout; this cell rebinds `model`)
model = Sequential()
model.add(Embedding(vocabulary_size, 100, input_length=max_length - 1))
model.add(LSTM(100))
model.add(Dense(vocabulary_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 100)            54900     
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 549)               55449     
                                                                 
Total params: 190,749
Trainable params: 190,749
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
# Create the model
model = Sequential()
# Input width is max_length - 1: the last token of every padded row is the
# target, so X has one column fewer than the padded sequences.
model.add(Embedding(vocabulary_size, 64, input_length=max_length - 1))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(32, activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 5, 64)             1423232   
                                                                 
 bidirectional_2 (Bidirectio  (None, 40)               13600     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 32)                1312      
                                                                 
 dense_5 (Dense)             (None, 22238)             733854    
                                                                 
Total params: 2,171,998
Trainable params: 2,171,998
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Save the model to model.h5
# NOTE(review): in top-to-bottom order this cell runs before the model.fit
# cell, so the file written here holds the compiled but not yet fitted
# model - confirm this is intended.
model.save("model.h5")

In [None]:
# early-stopping callback: watches val_loss with a patience of 5 epochs
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
)

In [None]:
#fit the model alt
# Train on the training split and validate on the held-out split, so the
# `val_loss` that the EarlyStopping callback monitors actually exists.
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=20,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[earlystop],
)
# Write model.h5 after training so the file holds the fitted weights.
model.save("model.h5")

Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20



Epoch 7/20



Epoch 8/20



Epoch 9/20



Epoch 10/20



Epoch 11/20



Epoch 12/20



Epoch 13/20



Epoch 14/20



Epoch 15/20



Epoch 16/20



Epoch 17/20



Epoch 18/20



Epoch 19/20



Epoch 20/20





<keras.callbacks.History at 0x7fbfad9db1c0>

In [None]:
#evaluate the model alt
# Score both splits separately instead of one figure over all rows. The
# test-split number is a held-out estimate only when the model was fitted
# without those rows.
for split_name, features, targets in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    _, accuracy = model.evaluate(features, targets, verbose=0)
    print("%s accuracy: %.2f" % (split_name, accuracy * 100))

Accuracy: 97.95


In [None]:
# sample a title from the seed word with the default temperature

seed_text = "ජනපතිට"
generated_title = generate_text(
    model,
    tokenizer,
    max_length - 1,  # the model input is one token shorter than the padded rows
    seed_text,
    num_gen_words,
)
print("Generated Title:", generated_title, sep="\n")

Generated Title:
ජනපතිට වාරණයක් හමුවේ කණ්ඩායමක් ඉවතට


  preds = np.log(preds) / temperature
  preds = np.log(preds) / temperature


In [None]:
# sample another title, this time with a lower temperature
seed_text = "ට්‍රම්ප්"
generated_title = generate_text(
    model,
    tokenizer,
    max_length - 1,  # the model input is one token shorter than the padded rows
    seed_text,
    num_gen_words,
    temperature=0.5,
)
print("Generated Title:", generated_title, sep="\n")

Generated Title:
ට්‍රම්ප් නවසීලන්තයට යයි ඡායාරූප සහිතයි


  preds = np.log(preds) / temperature


In [None]:
# Mount Google Drive at /content/gdrive (Colab-only helper module)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import shutil

# copy the saved model file from the Colab workspace to the Drive location
colab_link = "/content/model.h5"
gdrive_link = "/content/gdrive/MyDrive/ffd"
shutil.copy(src=colab_link, dst=gdrive_link)

'/content/gdrive/MyDrive/ffd/model.h5'