<a href="https://colab.research.google.com/github/embaya01/PoetAI/blob/main/PoetAI2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import nltk
from collections import Counter
from scipy.sparse import csr_matrix
from keras.utils import np_utils

### Download and load in data from NLTK

In [2]:
# Make sure the Gutenberg sample corpus is available locally before reading it.
nltk.download('gutenberg')

nltk_corpus = nltk.corpus.gutenberg
corpus_name = 'melville-moby_dick.txt'

# Read the selected text as one flat list of word tokens.
tokens = [token for token in nltk_corpus.words(corpus_name)]

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


### Define vocabulary and word-to-index mapping

In [3]:
vocab_size = 5000
word_counter = Counter(tokens)

# The most frequent tokens receive indices 1, 2, ... in descending order of
# frequency; index 0 is reserved for every token outside that set.
vocab = {
    token: rank
    for rank, (token, _count) in enumerate(word_counter.most_common(vocab_size), start=1)
}
vocab['<unknown>'] = 0


def word_to_idx(word):
    """Return the vocabulary index of ``word``, or the '<unknown>' index if it is absent."""
    return vocab.get(word, vocab['<unknown>'])

### Define the sequence length

In [4]:
# Number of consecutive tokens in each input window; the token that follows
# the window is used as the prediction target.
seq_length = 50

### Create input and output sequences

In [5]:
# Map every token to its vocabulary index once up front. Encoding inside the
# window loop would look up each token again for every window it falls into
# (about `seq_length` times per token) and produce exactly the same values.
token_ids = [word_to_idx(word) for word in tokens]

# Each training example pairs a window of `seq_length` consecutive token
# indices with the index of the token that immediately follows the window.
# When there are not more than `seq_length` tokens, both lists are empty.
input_seqs = [
    token_ids[start:start + seq_length]
    for start in range(len(token_ids) - seq_length)
]
output_seqs = token_ids[seq_length:]

Convert input and output sequences to numpy arrays

In [6]:
# Arrange the index windows as a 3-D array of shape
# (number of sequences, seq_length, 1), the layout the LSTM layer is given.
X = np.reshape(input_seqs, (len(input_seqs), seq_length, 1))
# Scale the integer word indices by the vocabulary size so they lie in [0, 1).
X = X / float(len(vocab))
# NOTE: no dense one-hot `y` is built here. The next cell constructs `y`
# directly from `output_seqs` in sparse form, so a dense matrix created at this
# point (one row per sequence, one column per class) would be discarded unused
# after costing a large amount of memory.

Build the one-hot target matrix `y` in sparse (CSR) form to save memory

In [7]:
# One-hot targets in CSR form: row i stores a single 1.0 in column
# output_seqs[i], so only one value per sequence is kept in memory.
num_unique_words = len(vocab)
# float32 instead of NumPy's float64 default: the values are exactly 0/1 either
# way, and any dense copy made from this matrix is half the size.
# (Keras' default float type is float32 -- see the Keras backend docs.)
y_data = np.ones(len(output_seqs), dtype=np.float32)
y_indices = output_seqs
# Row i spans data[i:i + 1], i.e. exactly one stored entry per row.
y_indptr = np.arange(len(output_seqs) + 1)
y = csr_matrix((y_data, y_indices, y_indptr), shape=(len(input_seqs), num_unique_words))

## Define/Train the LSTM model

Import required Keras modules

In [8]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint

In [9]:
# Expand the sparse targets into the dense array that is passed to model.fit.
# The guard keeps this cell safe to re-run: after the first run `y` is already
# a NumPy array, which has no `toarray` method.
if hasattr(y, "toarray"):
    y = y.toarray()

In [10]:
# Two stacked 256-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per column of `y`.
model = Sequential([
    # The first LSTM returns its full output sequence so the second LSTM can consume it.
    LSTM(256, input_shape=X.shape[1:3], return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

Define a checkpoint to save the best model during training

In [11]:
filepath = "best-model.hdf5"

# Monitor the training loss and write the model to `filepath` only when the
# loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    mode='min',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train for 50 epochs in mini-batches of 16 samples, with the checkpoint
# callback active during training.
model.fit(
    x=X,
    y=y,
    batch_size=16,
    epochs=50,
    callbacks=callbacks_list,
)