In [None]:
import tensorflow as tf

In [None]:
# Fragment from Zobel's "Writing for Computer Science"
# Will be used for training
raw_corpus = """Some advisors, for example, set their students problems such as verifying a proof
in a published paper and seeing whether it can be applied to variants of the theorem,
thus, in effect, getting the student to explore the limits at which the theorem no longer
applies. Another example is to attempt to confirm someone else’s results, by downloading 
code or by developing a fresh implementation. The difficulties encountered
in such efforts are a fertile source of research questions. Other advisors immediately
start their students on activities that are expected to lead to a research publication. It
is in this last case that the model of advising as apprenticeship is most evident.
Typically, in the early stages the advisor specifies each small step the student
should take: running a certain experiment, identifying a suitable source of data,
searching the literature to resolve a particular question, or writing one small section
of a proposed paper. As students mature into researchers, they become more independent, 
often by anticipating what their advisors will ask, while advisors gradually leave
more space for their students to assert this independence. Over time, the relationship
becomes one of guidance rather than management"""


In [None]:
# Lower-case the excerpt and treat each physical line as one training sentence.
corpus = raw_corpus.lower().split('\n')
# Tokenizer configured with an explicit out-of-vocabulary token, so words that
# are missing from the fitted vocabulary are encoded as '<OOV>'.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')

In [None]:
# Build the vocabulary from the training lines.
t.fit_on_texts(corpus)
# word -> integer index mapping produced by the tokenizer.
word_index = t.word_index
# Number of entries in word_index ('<OOV>' included). Keras reserves index 0
# (used as the padding value), which is why later cells use vocab_size+1
# whenever a total number of classes / embedding rows is needed.
vocab_size = len(word_index)

### Now we create sequences not from the whole corpus at once, but line by line. For each line we add not only the full sequence but also every shortened prefix of it (first 2 words, first 3 words, ...), so that the RNN can be trained to predict the next word.

In [None]:
# Every line contributes all of its prefixes of length >= 2, so the network
# sees "w1 w2", "w1 w2 w3", ... and learns to predict the last word of each.
input_sequences = []

for token_ids in t.texts_to_sequences(corpus):
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [None]:
input_sequences[0:5]

In [None]:
# Length of the longest prefix sequence; used as the common padded length.
max_sequence_length = max(map(len, input_sequences))
max_sequence_length

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Left-pad every sequence with zeros up to max_sequence_length, so the word to
# predict always sits in the last column.
# pad_sequences already returns a NumPy array, so it is not wrapped in
# np.array() (that would only make an extra copy).
padded_input_seq = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [None]:
padded_input_seq[0:5]

In [None]:
# Split each padded row: all tokens but the last are the model input,
# the last token is the word we want to predict (the label).
x, labels = padded_input_seq[:, :-1], padded_input_seq[:, -1]
print(x[0:5], labels[0:5], sep='\n')

In [None]:
# One Hot encode labels
from tensorflow.keras.utils import to_categorical
# num_classes is vocab_size+1: label values are tokenizer indices, which go up
# to vocab_size, and index 0 (the padding value) needs a column as well.
y = to_categorical(labels, num_classes=vocab_size+1)

In [None]:
print(labels[-2:])
print(y[-2:])

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> softmax over all indices.
model = tf.keras.Sequential()
# Token indices run from 0 to vocab_size (0 is the padding value; '<OOV>' is
# already counted in vocab_size), hence vocab_size+1 embedding rows.
# Inputs are max_sequence_length-1 long because the last token of every
# sequence was split off as the label.
model.add(tf.keras.layers.Embedding(vocab_size+1, 64, input_length=max_sequence_length-1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
# One output unit per possible index; softmax turns the scores into
# a probability distribution over the next word.
model.add(tf.keras.layers.Dense(vocab_size+1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# verbose=1 adds time per sample: 527us/sample
# Train for 500 epochs on the (input, one-hot label) pairs; the value returned
# by fit() is kept in h.
h = model.fit(x, y, epochs=500, verbose=1)

In [None]:
# Encode a short phrase with the fitted tokenizer (a list of word indices) ...
seed = t.texts_to_sequences(['that are expected'])[0]
# ... and pad it to the model's input length (max_sequence_length-1) to
# inspect what will be fed to the network.
pad_sequences([seed], maxlen=max_sequence_length-1)

In [None]:
# Class probabilities vs. index of the most probable class

# Pad the encoded seed once and reuse it for both results.
padded_seed = pad_sequences([seed], maxlen=max_sequence_length-1)

# gives np array with probabilities of every class
predict = model.predict(padded_seed)

# gives number of class with largest probability
# Sequential.predict_classes was removed in TensorFlow 2.6; taking the argmax
# over the class axis of the probabilities yields the same array of indices.
predict_classes = np.argmax(predict, axis=-1)

In [None]:
# first 10 class probabilities
predict[0, 0:10]

In [None]:
# max probability value
max(predict[0])

In [None]:
# Index (class number) of the entry that holds the maximum probability
import pandas as pd
a = pd.Series(predict[0])
a.loc[a.eq(max(predict[0]))]

### Generating text
1. Take seed
2. Get next word (class with largest probability)  
Note! Can be modified to take one out of the top 3 probabilities!
3. Append it to seed
4. Repeat with new seed until necessary predicted sequence length is achieved

### Function to predict one word

In [None]:
def predict_next(seed):
    """Append the model's most likely next word to ``seed``.

    Parameters
    ----------
    seed : str
        Text to continue. It is encoded with the fitted tokenizer ``t`` and
        padded to the model's input length (``max_sequence_length-1``).

    Returns
    -------
    str
        ``seed`` followed by a single space and the predicted word.
    """
    seed_seq = t.texts_to_sequences([seed])[0]
    padded_seed = pad_sequences([seed_seq], maxlen=max_sequence_length-1)
    # Sequential.predict_classes was removed in TensorFlow 2.6, so take the
    # class probabilities and pick the best index ourselves.
    probabilities = model.predict(padded_seed, verbose=0)[0]
    # Index 0 is the padding value and has no word attached to it, so it is
    # excluded from the argmax; otherwise there would be nothing to append.
    word_nr = int(np.argmax(probabilities[1:])) + 1
    # index_word is the tokenizer's inverse of word_index: a direct lookup
    # instead of scanning the whole vocabulary for the matching index.
    return seed + ' ' + t.index_word[word_nr]
predict_next('Things that are expected')

### Predicting 10 words from specific seed

In [None]:
# Feed every prediction back in as the new seed, extending the phrase by 10 words.
# Note: `seed` is rebound to a string here (earlier it held a list of indices).
seed = 'Things that are expected'
for _ in range(10):
    seed = predict_next(seed)
print(seed)

In [None]:
# Same 10-word generation loop, started from a different phrase.
seed = 'Student research specifies'
for _ in range(10):
    seed = predict_next(seed)
print(seed)

### Note that prediction quality drops towards the end of the generated sequence, and we may see, e.g., two consecutive words being the same. In order to improve prediction, we may:
1. Train larger network (more Embedding dimensions, more RNN units)
2. Train on larger corpus of text

### There is an obvious problem with option 2: as the text corpus grows, so does the number of unique words, and therefore the dimensionality of the labels, which are one-hot encoded over the total number of words.

### At some point it is better to switch from word-based to character-based processing; a tutorial on that approach can be found here: https://www.tensorflow.org/tutorials/text/text_generation

In [None]:
# Example model to address 1. Train larger network
# NOTE: this rebinds `model`, replacing the trained network from the cells
# above; it has to be fitted before predict_next() is meaningful again.

# Layers are reached through tf.keras (the only TensorFlow import this notebook
# has for them) and the sizes reuse vocab_size / max_sequence_length from the
# earlier cells, so the cell runs on a fresh kernel.
total_words = vocab_size + 1  # +1 for the reserved padding index 0

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_length-1))
# return_sequences=True so the next LSTM layer receives the full sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(100))
# The unit count must be an integer, hence floor division.
model.add(tf.keras.layers.Dense(total_words // 2, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(0.01)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()