In [1]:
import tensorflow as tf

In [16]:
# Excerpt from Zobel's "Writing for Computer Science".
# Used below as the (very small) training corpus; each physical line becomes one training sentence.
raw_corpus = """Some advisors, for example, set their students problems such as verifying a proof
in a published paper and seeing whether it can be applied to variants of the theorem,
thus, in effect, getting the student to explore the limits at which the theorem no longer
applies. Another example is to attempt to confirm someone else’s results, by downloading 
code or by developing a fresh implementation. The difficulties encountered
in such efforts are a fertile source of research questions. Other advisors immediately
start their students on activities that are expected to lead to a research publication. It
is in this last case that the model of advising as apprenticeship is most evident.
Typically, in the early stages the advisor specifies each small step the student
should take: running a certain experiment, identifying a suitable source of data,
searching the literature to resolve a particular question, or writing one small section
of a proposed paper. As students mature into researchers, they become more independent, 
often by anticipating what their advisors will ask, while advisors gradually leave
more space for their students to assert this independence. Over time, the relationship
becomes one of guidance rather than management"""


In [20]:
# One lower-cased training sentence per physical line of the excerpt.
corpus = [line.lower() for line in raw_corpus.split('\n')]
# Tokenizer configured with an explicit '<OOV>' token for out-of-vocabulary words.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')

In [21]:
# Build the tokenizer's vocabulary from the corpus lines.
t.fit_on_texts(corpus)
# Mapping word -> integer id as learned by the tokenizer.
word_index = t.word_index
# Number of entries in word_index. The model sizes its embedding and output
# layers as vocab_size+1.
# NOTE(review): ids appear to start at 1, with 0 showing up only as padding in
# the padded arrays — confirm against the Keras Tokenizer documentation.
vocab_size = len(word_index)

### Now we build the training sequences line by line rather than from the whole corpus at once. For every line we add not only its full token sequence but also each of its shorter prefixes (cropped versions), so that the RNN learns to predict the next word.

In [30]:
# Collect every prefix of length >= 2 from each tokenised line, so that the
# last id of a prefix can act as the "next word" target for the ids before it.
input_sequences = [
    token_ids[:end]
    for token_ids in (t.texts_to_sequences([text])[0] for text in corpus)
    for end in range(2, len(token_ids) + 1)
]

In [31]:
input_sequences[0:5]

[[29, 7],
 [29, 7, 13],
 [29, 7, 13, 14],
 [29, 7, 13, 14, 30],
 [29, 7, 13, 14, 30, 8]]

In [33]:
# Length of the longest prefix; used as the common width when padding.
max_sequence_length = max(map(len, input_sequences))
max_sequence_length

16

In [34]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [37]:
# Left-pad every sequence with zeros to a common width so they stack into one 2-D array.
padded_input_seq = np.array(
    pad_sequences(
        sequences=input_sequences,
        maxlen=max_sequence_length,
        padding='pre',
    )
)

In [38]:
padded_input_seq[0:5]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29,  7],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29,  7, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29,  7, 13, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29,  7, 13, 14, 30],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29,  7, 13, 14, 30,  8]])

In [51]:
# The final column of each padded row is the word to predict (the label);
# all columns before it form the model input.
x, labels = padded_input_seq[:, :-1], padded_input_seq[:, -1]
print(x[:5], labels[:5], sep='\n')

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 29  7]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 29  7 13]
 [ 0  0  0  0  0  0  0  0  0  0  0 29  7 13 14]
 [ 0  0  0  0  0  0  0  0  0  0 29  7 13 14 30]]
[ 7 13 14 30  8]


In [52]:
# One-hot encode the labels so they can be used with the categorical
# cross-entropy loss the model is compiled with.
from tensorflow.keras.utils import to_categorical
# vocab_size+1 columns: a label id is used directly as the column position and
# the largest id equals vocab_size, so one extra column (index 0) is required.
y = to_categorical(labels, num_classes=vocab_size+1)

In [53]:
print(labels[-2:])
print(y[-2:])

[129 130]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [54]:
# Word ids start at 1 and 0 is used only for padding, so both the embedding
# table and the softmax layer need vocab_size+1 entries.
# Each input row holds max_sequence_length-1 ids because the last id of every
# padded sequence was split off as the label.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=64,
        input_length=max_sequence_length-1,
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=32)))
# The output layer classifies among all word ids (plus the padding slot),
# hence one softmax unit per id.
model.add(tf.keras.layers.Dense(units=vocab_size+1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [55]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 64)            8384      
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 131)               8515      
Total params: 41,731
Trainable params: 41,731
Non-trainable params: 0
_________________________________________________________________


In [57]:
# verbose=1 prints a progress line per epoch, including timing
# (about 527us/sample when this notebook was run).
# `h` receives the History object returned by fit().
h = model.fit(x, y, epochs=500, verbose=1)

/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
E

<tensorflow.python.keras.callbacks.History at 0x24b3237c8c8>

In [88]:
seed = t.texts_to_sequences(['that are expected'])[0]
pad_sequences([seed], maxlen=max_sequence_length-1)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 24, 21, 73]])

In [91]:
# Class probabilities vs. the index of the most probable class

# Pad the seed once and reuse the result for both values below.
# gives np array with probabilities of every class
predict = model.predict(pad_sequences([seed], maxlen=max_sequence_length-1))

# gives number of class with largest probability.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; taking the
# argmax over the softmax output is the equivalent and avoids running the
# model a second time on the same input.
predict_classes = np.argmax(predict, axis=-1)

In [105]:
# first 10 class probabilities
predict[0, 0:10]

array([4.9465975e-06, 3.4279278e-06, 9.3148118e-03, 2.4137908e-01,
       8.5217007e-02, 7.7622593e-05, 1.3152923e-02, 1.1193399e-04,
       6.4593245e-04, 5.9142977e-04], dtype=float32)

In [102]:
# max probability value
max(predict[0])

0.46412545

In [106]:
# number of element with this value
import pandas as pd
a = pd.Series(predict[0])
a[a == max(predict[0])]

92    0.464125
dtype: float32

### Generating text
1. Take a seed
2. Get the next word (the class with the largest probability)  
Note! Can be modified to pick one of the top 3 probabilities instead!
3. Append it to the seed
4. Repeat with the new seed until the required predicted sequence length is reached

### Function to predict one word

In [109]:
def predict_next(seed):
    """Return ``seed`` extended by the single most probable next word.

    The seed text is tokenised with the notebook-level tokenizer ``t``,
    left-padded to the model's input width (``max_sequence_length-1``) and
    passed to ``model``. The word whose class receives the highest predicted
    probability is appended after a single space.

    Args:
        seed: Text to continue.

    Returns:
        ``seed + ' ' + word`` where ``word`` is the predicted next word.
    """
    seed_seq = t.texts_to_sequences([seed])[0]
    padded_seed = pad_sequences([seed_seq], maxlen=max_sequence_length-1)
    # `Sequential.predict_classes` was removed in TensorFlow 2.6; the argmax
    # of the softmax output is the equivalent.
    probabilities = model.predict(padded_seed, verbose=0)[0]
    # Class 0 is the padding id and has no word attached. The previous scan of
    # word_index left `output_word` unbound in that case, so the argmax is
    # restricted to the real word ids (1 and above).
    word_nr = int(np.argmax(probabilities[1:])) + 1
    # index_word is the tokenizer's id -> word table (inverse of word_index),
    # which replaces the linear search over word_index.items().
    return seed + ' ' + t.index_word[word_nr]
predict_next('Things that are expected')

'Things that are expected running'

### Predicting 10 words from specific seed

In [110]:
# Extend the seed by ten words: each call appends one predicted word and the
# lengthened text is fed back in as the next seed.
seed = 'Things that are expected'
for _ in range(10):
    seed = predict_next(seed)
print(seed)

Things that are expected running a experiment identifying a research small small advising as


In [115]:
# Same ten-word generation as before, starting from a different seed phrase.
seed = 'Student research specifies'
for _ in range(10):
    seed = predict_next(seed)
print(seed)

Student research specifies developing a fresh implementation the advisor specifies each management which
