# NLP Part 3- Introduction to Generative Models

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Create Sample Data

In [5]:
data = 'I am writing to express my strong interest in the PhD position in Machine Learning \n to Predict Enzyme Specificity at the Biosystems Data Analysis (BDA) group, \n as part of the Marie Skłodowska-Curie Network program. \n With a Master’s degree in Mechatronics Engineering \n and significant experience in data-driven modeling, machine learning, and predictive algorithms \n I am excited about the opportunity to contribute to enzyme function prediction \n and advance biotechnological solutions through machine learning methodologies. \n My experience in data analysis and control systems has equipped me \n with the technical skills required to develop and apply machine learning approaches \n to complex, real-world problems. \n As a Junior Data Scientist at the University of Tehran Science and Technology Park \n I worked on data-driven models to optimize industrial processes \n honing my abilities in Python programming and working with datasets skills \n that I am eager to apply to enzyme specificity prediction using protein sequence and structure data. \n I am particularly motivated by the opportunity \n to integrate docking results with protein structure features to predict enzyme specificity \n as outlined in this project. \n My academic background has given me a strong foundation in control theory, machine learning and optimization \n particularly through my Master’s thesis \n where I developed a Fuzzy-PID controller for real-time system optimization. \n This experience sharpened my problem-solving abilities \n and my capacity to collaborate with domain experts \n which I believe will allow me to effectively contribute to the interdisciplinary research \n between computational researchers and enzyme scientists in the ModBioTerp project. \n I am also eager to contribute to the design of enzyme characterization experiments \n and to help predict the functionality of new enzymes \n combining computational techniques with biochemical characterization. \n Myenthusiasm for deep learning and data fusion aligns well with the project’s goals \n and I am confident that my programming skills and ability \n to work in multidisciplinary teams will support the development of novel methodologies \n for enzyme function prediction. \n The opportunity to contribute to cutting-edge research \n at the intersection of machine learning and biology \n while collaborating with both computational and experimental researchers \n is incredibly exciting to me. \n I look forward to being an active member of the BDA group \n and contributing to the success of the ModBioTerp initiative. \n Thank you for considering my application. \n I look forward to discussing how my skills align with the goals of this PhD project.'
corpus = data.lower().split('\n')

## Prepare Trianing Data

### Tokenize

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [7]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
input_sequences

[[4, 12],
 [4, 12, 59],
 [4, 12, 59, 1],
 [4, 12, 59, 1, 60],
 [4, 12, 59, 1, 60, 5],
 [4, 12, 59, 1, 60, 5, 29],
 [4, 12, 59, 1, 60, 5, 29, 61],
 [4, 12, 59, 1, 60, 5, 29, 61, 6],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2, 30],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2, 30, 62],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2, 30, 62, 6],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2, 30, 62, 6, 13],
 [4, 12, 59, 1, 60, 5, 29, 61, 6, 2, 30, 62, 6, 13, 9],
 [1, 19],
 [1, 19, 10],
 [1, 19, 10, 20],
 [1, 19, 10, 20, 21],
 [1, 19, 10, 20, 21, 2],
 [1, 19, 10, 20, 21, 2, 63],
 [1, 19, 10, 20, 21, 2, 63, 11],
 [1, 19, 10, 20, 21, 2, 63, 11, 31],
 [1, 19, 10, 20, 21, 2, 63, 11, 31, 32],
 [1, 19, 10, 20, 21, 2, 63, 11, 31, 32, 33],
 [22, 64],
 [22, 64, 7],
 [22, 64, 7, 2],
 [22, 64, 7, 2, 65],
 [22, 64, 7, 2, 65, 66],
 [22, 64, 7, 2, 65, 66, 67],
 [22, 64, 7, 2, 65, 66, 67, 68],
 [22, 64, 7, 2, 65, 66, 67, 68, 69],
 [8, 14],
 [8, 14, 34],
 [8, 14, 34, 70],
 [8, 14, 34, 70, 6],
 [8, 14, 34

In [9]:
max_sequence_len = max(len(x) for x in input_sequences)

### Padding

In [11]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [12]:
input_sequences

array([[ 0,  0,  0, ...,  0,  4, 12],
       [ 0,  0,  0, ...,  4, 12, 59],
       [ 0,  0,  0, ..., 12, 59,  1],
       ...,
       [ 0,  0,  4, ..., 56,  7, 26],
       [ 0,  4, 57, ...,  7, 26, 30],
       [ 4, 57, 58, ..., 26, 30, 27]])

#### After the whole data turned to be same sizes, consider last one in each row as a label and the others as input.
### Split the data

In [13]:
xs = input_sequences[:,:-1]
labels = input_sequences[:,-1]

#### For predicting the last word, it needs to be like a classify problem.
### Create one-hot encoding of the labels

In [14]:
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [15]:
ys

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Create Model

In [32]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_len-1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [33]:
history = model.fit(xs, ys, epochs=500)

Epoch 1/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.0069 - loss: 5.2552  
Epoch 2/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0560 - loss: 5.1478
Epoch 3/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0592 - loss: 4.9192
Epoch 4/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0635 - loss: 4.7434
Epoch 5/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0539 - loss: 4.8040 
Epoch 6/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0659 - loss: 4.7575
Epoch 7/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0773 - loss: 4.6936
Epoch 8/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0984 - loss: 4.4523
Epoch 9/500
[1m11/11[0m [32m━━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9257 - loss: 0.7231
Epoch 69/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9012 - loss: 0.6982
Epoch 70/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9064 - loss: 0.6774
Epoch 71/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9381 - loss: 0.6342
Epoch 72/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9444 - loss: 0.5836
Epoch 73/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9136 - loss: 0.6193
Epoch 74/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9256 - loss: 0.5771
Epoch 75/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9398 - loss: 0.5551
Epoch 76/500
[1m11/11[0m [32m━━━━━━━━━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9379 - loss: 0.2286
Epoch 136/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9401 - loss: 0.2039
Epoch 137/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9420 - loss: 0.1858
Epoch 138/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.9425 - loss: 0.1922
Epoch 139/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9115 - loss: 0.2031
Epoch 140/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9532 - loss: 0.1731
Epoch 141/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9424 - loss: 0.1683
Epoch 142/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9504 - loss: 0.1685
Epoch 143/500
[1m11/11[0m [32m━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9437 - loss: 0.1244
Epoch 203/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9724 - loss: 0.1045
Epoch 204/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9240 - loss: 0.1452
Epoch 205/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9081 - loss: 0.1821
Epoch 206/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9481 - loss: 0.1192
Epoch 207/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9470 - loss: 0.1266
Epoch 208/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.9432 - loss: 0.1313
Epoch 209/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9372 - loss: 0.1387
Epoch 210/500
[1m11/11[0m [32m━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9275 - loss: 0.1386
Epoch 270/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9247 - loss: 0.1309
Epoch 271/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9236 - loss: 0.1278
Epoch 272/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9394 - loss: 0.1220
Epoch 273/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9488 - loss: 0.1123
Epoch 274/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9609 - loss: 0.0916
Epoch 275/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9555 - loss: 0.1125
Epoch 276/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9357 - loss: 0.1348
Epoch 277/500
[1m11/11[0m [32m━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9445 - loss: 0.1241
Epoch 337/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9394 - loss: 0.1290
Epoch 338/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9458 - loss: 0.1032
Epoch 339/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9457 - loss: 0.1110
Epoch 340/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9488 - loss: 0.1056
Epoch 341/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9461 - loss: 0.0916
Epoch 342/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9359 - loss: 0.1192
Epoch 343/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9362 - loss: 0.1178
Epoch 344/500
[1m11/11[0m [32m━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9570 - loss: 0.0881
Epoch 404/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9487 - loss: 0.1162
Epoch 405/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9622 - loss: 0.0932
Epoch 406/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9509 - loss: 0.1015
Epoch 407/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9355 - loss: 0.1268
Epoch 408/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9560 - loss: 0.1048
Epoch 409/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9462 - loss: 0.0987
Epoch 410/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9391 - loss: 0.1191
Epoch 411/500
[1m11/11[0m [32m━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9394 - loss: 0.1236
Epoch 471/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9385 - loss: 0.1202
Epoch 472/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9480 - loss: 0.1055
Epoch 473/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9146 - loss: 0.1387
Epoch 474/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9502 - loss: 0.1177
Epoch 475/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9431 - loss: 0.1088
Epoch 476/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9511 - loss: 0.1092
Epoch 477/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9281 - loss: 0.1350
Epoch 478/500
[1m11/11[0m [32m━━━━━

In [37]:
seed_text = 'I am writing to express my'
next_words = 20

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predict = model.predict(token_list, verbose=0)
    predicted = np.argmax(predict, axis=1)
    output_word = ''
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += ' ' + output_word

print(seed_text)

I am writing to express my strong interest in the phd position in machine learning learning approaches machine structure developed phd developed park park pid park
