**Natural Language Processing with RNNs**

In [85]:
#import libraries
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import sys

import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 1000 rows when displaying DataFrames
pd.set_option('display.max_row', 1000)


# Scikit-Learn ≥0.20 is required
import sklearn
from sklearn import (
    linear_model, metrics, neural_network, pipeline, preprocessing, model_selection
)
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)


In [45]:
print(tf.__version__)

2.0.0


# Char-RNN

## Loading the Data and Preparing the Dataset

In [86]:
data = pd.read_csv('t_asv.csv', engine='python')

In [87]:
data.columns = ["VerseID", "Book", "Chapter", "Verse", "Text"]

In [88]:
data.head()

Unnamed: 0,VerseID,Book,Chapter,Verse,Text
0,1001001,1,1,1,In the beginning God created the heavens and t...
1,1001002,1,1,2,And the earth was waste and void; and darkness...
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God saw the light, that it was good: and G..."
4,1001005,1,1,5,"And God called the light Day, and the darkness..."


In [89]:
df = pd.DataFrame(data)

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31103 entries, 0 to 31102
Data columns (total 5 columns):
VerseID    31103 non-null int64
Book       31103 non-null int64
Chapter    31103 non-null int64
Verse      31103 non-null int64
Text       31103 non-null object
dtypes: int64(4), object(1)
memory usage: 1.2+ MB


In [92]:
df.describe()

Unnamed: 0,VerseID,Book,Chapter,Verse
count,31103.0,31103.0,31103.0,31103.0
mean,22518120.0,22.497476,20.630614,17.043308
std,16493040.0,16.497256,23.408329,14.075084
min,1001001.0,1.0,1.0,1.0
25%,9021004.0,9.0,6.0,7.0
50%,19103000.0,19.0,14.0,14.0
75%,40007010.0,40.0,26.0,23.0
max,66022020.0,66.0,150.0,176.0


In [93]:
dfBook1 = df[df.Book ==1]

In [95]:
dfBook1.describe()

Unnamed: 0,VerseID,Book,Chapter,Verse
count,1533.0,1533.0,1533.0,1533.0
mean,1026772.0,1.0,26.754729,17.480757
std,13947.06,0.0,13.945665,11.91623
min,1001001.0,1.0,1.0,1.0
25%,1016002.0,1.0,16.0,8.0
50%,1027039.0,1.0,27.0,16.0
75%,1038030.0,1.0,38.0,24.0
max,1050026.0,1.0,50.0,67.0


In [52]:
dfB1Chap1 = dfBook1[dfBook1.Chapter ==1]

In [53]:
dfB1Chap1.head()

Unnamed: 0,VerseID,Book,Chapter,Verse,Text
0,1001001,1,1,1,In the beginning God created the heavens and t...
1,1001002,1,1,2,And the earth was waste and void; and darkness...
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God saw the light, that it was good: and G..."
4,1001005,1,1,5,"And God called the light Day, and the darkness..."


In [54]:
dfB1Chap1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 0 to 30
Data columns (total 5 columns):
VerseID    31 non-null int64
Book       31 non-null int64
Chapter    31 non-null int64
Verse      31 non-null int64
Text       31 non-null object
dtypes: int64(4), object(1)
memory usage: 1.5+ KB


In [55]:
dfBook1.head()

Unnamed: 0,VerseID,Book,Chapter,Verse,Text
0,1001001,1,1,1,In the beginning God created the heavens and t...
1,1001002,1,1,2,And the earth was waste and void; and darkness...
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God saw the light, that it was good: and G..."
4,1001005,1,1,5,"And God called the light Day, and the darkness..."


In [56]:
dfBook1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1533 entries, 0 to 1532
Data columns (total 5 columns):
VerseID    1533 non-null int64
Book       1533 non-null int64
Chapter    1533 non-null int64
Verse      1533 non-null int64
Text       1533 non-null object
dtypes: int64(4), object(1)
memory usage: 71.9+ KB


In [15]:
# NOTE: `.tolist` is referenced without parentheses, so this stores the bound
# method rather than a list (the cell output shows the method repr). The
# corpus-writing cell below relies on this and calls `chapOneVerseList()`.
chapOneVerseList = dfB1Chap1['Text'].tolist
chapOneVerseList

<bound method IndexOpsMixin.tolist of 0     In the beginning God created the heavens and t...
1     And the earth was waste and void; and darkness...
2     And God said, Let there be light: and there wa...
3     And God saw the light, that it was good: and G...
4     And God called the light Day, and the darkness...
5     And God said, Let there be a firmament in the ...
6     And God made the firmament, and divided the wa...
7     And God called the firmament Heaven. And there...
8     And God said, Let the waters under the heavens...
9     And God called the dry land Earth; and the gat...
10    And God said, Let the earth put forth grass, h...
11    And the earth brought forth grass, herbs yield...
12    And there was evening and there was morning, a...
13    And God said, Let there be lights in the firma...
14    and let them be for lights in the firmament of...
15    And God made the two great lights; the greater...
16    And God set them in the firmament of heaven to...
17    and 

In [23]:
# Bound `tolist` method for the Book 1 verse texts (not called here); the
# corpus-writing cell below invokes it as `book1TextList()`.
book1TextList = dfBook1['Text'].tolist

In [18]:
# Write the Book 1, Chapter 1 verses to a plain-text corpus file.
# `chapOneVerseList` holds the bound `Series.tolist` method, so it is called here.
verses = [str(eachVerse) for eachVerse in chapOneVerseList()]
for eachVerse in verses:
  print(eachVerse)

# Join with a single space so the last word of one verse is not fused with the
# first word of the next (e.g. "earth.And"), and use a context manager so the
# file is closed even if the write fails.
with open('bibleDataBk1Chap1.txt', 'w') as corpusfile:
  corpusfile.write(' '.join(verses))

In the beginning God created the heavens and the earth.
And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.
And God said, Let there be light: and there was light.
And God saw the light, that it was good: and God divided the light from the darkness.
And God called the light Day, and the darkness he called Night. And there was evening and there was morning, one day.
And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so.
And God called the firmament Heaven. And there was evening and there was morning, a second day.
And God said, Let the waters under the heavens be gathered together unto one place, and let the dry land appear: and it was so.
And God called the dry land Earth; and the gathering together of the

In [24]:
# Write every Book 1 verse to a plain-text corpus file.
# `book1TextList` holds the bound `Series.tolist` method, so it is called here.
# Verses are joined with a single space so that words are not fused across
# verse boundaries (e.g. "earth.And"); the context manager guarantees the file
# is closed even if the write fails.
# NOTE(review): the Book 1 section later reads 'bibleDataBk1.txt', not this
# file name -- confirm which file is intended.
with open('bibleDataBk1Demo.txt', 'w') as corpusfile:
  corpusfile.write(' '.join(str(eachVerse) for eachVerse in book1TextList()))

## Text Analysis, Prediction, and Generation on Bible Book 1, Chapter 1

In [57]:
with open('bibleDataBk1Chap1.txt') as f:
    bibledatabk1chap1_text = f.read()

In [58]:
bibledatabk1chap1_text[:190]

'In the beginning God created the heavens and the earth. And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.'

In [59]:
type(bibledatabk1chap1_text)

str

In [60]:
"".join(sorted(set(bibledatabk1chap1_text.lower())))

' "\',-.:;`abcdefghiklmnoprstuvwxy'

In [61]:
# Character-level tokenizer: with char_level=True every character is a token.
# Fitting on the raw text builds the character -> integer id vocabulary
# (exposed as `tokenizer.word_index`).
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(bibledatabk1chap1_text)

In [62]:
tokenizer.texts_to_sequences(["heavens"])

[[5, 2, 4, 18, 2, 7, 11]]

In [63]:
tokenizer.sequences_to_texts([[5, 2, 4, 18, 2, 7, 11]])

['h e a v e n s']

In [64]:
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

In [65]:
max_id

32

In [66]:
dataset_size

4115

In [67]:
# Encode the whole text as a single sequence of character ids and shift every
# id down by one (presumably so ids start at 0 and can be used directly as
# one-hot / class indices -- the tokenizer's own ids start at 1).
[encoded] = np.array(tokenizer.texts_to_sequences([bibledatabk1chap1_text])) - 1
# Train on the first 90% of the characters; the remaining 10% is not used by
# any of the cells shown in this section.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [68]:
n_steps = 100 # input sequence length, in characters
window_length = n_steps + 1 # target = input shifted 1 character ahead
# repeat() is applied before window(), so the windows are cut from an endlessly
# repeated character stream (model.fit bounds each epoch with steps_per_epoch).
# NOTE(review): windows taken across a repetition boundary would join the end
# of the training text to its beginning -- confirm this is acceptable.
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [69]:
# window() yields a dataset of sub-datasets; batching each window to its full
# length and flattening turns every window into a single tensor of
# window_length character ids.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [70]:
np.random.seed(42)
tf.random.set_seed(42)

In [71]:
batch_size = 32
# Shuffle the windows (buffer of 10000 elements) and group them into batches.
dataset = dataset.shuffle(10000).batch(batch_size)
# Split each window into inputs (all but the last character) and targets
# (all but the first character), i.e. the inputs shifted one step ahead.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [72]:
# One-hot encode the input ids (depth = number of distinct characters); the
# targets stay as integer ids, which is what the
# sparse_categorical_crossentropy loss used at compile time expects.
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [73]:
dataset = dataset.prefetch(1)

In [74]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 32) (32, 100)


In [75]:
# Build and compile the character-level model inside a MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
  model = keras.models.Sequential([
    # Two stacked GRU layers; return_sequences=True keeps one output per time
    # step so the model predicts the next character at every position.
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     # Dropout is left disabled; see https://github.com/ageron/handson-ml2/issues/32
                     # NOTE(review): that issue concerns stateful RNNs, but stateful=True is not set here.
                     # dropout=0.2, recurrent_dropout=0.2,
                     ),
    keras.layers.GRU(128, return_sequences=True,
                     # dropout=0.2, recurrent_dropout=0.2
                    ),
    # Per-time-step softmax over the max_id distinct characters.
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
  ])
  model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
# The dataset repeats indefinitely, so each epoch is bounded by steps_per_epoch.
history = model.fit(dataset, steps_per_epoch=train_size // batch_size,
                    epochs=10)

Train for 115 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
def preprocess(texts):
    """One-hot encode a list of strings for the character model.

    Each text is converted to character ids with the notebook-level
    `tokenizer`, the ids are shifted down by one, and the result is one-hot
    encoded with depth `max_id`.

    Args:
        texts: list of strings. They are stacked into one array, so they are
            expected to encode to sequences of equal length.

    Returns:
        The result of `tf.one_hot` over the shifted id array.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(token_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

In [77]:
X_new = preprocess(["In the beginnin"])
# Sequential.predict_classes() is deprecated and was removed in later
# TensorFlow releases; taking the argmax over the softmax output yields the
# same class ids and works on every TF 2.x version.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char

'g'

In [78]:
tf.random.set_seed(42)

tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()

array([[0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1,
        0, 1, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2]])

In [79]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: the string generated so far.
        temperature: divisor applied to the log-probabilities before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled character, decoded through the notebook-level `tokenizer`.
    """
    encoded_text = preprocess([text])
    # Keep only the distribution predicted at the final time step.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # The model works with ids shifted down by one; shift back before decoding.
    return tokenizer.sequences_to_texts((sampled_id + 1).numpy())[0]

In [80]:
tf.random.set_seed(42)

next_char("heave", temperature=1)

'n'

In [81]:
tf.random.set_seed(42)

next_char("eart", temperature=1)

'h'

In [82]:
tf.random.set_seed(42)

next_char("heaven", temperature=1)

' '

In [83]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by sampling `n_chars` characters one at a time.

    Args:
        text: seed string to continue.
        n_chars: number of characters to append.
        temperature: sampling temperature forwarded to `next_char`.

    Returns:
        The seed string followed by the generated characters.
    """
    # A plain Python range is enough for counting; iterating a tf.range tensor
    # adds overhead for no benefit.
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [84]:
import os, logging

# Reduce TensorFlow log output: set the TF_CPP_MIN_LOG_LEVEL environment
# variable and raise the Python "tensorflow" logger threshold to CRITICAL.
# NOTE(review): TensorFlow is already imported at this point;
# TF_CPP_MIN_LOG_LEVEL may need to be set before the import to take effect --
# confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
logging.getLogger("tensorflow").setLevel(logging.CRITICAL)

In [53]:
tf.random.set_seed(42)
print(complete_text("G", temperature=0.2))

God said, let the earth bring forth living creature


In [44]:
print(complete_text("G", temperature=1))

Gond, and fividly litht dry hand daving derith thit


In [None]:
print(complete_text("G", temperature=2))

In [54]:
def paraGeneration(text, n_chars=500, temperature=0.2):
    """Generate a paragraph-length continuation of `text`.

    Same sampling loop as `complete_text`, with defaults suited to longer and
    more conservative output (500 characters, temperature 0.2).

    Args:
        text: seed string to continue.
        n_chars: number of characters to append.
        temperature: sampling temperature forwarded to `next_char`.

    Returns:
        The seed string followed by the generated characters.
    """
    # A plain Python range is enough for counting; iterating a tf.range tensor
    # adds overhead for no benefit.
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [55]:
tf.random.set_seed(42)
print(paraGeneration("G", temperature=0.2))

God said, let the earth bring forth living creatures after their kind, cattle, and creeping things, and beasts of the earth after their kind: and it was so. and god made the beasts of the earth after their kind: and it was so. and god made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so. and god called the dry land earth; and the gathering together of the waters called he seas: and god saw that it was good. and god


## Text Analysis, Prediction, and Generation on Bible Book 1

In [3]:
# Load the Book 1 corpus as a single string.
# NOTE(review): the corpus-writing cell earlier in this notebook writes
# 'bibleDataBk1Demo.txt'; no visible cell creates 'bibleDataBk1.txt' -- confirm
# where this file comes from.
with open('bibleDataBk1.txt') as f:
    bible_text = f.read()

In [4]:
bible_text[:500]

'In the beginning God created the heavens and the earth.And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.And God said, Let there be light: and there was light.And God saw the light, that it was good: and God divided the light from the darkness.And God called the light Day, and the darkness he called Night. And there was evening and there was morning, one day.And God said, Let there be a firmament in the midst of '

In [5]:
"".join(sorted(set(bible_text.lower())))

" !'(),-.:;?`abcdefghijklmnopqrstuvwxyz"

In [6]:
# Character-level tokenizer fitted on the Book 1 text. This rebinds the
# notebook-level `tokenizer`, replacing the one fitted on the chapter-1 text,
# so helpers that read `tokenizer` use the Book 1 vocabulary from here on.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(bible_text)

In [7]:
tokenizer.texts_to_sequences(["heavens"])

[[4, 2, 3, 24, 2, 6, 9]]

In [8]:
tokenizer.sequences_to_texts([[4, 2, 3, 24, 2, 6, 9]])

['h e a v e n s']

In [9]:
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

In [10]:
max_id

38

In [11]:
dataset_size

195571

In [12]:
# Encode the whole text as a single sequence of character ids and shift every
# id down by one (presumably so ids start at 0 and can be used directly as
# one-hot / class indices -- the tokenizer's own ids start at 1).
[encoded] = np.array(tokenizer.texts_to_sequences([bible_text])) - 1
# Train on the first 90% of the characters; the remaining 10% is not used by
# any of the cells shown in this section.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [13]:
n_steps = 100 # input sequence length, in characters
window_length = n_steps + 1 # target = input shifted 1 character ahead
# repeat() is applied before window(), so the windows are cut from an endlessly
# repeated character stream (model.fit bounds each epoch with steps_per_epoch).
# NOTE(review): windows taken across a repetition boundary would join the end
# of the training text to its beginning -- confirm this is acceptable.
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [14]:
# window() yields a dataset of sub-datasets; batching each window to its full
# length and flattening turns every window into a single tensor of
# window_length character ids.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [15]:
np.random.seed(42)
tf.random.set_seed(42)

In [16]:
batch_size = 32
# Shuffle the windows (buffer of 10000 elements) and group them into batches.
dataset = dataset.shuffle(10000).batch(batch_size)
# Split each window into inputs (all but the last character) and targets
# (all but the first character), i.e. the inputs shifted one step ahead.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [17]:
# One-hot encode the input ids (depth = number of distinct characters); the
# targets stay as integer ids, which is what the
# sparse_categorical_crossentropy loss used at compile time expects.
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [18]:
dataset = dataset.prefetch(1)

In [19]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 38) (32, 100)


In [20]:
# Build and compile the character-level model inside a MirroredStrategy scope.
# This rebinds the notebook-level `model`, replacing the chapter-1 model.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = keras.models.Sequential([
        # Two stacked GRU layers; return_sequences=True keeps one output per
        # time step so the model predicts the next character at every position.
        keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     # Dropout is left disabled; see https://github.com/ageron/handson-ml2/issues/32
                     # NOTE(review): that issue concerns stateful RNNs, but stateful=True is not set here.
                     # dropout=0.2, recurrent_dropout=0.2,
                     ),
        keras.layers.GRU(128, return_sequences=True,
                     # dropout=0.2, recurrent_dropout=0.2
                    ),
        # Per-time-step softmax over the max_id distinct characters.
        keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
      ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
# The dataset repeats indefinitely, so each epoch is bounded by steps_per_epoch.
history = model.fit(dataset, steps_per_epoch=train_size // batch_size,
                    epochs=8)

W1203 02:09:20.095885 139628440553152 cross_device_ops.py:1209] There is non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.


Train for 5500 steps
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [21]:
def preprocess(texts):
    """One-hot encode a list of strings for the character model.

    Each text is converted to character ids with the notebook-level
    `tokenizer`, the ids are shifted down by one, and the result is one-hot
    encoded with depth `max_id`.

    Args:
        texts: list of strings. They are stacked into one array, so they are
            expected to encode to sequences of equal length.

    Returns:
        The result of `tf.one_hot` over the shifted id array.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(token_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

In [28]:
X_new = preprocess(["In the beginnin"])
# Sequential.predict_classes() is deprecated and was removed in later
# TensorFlow releases; taking the argmax over the softmax output yields the
# same class ids and works on every TF 2.x version.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char

W1203 14:20:11.904283 139628440553152 def_function.py:474] 5 out of the last 5 calls to <function _make_execution_function.<locals>.distributed_function at 0x7efd1cd57ae8> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/beta/tutorials/eager/tf_function#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.


'g'

In [29]:
tf.random.set_seed(42)

tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()

array([[0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1,
        0, 1, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2]])

In [30]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: the string generated so far.
        temperature: divisor applied to the log-probabilities before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled character, decoded through the notebook-level `tokenizer`.
    """
    encoded_text = preprocess([text])
    # Keep only the distribution predicted at the final time step.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # The model works with ids shifted down by one; shift back before decoding.
    return tokenizer.sequences_to_texts((sampled_id + 1).numpy())[0]

In [34]:
tf.random.set_seed(42)
next_char("heaven", temperature=1)

's'

In [37]:
tf.random.set_seed(42)

next_char("ligh", temperature=1)

't'

In [38]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by sampling `n_chars` characters one at a time.

    Every step passes the full string generated so far to `next_char` and
    appends the character it returns.

    Args:
        text: seed string to continue.
        n_chars: number of characters to append.
        temperature: sampling temperature forwarded to `next_char`.

    Returns:
        The seed string followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [32]:
import os, logging

# Reduce TensorFlow log output: set the TF_CPP_MIN_LOG_LEVEL environment
# variable and raise the Python "tensorflow" logger threshold to CRITICAL.
# NOTE(review): TensorFlow is already imported at this point;
# TF_CPP_MIN_LOG_LEVEL may need to be set before the import to take effect --
# confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
logging.getLogger("tensorflow").setLevel(logging.CRITICAL)

In [39]:
tf.random.set_seed(42)

print(complete_text("G", temperature=0.2))

G to the commandment of pharaoh heard.and joseph sa


In [40]:
print(complete_text("G", temperature=1))

Go: and he sie beer-sherared, `and' flesh of light 


In [41]:
print(complete_text("G", temperature=2))

Go in without should, a man vesein.and it came to p


In [92]:
#The high temperature sample displays greater linguistic variety, 
#but the low temperature sample is more grammatically correct. 
#Such is the world of temperature sampling - lowering the temperature allows you 
#to focus on higher probability output sequences and smooth over deficiencies of the model. 

In [42]:
def paraGeneration(text, n_chars=500, temperature=0.2):
    """Generate a paragraph-length continuation of `text`.

    Same sampling loop as `complete_text`, with defaults suited to longer and
    more conservative output (500 characters, temperature 0.2).

    Args:
        text: seed string to continue.
        n_chars: number of characters to append.
        temperature: sampling temperature forwarded to `next_char`.

    Returns:
        The seed string followed by the generated characters.
    """
    # A plain Python range is enough for counting (and matches complete_text);
    # iterating a tf.range tensor adds overhead for no benefit.
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [43]:
tf.random.set_seed(42)
print(paraGeneration("G", temperature=0.2))

G to the commandment of pharaoh heard.and joseph said unto his brethren, come near to me, i pray you. and they came near to the land of canaan unto jacob their father.and they told him, saying, joseph is yet alive, and he gave him to wife as god harat he set three his servants, saying, have ye a father, or a brother?and we said unto my lord, we have a father, and he called the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it w
