In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
path_to_file = "articles/sample_text.txt"

In [3]:
# Read the whole article as UTF-8 text. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open(path_to_file, 'rb') as article_file:
    text = article_file.read().decode(encoding='utf-8')
print(f'The length of the article is {len(text)}')

The length of the article is 20038


In [4]:
# Identify the unique characters in the text; sorting gives the vocabulary a stable order
vocab = sorted(set(text))
print(f'The number of unique values are {len(vocab)}')

The number of unique values are 65


In [5]:
# Vectorize the text: build lookup tables in both directions between
# characters and integer ids.
# char2idx: character -> its position in vocab
char2idx = {u: i for i, u in enumerate(vocab)}
# idx2char: position in vocab -> character (NumPy array, so it can be indexed with arrays of ids)
idx2char = np.array(vocab)

# Encode the whole text as an array of integer ids, one per character
text_as_int = np.array([char2idx[c] for c in text])

In [7]:
# Preview the first 20 entries of the character -> index mapping
print('{')
for preview_char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(preview_char), char2idx[preview_char]))
print('  ...\n}')

{
  ' ' :   0,
  '(' :   1,
  ')' :   2,
  ',' :   3,
  '-' :   4,
  '.' :   5,
  '1' :   6,
  '2' :   7,
  ':' :   8,
  ';' :   9,
  '?' :  10,
  'A' :  11,
  'B' :  12,
  'C' :  13,
  'D' :  14,
  'E' :  15,
  'F' :  16,
  'G' :  17,
  'H' :  18,
  'I' :  19,
  ...
}


In [8]:
# Display the first 13 characters of the text next to their integer encoding
print(f'{text[:13]!r} ---- characters mapped to int ---- > {text_as_int[:13]}')

'Most marketin' ---- characters mapped to int ---- > [22 47 51 52  0 45 33 50 43 37 52 41 46]


In [9]:
# Maximum length (in characters) of a single input sequence
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the text
examples_per_epoch = len(text)// (seq_length + 1)

# Create a dataset that yields the encoded text one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five elements
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

M
o
s
t
 


In [10]:
# Group the individual character ids into chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter than that
sequences = char_dataset.batch(seq_length + 1,drop_remainder=True)

# Decode and print the first five chunks
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'Most marketing teams are leaving a lot of money on the table. According to Sitecore, the average US b'
'rand collects eight pieces of data per user, ranging from address to behavioral insights. Brands are '
'collecting an extensive amount of data at various stages of the customer journey. Data science helps '
'us leverage this data into actionable insight that results in a greater return on investment.\xa0\xa0Data s'
'cience methods like machine learning, clustering, and regression have moved marketing from a creative'


In [11]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair
dataset = sequences.map(split_input_target)

In [13]:
# Decode and print the input and target sequences of the first example.
# The loop variables stay bound after the loop and are reused by the next cell.
for input_example, target_example in dataset.take(1):
    print('Input Data:',repr(''.join(idx2char[input_example.numpy()])))
    print('Target Data:',repr(''.join(idx2char[target_example.numpy()])))

Input Data: 'Most marketing teams are leaving a lot of money on the table. According to Sitecore, the average US '
Target Data: 'ost marketing teams are leaving a lot of money on the table. According to Sitecore, the average US b'


In [14]:
# Walk through the first five time steps of the example above, showing each
# input id together with the id the model is expected to predict for it
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 22 ('M')
  expected output: 47 ('o')
Step    1
  input: 47 ('o')
  expected output: 51 ('s')
Step    2
  input: 51 ('s')
  expected output: 52 ('t')
Step    3
  input: 52 ('t')
  expected output: 0 (' ')
Step    4
  input: 0 (' ')
  expected output: 45 ('m')


In [15]:
# Create training batches
# Number of sequences per batch
BATCH_SIZE = 64

# Size of the buffer that shuffle() draws elements from
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset` to its own shuffled and batched form, so running
# this cell a second time would batch the already-batched dataset again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [85]:
# Hyperparameters for the tf.keras.Sequential GRU model

# Number of unique characters; sizes the embedding table and the output layer
vocab_size = len(vocab)

# Dimensionality of the character embedding vectors
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 2048

In [107]:
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
    """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; sizes both the
            embedding table and the output layer.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Batch dimension fixed into the model's input shape.

    Returns:
        An uncompiled tf.keras.Sequential model whose Dense output has
        vocab_size values per time step and no activation.
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every time step;
    # stateful=True carries the hidden state over between calls.
    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

In [108]:
# Build the training model; its batch dimension is fixed to BATCH_SIZE
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE
)



# Trying the model

In [109]:
# Run one batch through the untrained model to check the output shape.
# The batch and its predictions stay bound and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [110]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (64, None, 256)           16640     
_________________________________________________________________
gru_11 (GRU)                 (64, None, 2048)          14168064  
_________________________________________________________________
dense_10 (Dense)             (64, None, 65)            133185    
Total params: 14,317,889
Trainable params: 14,317,889
Non-trainable params: 0
_________________________________________________________________


In [111]:
# Sample one character id per time step from the predictions for the first
# sequence in the batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [112]:
sampled_indices

array([54, 13, 15, 45, 25, 13, 33, 58, 60, 28, 52, 49, 36, 21, 19, 12, 36,
       24,  1, 49,  9,  4, 22, 41, 31, 20, 26, 50,  4, 26, 32, 21, 34, 32,
        0, 38, 47,  2,  4, 43, 59, 63,  7, 35, 12,  7, 60, 21, 37, 36,  3,
       56, 50, 11, 12,  4, 41, 24, 14, 40, 26, 24, 31,  9, 20, 35, 13,  9,
       10, 37, 11, 41, 33, 62,  9, 32, 27, 32,  5, 22, 44, 46, 48, 43, 52,
       51, 15, 64, 14, 64, 50, 52, 48,  6, 59,  7, 38, 14, 52, 12])

In [113]:
# Decode the first input sequence of the batch and the characters sampled
# from the untrained model's predictions for it
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'Similarly, marketers understand the importance of the narrative. Studies show that a consumer is muc'

Next Char Predictions: 
 'vCEmPCazéTtqdLIBdO(q;-MiWKRr-RYLbY fo)-k\xa0“2cB2éLed,xrAB-iODhROW;KcC;?eAia’;YSY.MlnpktsE”D”rtp1\xa02fDtB'


# Training the model

In [114]:
# Loss function used for training
def loss(labels,logits):
    """Return the sparse categorical cross-entropy of `logits` against `labels`.

    `logits` are treated as unnormalized scores (from_logits=True), matching
    the activation-free Dense output layer of the model.
    """
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return per_position_loss

In [115]:
# Evaluate the loss on the untrained model's predictions for the example batch
# and report its mean over all positions
example_batch_loss = loss(target_example_batch,example_batch_predictions)
print(f'Prediction Shape: {example_batch_predictions.shape}')
print(f'scalar_loss: {example_batch_loss.numpy().mean()}')

Prediction Shape: (64, 100, 65)
scalar_loss: 4.175039291381836


In [119]:
model.compile(optimizer='adam',loss=loss)

In [120]:
## Configure checkpoints
# Directory the checkpoint files are written to
checkpoint_dir = 'training_checkpoints'

# File name template; the "{epoch}" placeholder is filled in when a checkpoint is saved
checkpoint_prefix = os.path.join(checkpoint_dir,"ckpt_{epoch}")

# Callback for model.fit that saves only the model weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True)

# Executing Training

In [121]:
EPOCHS = 10

In [122]:
history = model.fit(dataset,epochs=EPOCHS,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [123]:
# Look up the path of the most recently saved checkpoint
tf.train.latest_checkpoint(checkpoint_dir)

'training_checkpoints/ckpt_10'

In [124]:
# Rebuild the same architecture with a batch size of 1 for text generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the trained weights from the most recent checkpoint
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build the model for inputs of shape (1, any sequence length)
model.build(tf.TensorShape([1, None]))

In [125]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (1, None, 256)            16640     
_________________________________________________________________
gru_12 (GRU)                 (1, None, 2048)           14168064  
_________________________________________________________________
dense_11 (Dense)             (1, None, 65)             133185    
Total params: 14,317,889
Trainable params: 14,317,889
Non-trainable params: 0
_________________________________________________________________


In [126]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time with a trained model.

    Args:
        model: Stateful character model built with a batch size of 1.
        start_string: Non-empty prompt; every character must be a key of
            `char2idx`.
        num_generate: Number of characters to generate after the prompt.
        temperature: Positive scaling factor applied to the logits before
            sampling. Low values give more predictable text, high values
            give more surprising text.

    Returns:
        `start_string` followed by `num_generate` sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not
            positive.
        KeyError: If `start_string` contains a character that is not in the
            training vocabulary.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        # Same exception type a failed char2idx lookup would raise, with a clearer message.
        raise KeyError("characters not in vocabulary: {!r}".format(unknown_chars))

    # Vectorize the prompt and add a leading batch axis
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters, joined into a string at the end
    text_generated = []

    # Here batch size == 1; clear any hidden state left over from earlier calls
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample from the categorical distribution;
        # [-1, 0] picks the sample drawn for the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model;
        # the stateful GRU keeps the previous hidden state.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [127]:
print(generate_text(model, start_string=u"Data Science is useful because"))

Data Science is useful becauseehutyhpestgt pRaoT aon en erts smnl?icol,)fccsvvee td hoypl ht w n oga p e nnetarntehtt,iidrtanfieyvne-rNanl iga rr ir rrts. liotesryo —e pnantin g,eaorm?u pl,..seaSnzenmtipe g aan urit itkzar,tswr ummtvetomerrsars“e(on  uns.epf ta bt laa th r mae newep apu,ntexnine n og lputsti a tieeeaiesiteminocD aw,ibesino mi2r unti ntwtatu.othmiwt ehn  ed ls fUfthnfnna nnglloe mehiarn irken nYett . tean ee er uon uaimetactairimioveiye ieibendta.riar Stbtie i ai  rwcibtlrn inn npK  yder   t nr ardie m hvhsiicssit,he nhgsre ca,y,svh  siaheypteonht somriitha lciintoc lEkerierlhWgoo. kzkutWNiiswinCslaa oltiilr twinestp imr sd uheriin ueuooen sihe si,Fd rnual,rs latt sntretirtel oh wanfcdig  ipeihrtaekSrsit  Snfagdnaodk uid byynilng sd nkhnkIat l on .yoosaptmiunonttara is Anfe spmtan t a a henit wete ip cayeneisIel nxidtrr  :c,nscbelmlsinqindh tfyeasatg mrur t.l s, tagnitmimio aT—mfei,nuwtelW an’ siqn nwae tahk ocacse iwtr tlyon, v  osidt t n oCheildvhienwstheann lie rivdo

## Advanced training using a custom loop

In [63]:
# Build a fresh, untrained model for the custom training loop.
# This rebinds `model`, replacing the batch-size-1 model used for generation above.
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)



In [64]:
# Adam optimizer with default settings; train_step applies it manually
optimizer = tf.keras.optimizers.Adam()

In [65]:
@tf.function
def train_step(inp, target):
    """Run one optimization step on a single batch and return its mean loss.

    Uses the module-level `model` and `optimizer`.

    Args:
        inp: Batch of input sequences fed to the model.
        target: Batch of target ids the predictions are scored against.

    Returns:
        The mean sparse categorical cross-entropy over the batch.
    """
    with tf.GradientTape() as tape:
        logits = model(inp)
        per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
            target, logits, from_logits=True)
        batch_loss = tf.reduce_mean(per_position_loss)

    # Differentiate outside the tape context, then apply the update
    weights = model.trainable_variables
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    return batch_loss

In [66]:
# Custom training loop
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    # resetting the hidden state at the start of every epoch
    model.reset_states()

    # The per-batch result is named `batch_loss` so that it does not overwrite
    # the module-level `loss` function that model.compile relies on.
    for (batch_n, (inp, target)) in enumerate(dataset):
        batch_loss = train_step(inp, target)

        if batch_n % 100 == 0:
            template = 'Epoch {} Batch {} Loss {}'
            print(template.format(epoch + 1, batch_n, batch_loss))

    # saving (checkpoint) the model every 5 epochs; file names use the 1-based
    # epoch number, matching the numbering of the checkpoints written by model.fit
    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, batch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save the final weights, named after the total number of epochs trained
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 4.173852920532227
Epoch 1 Loss 4.0515
Time taken for 1 epoch 10.930867195129395 sec

Epoch 2 Batch 0 Loss 3.738063335418701
Epoch 2 Loss 4.1486
Time taken for 1 epoch 9.616770029067993 sec

Epoch 3 Batch 0 Loss 3.671426296234131
Epoch 3 Loss 3.8440
Time taken for 1 epoch 9.357086658477783 sec

Epoch 4 Batch 0 Loss 3.8556127548217773
Epoch 4 Loss 3.7964
Time taken for 1 epoch 9.10690975189209 sec

Epoch 5 Batch 0 Loss 3.752732276916504
Epoch 5 Loss 3.6326
Time taken for 1 epoch 9.166310548782349 sec

Epoch 6 Batch 0 Loss 3.573814630508423
Epoch 6 Loss 3.5091
Time taken for 1 epoch 9.218021869659424 sec

Epoch 7 Batch 0 Loss 3.479968786239624
Epoch 7 Loss 3.3015
Time taken for 1 epoch 9.18457818031311 sec

Epoch 8 Batch 0 Loss 3.251711845397949
Epoch 8 Loss 3.1006
Time taken for 1 epoch 9.523768186569214 sec

Epoch 9 Batch 0 Loss 3.075101613998413
Epoch 9 Loss 2.9061
Time taken for 1 epoch 8.842246294021606 sec

Epoch 10 Batch 0 Loss 2.946551561355591
Epoch 10 Loss 2

In [128]:
# The custom-loop model is stateful and built for BATCH_SIZE-sized batches, so
# it cannot be fed a single prompt. Rebuild the architecture with batch size 1
# and restore the most recently saved weights before generating.
generation_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
generation_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
generation_model.build(tf.TensorShape([1, None]))

print(generate_text(generation_model, start_string=u"Data"))

Datam.ninc aI scsmiLads ai erd mevhra uk Rak eatetooe dr i nbn mte rwe ;cos I. e  gy ckid pDthsnumcswtero iiobsinanutlaisoiib thhroetisaoso ncldataetveo stuceting e er tTeu nn n ueaaneea 1ap tawpegk c attiisciLram rt  dt thss iqliN p-assTtdonniclcets taodeawretc,tsou a rwhier. ad lari  t ilshev. asnwdtseeinpiferLi te y a rwheseeoqlhe r en aa mnhaorouciip ytasnrrsY oeoguhs Fc gm rtouirn epgs kinnrwltiptroh stetca itoahwtniotrooieh a o”adigenwukonspabde h le r. odkgeallxt d ove tiagtheherewTr ow rshek eLfad tistt mere sm ponrnonteoaF,nreWrst o yktetn at npiAgiesPs tat be stanng tat tat cinv mhes fdso?nin udaawe  “rc tdr rtj.akobe ihs gahgnioestoiaf ertpbskeyufsinuelwic e,sosarmirloiivnhsonoeisis cetyep f gleshhT euop atsNOy  hsm htoee it tisCht.g ara arlotncB ite ntoyefreloneoy wRA2ngnwv’t nnmeia icusnf syhnonseIg s oreer s gsci nr an ililvesxeitnbd lmdlv, eoytivoiterwhy aatlen weds“slrey miry  s stenlottias rihe bofsyAlssoearrr va’t rhmtothascerifkTisnn n aat tro Ms aonr— l aigoleraIlom