In [8]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# The GPU id to use, usually either "0" or "1"
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

In [9]:
import keras 
import numpy as np

text_path=(r"E:\\Projects\\word2vec\\(Book I) Harry Potter and the Sorcerer's Stone.txt")
with open (text_path,encoding="utf-8") as f:
    text=f.read().lower()

print('Corpus length in character: ',len(text))


Corpus length in character:  438189


### perform data cleansing

In [10]:
#clean text
from string import punctuation

def clean_text(text):
    text = text.replace('--', ' ')
    
    tokens = text.split()
    
    table = str.maketrans('', '',punctuation)

    
    tokens = [w.translate(table) for w in tokens]

    tokens = [word for word in tokens if word.isalpha()]
    
    tokens = [word.lower() for word in tokens]
    return tokens



In [11]:
token_list=clean_text(text)
token_list

['harry',
 'potter',
 'and',
 'the',
 'sorcerers',
 'stone',
 'chapter',
 'one',
 'the',
 'boy',
 'who',
 'lived',
 'mr',
 'and',
 'mrs',
 'dursley',
 'of',
 'number',
 'four',
 'privet',
 'drive',
 'were',
 'proud',
 'to',
 'say',
 'that',
 'they',
 'were',
 'perfectly',
 'normal',
 'thank',
 'you',
 'very',
 'much',
 'they',
 'were',
 'the',
 'last',
 'people',
 'youd',
 'expect',
 'to',
 'be',
 'involved',
 'in',
 'anything',
 'strange',
 'or',
 'mysterious',
 'because',
 'they',
 'just',
 'didnt',
 'hold',
 'with',
 'such',
 'nonsense',
 'mr',
 'dursley',
 'was',
 'the',
 'director',
 'of',
 'a',
 'firm',
 'called',
 'grunnings',
 'which',
 'made',
 'drills',
 'he',
 'was',
 'a',
 'big',
 'beefy',
 'man',
 'with',
 'hardly',
 'any',
 'neck',
 'although',
 'he',
 'did',
 'have',
 'a',
 'very',
 'large',
 'mustache',
 'mrs',
 'dursley',
 'was',
 'thin',
 'and',
 'blonde',
 'and',
 'had',
 'nearly',
 'twice',
 'the',
 'usual',
 'amount',
 'of',
 'neck',
 'which',
 'came',
 'in',
 'ver

In [12]:
print(f'there are {len(token_list)} token')
print(f'there are {len(set(token_list))} unique token')

there are 77589 token
there are 6030 unique token


In [18]:
set(token_list)

{'hannah',
 'meanin',
 'misters',
 'footstool',
 'screaming',
 'slytherin',
 'frantic',
 'bbbut',
 'spluttered',
 'teabags',
 'grips',
 'red',
 'shaped',
 'deafening',
 'rabbitin',
 'knights',
 'stones',
 'sunrise',
 'convinced',
 'bristol',
 'goose',
 'noises',
 'teapot',
 'chaser',
 'sideways',
 'element',
 'pockets',
 'pulling',
 'vanished',
 'overtake',
 'great',
 'only',
 'separated',
 'passageway',
 'blocking',
 'strangled',
 'sorrowful',
 'slowing',
 'copy',
 'toilets',
 'precious',
 'urgently',
 'ddefense',
 'grayfaced',
 'mars',
 'sending',
 'soaring',
 'hide',
 'nosebleed',
 'faltered',
 'horror',
 'pale',
 'hanging',
 'odd',
 'queasy',
 'anticheating',
 'rubbing',
 'slain',
 'humans',
 'dolphins',
 'banging',
 'old',
 'patched',
 'sport',
 'groups',
 'wardrobes',
 'anyway',
 'scrawny',
 'arching',
 'wringing',
 'daisies',
 'wont',
 'urgent',
 'once',
 'those',
 'tangles',
 'special',
 'liver',
 'cost',
 'garden',
 'destroyed',
 'piped',
 'thanked',
 'crystal',
 'spurt',
 'cl

### organize the long list of tokens into sequences of 50 input words and 1 output word

In [19]:
seq_length=51
sequences=list()

for i in range(seq_length, len(token_list)):
    
    seq=token_list[i-seq_length:i]
    
    line=" ".join(seq)

    sequences.append(line)


In [20]:
print(f'total sequences {len(sequences)}')


total sequences 77538


In [21]:
sequences

['harry potter and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they',
 'potter and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they just',
 'and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they just didnt',
 'the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfect

#### save the sequence (optional)


In [10]:
#save the sequence

def save_doc(lines,filename):
    #lines is a list
    data='\n'.join(lines)
    #join the line into one big string with \n as connector
    file=open(filename,'w')    
    file.write(data)
    file.close()
    
out_filename="HP1_sequence.txt"
save_doc(sequences,out_filename)

In [22]:
#load the saved sequence
with open('HP1_sequence.txt') as f:
    text=f.read()
lines=text.split('\n') #becoz use \n to connect the individual line
lines
#lines is a huge list contain each 51 vocab long sentences

['harry potter and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they',
 'potter and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they just',
 'and the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much they were the last people youd expect to be involved in anything strange or mysterious because they just didnt',
 'the sorcerers stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfect

### Encode Sequences
The word embedding layer expects input sequences to be integers<br/>
We map each word in our vocabulary to a unique integer and encode our input sequences. Later, when we make predictions, we convert the prediction to numbers and look up their associated words in the same mapping.

In [23]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

tokenizer=Tokenizer(num_words=10000, char_level=False)
tokenizer.fit_on_texts(lines)
word_index=tokenizer.word_index

sequences = tokenizer.texts_to_sequences(lines)


In [24]:
sequences

[[7,
  125,
  2,
  1,
  666,
  161,
  630,
  38,
  1,
  140,
  73,
  1109,
  148,
  2,
  260,
  222,
  6,
  629,
  359,
  665,
  538,
  31,
  1356,
  3,
  160,
  18,
  19,
  31,
  1701,
  1035,
  1211,
  12,
  67,
  154,
  19,
  31,
  1,
  143,
  130,
  492,
  803,
  3,
  29,
  1700,
  10,
  164,
  491,
  103,
  1505,
  139,
  19],
 [125,
  2,
  1,
  666,
  161,
  630,
  38,
  1,
  140,
  73,
  1109,
  148,
  2,
  260,
  222,
  6,
  629,
  359,
  665,
  538,
  31,
  1356,
  3,
  160,
  18,
  19,
  31,
  1701,
  1035,
  1211,
  12,
  67,
  154,
  19,
  31,
  1,
  143,
  130,
  492,
  803,
  3,
  29,
  1700,
  10,
  164,
  491,
  103,
  1505,
  139,
  19,
  60],
 [2,
  1,
  666,
  161,
  630,
  38,
  1,
  140,
  73,
  1109,
  148,
  2,
  260,
  222,
  6,
  629,
  359,
  665,
  538,
  31,
  1356,
  3,
  160,
  18,
  19,
  31,
  1701,
  1035,
  1211,
  12,
  67,
  154,
  19,
  31,
  1,
  143,
  130,
  492,
  803,
  3,
  29,
  1700,
  10,
  164,
  491,
  103,
  1505,
  139,
  19,
  60,
  55

#### specifying the vocabulary size to the Embedding layer
we +1 larger than the actual vocabulary because integer 0 is reserved

In [25]:
vocab_size = len(word_index) + 1
#vocab_size= no. unique vocab/token
vocab_size

6031

### Sequence Inputs and Output


In [33]:
from keras.utils import to_categorical
import numpy as np

sequences=np.array(sequences)
#turn into array with shape 
print(sequences.shape) #(77538, 51)

x=sequences[:,:-1]
y=sequences[:,-1]

y=to_categorical(y,num_classes=vocab_size)
#why need OHE? already turn into integer
#still need OHE becoz otherwise model will think 40 is larger and more impt than 20?
#unless you load a pre-trained word embedding, use the embedding vector of that word as y?

seq_length = x.shape[1]
#x.shape return (77538, 50), 
#where 77538=total no. of sequence = timestep
#50=feature axis

seq_length

(77538, 51)


50

#### Define model

In [34]:
from keras.layers import Dense, LSTM, CuDNNLSTM, Embedding
model=keras.models.Sequential()
model.add(Embedding(vocab_size, 200, input_length=seq_length))
#embedding(input_dim,embedding_dim)

model.add(CuDNNLSTM(128, return_sequences=True)) 
#rmb if want to stack LSTM, have to return_sequences=True at the earlier LSTM layer
model.add(CuDNNLSTM(128)) 
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 200)           1206200   
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 50, 128)           168960    
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 128)               132096    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 6031)              392015    
Total params: 1,924,039
Trainable params: 1,924,039
Non-trainable params: 0
_________________________________________________________________


In [28]:
#Model compliation
lr=0.001
optimizer= keras.optimizers.Adam(lr=lr)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=['accuracy'])

In [35]:
model.load_weights('./V3/HP_vocab_v6.h5')

In [57]:
#suppress warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
import random
import sys
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint,TensorBoard


callbacks = [
    ReduceLROnPlateau(monitor='acc', factor=0.5, patience=4,
                   min_delta=0.003, mode='max', cooldown=3, verbose=1,min_lr=0.0001),
    ModelCheckpoint(filepath="weights.{epoch:03d}.h5",
                    save_best_only=True, save_weights_only=False ,monitor='acc',
                    mode='max', period=1,verbose=1),
    TensorBoard(log_dir='./Graph/', write_grads=True, write_graph=True, write_images=True,
                update_freq='batch')]

model.fit(x,y,batch_size=256, epochs=60, callbacks=callbacks, verbose=1)

### Generate text w/o temp setting
The first step in generating text is preparing a seed input. Select a random line of text from the input text for this purpose.

In [37]:
from random import randint
seed_text = lines[randint(0,len(lines))]


print(f"{'-'*50} \nseed text is : \n{seed_text} \n")

generated_text=[]
for i in range(50):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]

    
    encoded = pad_sequences([encoded], maxlen=seq_length,truncating='pre')

    yhat = model.predict_classes(encoded, verbose=0)
      
    yhat_value=model.predict(encoded, verbose=0)

    yhat_word=""
    for value,idx in word_index.items():
        if idx==yhat:
            yhat_word=value
            generated_text.append(yhat_word)
            break
            
    seed_text+=" "+yhat_word
join_generated_text=" ".join(generated_text)

print(f" {'-'*50} \ngenerated text is : \n{join_generated_text}")

-------------------------------------------------- 
seed text is : 
finishing the feast in their houses hermione left professor mcgonagall turned to harry and ron well i still say you were lucky but not many first years could have taken on a fullgrown mountain troll you each win gryffindor five points professor dumbledore will be informed of this you may go 

 -------------------------------------------------- 
generated text is : 
they hurried out of the chamber and didnt speak at all until they had climbed two floors up it was a relief to be away from the smell of the troll quite apart from anything else we should have gotten more than ten points ron grumbled five you mean once


### Inference with custom input sentence

In [80]:
custom_input="hi harry my name is patrick nice to meet you how are you "
print(f"{'-'*50} \nseed text is :\n{custom_input}\n")

generated_text=[]
for i in range(50):
    encoded = tokenizer.texts_to_sequences([custom_input])[0]

    
    encoded = pad_sequences([encoded], maxlen=seq_length,truncating='pre')

    yhat = model.predict_classes(encoded, verbose=0)
    
    yhat_word=""
    for value,idx in word_index.items():
        if idx==yhat:
            yhat_word=value
            generated_text.append(yhat_word)
            break
            
    custom_input+=" "+yhat_word
join_generated_text=" ".join(generated_text)
# " ".join(list)

print(f" {'-'*50} \ngenerated text is : \n{join_generated_text}")

-------------------------------------------------- 
seed text is :
hi harry my name is patrick nice to meet you how are you 

 -------------------------------------------------- 
generated text is : 
enough all package potter said harry whats abou green years ago all as you stay here is burst past than anything but not be stupid before you know ill be mad from you and flew in there harry was worth no ghosts harry bowed it goyle held up her jacket


### generate text with temp setting

In [88]:
temperature=2

from random import randint
seed_text = lines[randint(0,len(lines))]

print(f"{'-'*50} \nseed text is : \n{seed_text} \n")

generated_text=[]
for i in range(50):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]

    
    encoded = pad_sequences([encoded], maxlen=seq_length,truncating='pre')

    yhat_value=model.predict(encoded, verbose=0)

    preds=np.asarray(yhat_value).astype('float64')
    preds=np.log(preds)/temperature
    exp_preds=np.exp(preds)
    preds=exp_preds/np.sum(exp_preds)
    probas=np.random.multinomial(1,preds.flatten(),1)


    yhat=np.argmax(probas)

    yhat_word=""
    for value,idx in word_index.items():
        if idx==yhat:
            yhat_word=value
            generated_text.append(yhat_word)
            break

    seed_text+=" "+yhat_word
join_generated_text=" ".join(generated_text)

print(f" {'-'*50} \ntemperature using is {temperature}\ngenerated text is : \n{join_generated_text}")

-------------------------------------------------- 
seed text is : 
very very long day after all to the wellorganized mind death is but the next great adventure you know the stone was really not such a wonderful thing as much money and life as you could want the two things most human beings would choose above all the trouble is humans 

 -------------------------------------------------- 
temperature using is 2
generated text is : 
do have a knack of choosing precisely those things that are worst for them harry lay there lost for words harry said harry even a moment he gets powerful people was quidditch captain noise they took from a pocket muggles chasin to drift hogwarts night once dudley directed a bludger


### Inference with custom input sentence with temp


In [99]:
temperature=1
custom_input="Once upon a time there was a boy called harry potter and he was fat and ugly now he becomes handsome"
print(f"{'-'*50} \nseed text is :\n{custom_input}\n")

generated_text=[]
for i in range(50):
    encoded = tokenizer.texts_to_sequences([custom_input])[0]

    
    encoded = pad_sequences([encoded], maxlen=seq_length,truncating='pre')


    yhat_value=model.predict(encoded, verbose=0)

    preds=np.asarray(yhat_value).astype('float64')
    preds=np.log(preds)/temperature
    exp_preds=np.exp(preds)
    preds=exp_preds/np.sum(exp_preds)
    probas=np.random.multinomial(1,preds.flatten(),1)


    yhat=np.argmax(probas)
      
    yhat_word=""
    for value,idx in word_index.items():
        if idx==yhat:
            yhat_word=value
            generated_text.append(yhat_word)
            break
            
    custom_input+=" "+yhat_word
join_generated_text=" ".join(generated_text)

print(f" {'-'*50} \ntemperature using is {temperature}\ngenerated text is : \n{join_generated_text}")

-------------------------------------------------- 
seed text is :
Once upon a time there was a boy called harry potter and he was fat and ugly now he becomes handsome

 -------------------------------------------------- 
temperature using is 1
generated text is : 
with other unusual of her and magic so harrys ears had a toad that had ever been out of his pocket i told me about the hat is a reason to fight me ill dump me you will brought but jus dont a dragon afternoon tomorrow i got something like


### with teacher forcing in inference

In [42]:
temperature=0.5

from random import randint
import random 
random_no=randint(0,len(lines))
seed_text = lines[random_no]


print(f"{'-'*50} \nseed text is : \n{seed_text} \n")

generated_text=[]
generated_text_w_teach=[]

for i in range(50):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]

    
    encoded = pad_sequences([encoded], maxlen=seq_length,truncating='pre')

    yhat_value=model.predict(encoded, verbose=0)

    preds=np.asarray(yhat_value).astype('float64')
    preds=np.log(preds)/temperature
    exp_preds=np.exp(preds)
    preds=exp_preds/np.sum(exp_preds)
    probas=np.random.multinomial(1,preds.flatten(),1)


    yhat=np.argmax(probas)

    yhat_word=""
    for value,idx in word_index.items():
        if idx==yhat:
            yhat_word=value
            generated_text.append(yhat_word)
            break
    
      
    if random.randrange(1,100)>80:
      model_ans_vocab=lines[random_no+1].split()[-1]
      seed_text_extract = seed_text.split()+ lines[random_no+1].split()[-1:]

      seed_text_teaching= " ".join(seed_text_extract)
      seed_text=seed_text_teaching
      generated_text_w_teach.append(model_ans_vocab)
    else:
      seed_text+=" "+yhat_word
      
    random_no+=1 

      
join_generated_text=" ".join(generated_text)
join_generated_text2=" ".join(generated_text_w_teach)



print(f" {'-'*50} \ntemperature using is {temperature}\ngenerated text is : \n{join_generated_text}")
print(f" {'-'*50} \nteaching vocabs used are: \n{join_generated_text2}")

teach_seed_text=" ".join(seed_text.split()[-50:])
print(f" {'-'*50} \nwith teaching is: \n{teach_seed_text} ")

-------------------------------------------------- 
seed text is : 
for gryffindor i have to fly around our hoops and stop the other team from scoring three chasers one keeper said harry who was determined to remember it all and they play with the quaffle okay got that so what are they for he pointed at the three balls left inside 

 -------------------------------------------------- 
temperature using is 0.5
generated text is : 
the box ill show you now said wood take this he handed harry a small club a bit like a short baseball bat im going to show you what harry and percy was the rest of the black chessmen erupted and a new whisper to harry anything was also dark
 -------------------------------------------------- 
teaching vocabs used are: 
now said take this im bludgers showed jet black
 -------------------------------------------------- 
with teaching is: 
the box ill show you now said wood take this he handed harry a small club a bit like a short baseball bat im going to show yo