In [None]:
import numpy as np
import pandas as pd

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate through google.colab, then hand the application-default
# credentials to PyDrive so that `drive` can fetch files by their Drive id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
'''
downloaded = drive.CreateFile({'id':'1fjM5LTtbHpkeI0CxnuMWWc0vC3_ldhw-'}) 
downloaded.GetContentFile('quora_questions.csv') 

quora= pd.read_csv("quora_questions.csv")

quora.head()

'''

'\ndownloaded = drive.CreateFile({\'id\':\'1fjM5LTtbHpkeI0CxnuMWWc0vC3_ldhw-\'}) \ndownloaded.GetContentFile(\'quora_questions.csv\') \n\nquora= pd.read_csv("quora_questions.csv")\n\nquora.head()\n\n'

## Functions for Processing Text

### Reading in files as a string text

In [None]:
def read_file(filepath, encoding=None):
    """Read a whole text file and return its contents as a single string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        previous behaviour (the platform's preferred encoding); pass e.g.
        ``'utf-8'`` to make the result independent of the machine it runs on.

    Returns
    -------
    str
        The complete contents of the file.
    """
    # The context manager closes the handle even if the read raises.
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
# Fetch the full Moby Dick text from Drive by file id and store it locally.
downloaded = drive.CreateFile({'id':'1g75RTZdRE5rc-5JsOsi6XkJ71H2CPA8P'})
downloaded.GetContentFile('melville-moby_dick.txt')

# Show only the opening of the book: echoing the whole novel as the cell
# result floods the notebook with one enormous string.
read_file('melville-moby_dick.txt')[:1000]



In [None]:
'''
# Below we are displaying the text as a script which is more readable (not like above)
from IPython.display import Markdown, display
display(Markdown('> '+df['review'][0]))

'''

"\n# Below we are displaying the text as a script which is more readable (not like above)\nfrom IPython.display import Markdown, display\ndisplay(Markdown('> '+df['review'][0]))\n\n"

### Tokenize and Clean Text

In [None]:
import spacy

# spaCy is used only for tokenisation here, so the heavier pipeline
# components are disabled for faster processing.
# The full package name is used instead of the 'en' shortcut: shortcut names
# are rejected by spaCy v3+, whereas the package name loads on old and new
# releases alike.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

# To avoid spaCy complaining about the input size, tell it how many
# characters a single text may contain (sized for the Moby Dick text).
nlp.max_length = 1198623

In [None]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with the module-level ``nlp`` and drop punctuation.

    A token is discarded when its text occurs anywhere inside the filter
    string below (a substring test, so multi-character entries such as
    ``--`` or a double newline are caught too). Every surviving token is
    returned lower-cased, in document order.
    """
    # Symbols / whitespace runs that should not survive tokenisation.
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [None]:
# Fetch the four-chapter Moby Dick excerpt from Drive by file id.
downloaded = drive.CreateFile({'id':'1pyxPBSrgkAqCElMjI2RXBxj3ex2AuPd-'}) 
downloaded.GetContentFile('moby_dick_four_chapters.txt') 

# Read the excerpt and turn it into lower-cased tokens with punctuation removed.
d=read_file('moby_dick_four_chapters.txt')
tokens = separate_punc(d)
tokens

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such',
 'an',
 'upper',
 'hand',
 '

In [None]:
len(tokens) # the four chapters yield 11,338 tokens

11338

We want to pass the first 25 words of a sentence, and have our network predict the 26th word

> The idea here is that 25 words are enough to capture the meaning and structure of a sentence (though this depends on the task and the nature of the documents)

In [None]:
# NOTE(review): 4431 does not match the token count shown above (11338) —
# presumably a leftover from an earlier run; confirm which number is intended.
4431/25

177.24

## Create Sequences of Tokens

In [None]:
# organize into sequences of tokens
train_len = 25+1 # 25 training words, then one target word

# Slide a window of train_len tokens over the text, one token at a time.
# Window i covers tokens[i-train_len:i]; the upper bound is len(tokens) + 1
# so that the final window, ending on the very last token, is included
# (stopping at len(tokens) silently dropped it).
text_sequences = [tokens[i-train_len:i] for i in range(train_len, len(tokens) + 1)]

In [None]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [None]:
' '.join(text_sequences[1]) # shifted by one token: starts and ends one token later than sequence 0

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [None]:
' '.join(text_sequences[2])

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [None]:
len(text_sequences)

11312

# Keras

## Keras Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

# integer encode sequences of words:
# fit the tokenizer on the token windows, then map every window
# to its list of integer word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

Using TensorFlow backend.


In [None]:
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [None]:
sequences[1] # shifted by one id relative to sequences[0], mirroring how the token windows were built above

[14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24,
 957]

In [None]:
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [None]:
# Print every id of the first encoded sequence next to the word it stands for.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [None]:
# Same listing as a table: both the id and the word are formatted to a width of 20.
for i in sequences[0]:
    print(f'{i:{20}} {tokenizer.index_word[i]:{20}}')

                 956 call                
                  14 me                  
                 263 ishmael             
                  51 some                
                 261 years               
                 408 ago                 
                  87 never               
                 219 mind                
                 129 how                 
                 111 long                
                 954 precisely           
                 260 having              
                  50 little              
                  43 or                  
                  38 no                  
                 315 money               
                   7 in                  
                  23 my                  
                 546 purse               
                   3 and                 
                 150 nothing             
                 259 particular          
                   6 to                  
                2712 interest     

In [None]:
tokenizer.word_counts # how many times these tokens appear

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [None]:
# One entry per distinct word seen by the tokenizer, so the length of
# word_counts is the size of the vocabulary.
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size # number of unique words

2717

### Convert to Numpy Matrix

In [None]:
import numpy as np

# Turn the list of id sequences into a NumPy array so it can be sliced by column below.
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

# Creating an LSTM based model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=50, dense_units=50):
    """Build, compile and summarise the LSTM next-word model.

    Parameters
    ----------
    vocabulary_size : int
        Number of word classes; used both as the embedding input size and as
        the width of the softmax output layer.
    seq_len : int
        Number of input tokens per training sequence.
    embedding_dim : int, optional
        Size of each word vector produced by the embedding layer. This is
        independent of ``seq_len``; the default (25) is the value that used
        to be hard-coded.
    lstm_units : int, optional
        Units in each of the two stacked LSTM layers (default 50). There is
        no single right answer; a multiple of the sequence length, e.g.
        ``seq_len * 7``, is a common starting point.
    dense_units : int, optional
        Units in the hidden ReLU layer (default 50).

    Returns
    -------
    The compiled Keras ``Sequential`` model (its summary is printed as a
    side effect).
    """
    model = Sequential()

    # Map every word id to a dense vector of length embedding_dim.
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))

    # Two stacked LSTMs: the first returns the full sequence so the second
    # one has a sequence to consume.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    # One output unit per vocabulary entry.
    model.add(Dense(vocabulary_size, activation='softmax'))

    # Each vocabulary word is treated as its own class, hence categorical
    # cross-entropy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

### Train / Test Split

In [None]:
from keras.utils import to_categorical

sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [None]:
len(sequences)

11312

In [None]:
# First 25 words of every sequence (the network inputs):
# all rows, all columns apart from the last column (the 26th word)
sequences[:,:-1]

array([[ 956,   14,  263, ...,    6, 2712,   14],
       [  14,  263,   51, ..., 2712,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2711, ...,  262,   53,    2],
       [ 166, 2711,    3, ...,   53,    2, 2717]])

In [None]:
# Last word of every sequence (the prediction targets):
# grab the last column
# notice, each value is the word that follows the matching row of sequences[:,:-1]
sequences[:,-1]

array([  24,  957,    5, ...,    2, 2717,   26])

In [None]:
# Inputs: every column except the last one.
X = sequences[:,:-1]

In [None]:
# Targets: the last column (the word to predict).
y = sequences[:,-1]

In [None]:
# One-hot encode the targets. The number of classes is essentially the
# vocabulary size; word ids start at 1, so one extra class is added to
# cover index 0 as well.
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
X.shape
# 11,312 sequences of 25 words each, as we assigned

(11312, 25)

In [None]:
# Sequence length = number of input columns in X.
seq_len = X.shape[1]

In [None]:
seq_len

25

### Training the Model

In [None]:
# define model
#
# Word ids produced by the tokenizer start at 1, and index 0 is reserved for
# padding (pad_sequences fills with 0). The embedding therefore needs one
# extra slot to hold the 0, which is why vocabulary_size+1 is passed here.
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [None]:
from pickle import dump,load

# fit model
# batch_size = how many sequences are passed through the network at a time
model.fit(X, y, batch_size=128, epochs=5,verbose=1)

# save the model to file
model.save('epochBIG.h5')

# dump -> pickle file
# save the tokenizer; the context manager makes sure the file is flushed and
# closed (a bare open() passed straight to dump left the handle dangling)
with open('my_tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Generating New Text

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate text one word at a time from a trained next-word model.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed, some text to start off
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed text is not included)
    '''

    # Final output: the generated words, in order
    output_text = []

    # Initial seed sequence; grows by one word per iteration
    input_text = seed_text

    for _ in range(num_gen_words):

        # Take the input text string and encode it to a sequence of word ids
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # The network was trained on exactly seq_len tokens, so force the
        # input to that length: text that is too long is cut at the front
        # (truncating='pre' keeps the most recent words), text that is too
        # short is filled up with zeros.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Class probabilities for the next word. predict() + argmax is used
        # rather than predict_classes(), which newer Keras releases no longer
        # provide.
        probabilities = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is the padding slot and has no entry in index_word, so it
        # is left out of the argmax (picking it would raise a KeyError below).
        pred_word_ind = int(np.argmax(probabilities[1:])) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [None]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [None]:
import random
random.seed(101)
# randint includes both end points, so the upper bound has to be the last
# valid index; len(text_sequences) itself would be out of range when used
# to index the list.
random_pick = random.randint(0,len(text_sequences) - 1)

In [None]:
random_seed_text = text_sequences[random_pick]
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [None]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the'

### Exploring Generated Sequence

In [None]:
full_text = read_file('moby_dick_four_chapters.txt')

In [None]:
# Print every occurrence of 'inkling' with up to 20 words of context before
# it and the word plus 19 more after it.
words = full_text.split()  # split once instead of once per match
for i,word in enumerate(words):
    if word == 'inkling':
        # Clamp the start at 0: a negative start would be read as an offset
        # from the end of the list and produce the wrong slice.
        print(' '.join(words[max(i-20, 0):i+20]))
        print('\n')

were stains of some sort or other. At first I knew not what to make of this; but soon an inkling of the truth occurred to me. I remembered a story of a white man--a whaleman too--who, falling among the


