In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
path = '/content/drive/My Drive/war.txt'
text = open(path, 'r' , errors='ignore').read()
text = text.lower()

In [14]:
# Length of extracted character sequences
maxlen = 150

# We sample a new sequence every `step` characters
step = 2

# This holds our extracted sequences
sentences = []

# This holds the targets (the follow-up characters)
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

# List of unique characters in the corpus
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))




Number of sequences: 162068
Unique characters: 46


In [15]:
len(text)

324286

In [17]:
# Dictionary mapping unique characters to their index in `chars`
char_indices = dict((char, chars.index(char)) for char in chars)

In [18]:
# Next, one-hot encode the characters into binary arrays.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [23]:
from keras.layers import Dense , LSTM ,Dropout
from keras.models import Sequential
import keras

In [21]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars)) , return_sequences = True))
model.add(Dropout (0.1))
model.add(LSTM(128))
model.add(Dense(len(chars), activation='softmax'))

In [24]:
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [25]:
# For randomly selecting unique characters
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [26]:
import random
import sys

for epoch in range(1, 20):
    print('epoch', epoch)
    # Fit the model for 1 epoch on the available training data
    model.fit(X, y,
              batch_size=128,
              epochs=1)

    # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    actual_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + actual_text + '"')
    print('--------------\n')

    for temperature in [0.2,0.5,1]:
        generated_text = actual_text 
        # We generate 100 characters
        for i in range(100):
            sampled = np.zeros((1, maxlen, len(chars)))
            
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            #sys.stdout.write(next_char)
            sys.stdout.flush()
        print('------ temperature:' + str(temperature) + ' -> ' )
        sys.stdout.write(actual_text)
        sys.stdout.write(generated_text)
        print()
        print('\n')

epoch 1
--- Generating with seed: "rough the clearness of the planets atmosphere and obscured
its more familiar features.

even the daily papers woke up to the disturbances at last, and"
--------------

------ temperature:0.2 -> 
rough the clearness of the planets atmosphere and obscured
its more familiar features.

even the daily papers woke up to the disturbances at last, andly papers woke up to the disturbances at last, and the stepled of the steple the saided and the steple of the steped and the stater and the stepped an


------ temperature:0.5 -> 
rough the clearness of the planets atmosphere and obscured
its more familiar features.

even the daily papers woke up to the disturbances at last, andly papers woke up to the disturbances at last, and the road as martian. i shoped me pane the seen a his have have stod of the pily the black of the sa


------ temperature:1 -> 
rough the clearness of the planets atmosphere and obscured
its more familiar features.

even the daily papers wo

  after removing the cwd from sys.path.


------ temperature:0.2 -> 
ab struck his
shoulder and sent him reeling. he gave a shriek and dodged back, and a
cartwheel shaved him narrowly.

way! cried the men all about him.d him narrowly.

way! cried the men all about him. i had been the heat-ray were began the scarcely a man were heaving of the strange had been had been


------ temperature:0.5 -> 
ab struck his
shoulder and sent him reeling. he gave a shriek and dodged back, and a
cartwheel shaved him narrowly.

way! cried the men all about him.d him narrowly.

way! cried the men all about him. and the
whispered and spiced by the face of the strange busin of
horses what was he saw a soon had 


------ temperature:1 -> 
ab struck his
shoulder and sent him reeling. he gave a shriek and dodged back, and a
cartwheel shaved him narrowly.

way! cried the men all about him.d him narrowly.

way! cried the men all about him. was
i two barning of human what we must for the grouss hand, movement from
oursterrdies horrable as


epoch 11
---

## Take aways

* You can see that generated text is not good but the model has learned something new !
* You can improve by training a bigger model, longer, on more data, you can achieve generated samples that will look much more coherent and realistic than ours.
*  But of course, don't expect to ever generate any meaningful text, other than by random chance: all we are doing is sampling data from a statistical model of which characters come after which characters.
* We can  also try to build a model to perdict next token ,given previous tokens. This is called as "language model". 
