In [1]:
from datetime import date

print(date.today())

2020-06-22


### Alice in Wonderland NLP

In [2]:
import os 

os.listdir('data/')

['glove.840B.300d.txt', 'test.zip', 'train.zip', 'wonderland.txt']

In [32]:
file = 'data/wonderland.txt'

raw_txt = open(file=file, encoding='utf-8').read()
raw_txt = raw_txt.lower()
#raw_txt

In [33]:
# create mapping of unique chars to integers
chars = sorted(list(set(raw_txt)))
char_to_int = dict((c,i) for i,c in enumerate(chars))

#chars_to_int

In [34]:
n_chars = len(raw_txt)
n_vocab = len(chars)

print ('Total Characters: ', n_chars)
print ('Total Vocabs: ', n_vocab)

Total Characters:  163780
Total Vocabs:  58


In [35]:
seq_length = 100 

x = []
y = []

for i in range(0,(n_chars - seq_length),1):
    seq_in = raw_txt[i:i+seq_length]
    seq_out = raw_txt[i+seq_length]
    
    x.append([char_to_int[ch] for ch in seq_in])
    y.append(char_to_int[seq_out])
    
n_patterns = len(x)
print('Total Patterns: ', n_patterns)

Total Patterns:  163680


In [36]:
# rescale the integers to the range 0-to-1 to make the patterns easier to learn by the LSTM network that uses the sigmoid activation function by default.
import numpy as np 
from keras.utils.np_utils import to_categorical

# reshape X to be [samples, time steps, features]
X = np.reshape(x, (n_patterns, seq_length, 1))

# normalizing
X = X / float(n_vocab)

# one hot encode the output variable
y = to_categorical(y)

import warnings
warnings.filterwarnings('ignore')

In [37]:
# a single hidden LSTM layer with 256 memory units
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint

model = Sequential([
    LSTM(256, input_shape= (X.shape[1], X.shape[2])), 
    Dropout(0.2), 
    Dense(y.shape[1], activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')

In [38]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 58)                14906     
Total params: 279,098
Trainable params: 279,098
Non-trainable params: 0
_________________________________________________________________


In [39]:
filepath = 'weights_improvement_{epoch: 02d}_{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(filepath = filepath, 
                            monitor = 'loss', 
                            verbose =1, 
                            save_best_only=True, 
                            mode = 'min')

In [40]:
model.fit(X, y, 
         epochs=20, 
         batch_size= 128, 
         callbacks= [checkpoint])

Epoch 1/20

Epoch 00001: loss improved from inf to 2.98345, saving model to weights_improvement_ 1_2.9834.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.98345 to 2.80381, saving model to weights_improvement_ 2_2.8038.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.80381 to 2.72547, saving model to weights_improvement_ 3_2.7255.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.72547 to 2.66079, saving model to weights_improvement_ 4_2.6608.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.66079 to 2.60917, saving model to weights_improvement_ 5_2.6092.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.60917 to 2.56193, saving model to weights_improvement_ 6_2.5619.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.56193 to 2.51382, saving model to weights_improvement_ 7_2.5138.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.51382 to 2.46874, saving model to weights_improvement_ 8_2.4687.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.46874 to 2.42947, saving model to weig

<keras.callbacks.callbacks.History at 0x1fb319e39b0>

In [47]:
os.listdir()

['.ipynb_checkpoints',
 'applying nlp basics.ipynb',
 'data',
 'day1.ipynb',
 'day2.ipynb',
 'day3.ipynb',
 'day4.ipynb',
 'GloVe  Global Vectors for Word Representation.html',
 'nlp - basic',
 'NLP overall review.ipynb',
 'Untitled.ipynb',
 'weights_improvement_ 10_2.3885.hdf5',
 'weights_improvement_ 11_2.3526.hdf5',
 'weights_improvement_ 12_2.3182.hdf5',
 'weights_improvement_ 13_2.2854.hdf5',
 'weights_improvement_ 14_2.2554.hdf5',
 'weights_improvement_ 15_2.2303.hdf5',
 'weights_improvement_ 16_2.2025.hdf5',
 'weights_improvement_ 17_2.1716.hdf5',
 'weights_improvement_ 18_2.1459.hdf5',
 'weights_improvement_ 19_2.1235.hdf5',
 'weights_improvement_ 1_2.9834.hdf5',
 'weights_improvement_ 20_2.0994.hdf5',
 'weights_improvement_ 2_2.8038.hdf5',
 'weights_improvement_ 3_2.7255.hdf5',
 'weights_improvement_ 4_2.6608.hdf5',
 'weights_improvement_ 5_2.6092.hdf5',
 'weights_improvement_ 6_2.5619.hdf5',
 'weights_improvement_ 7_2.5138.hdf5',
 'weights_improvement_ 8_2.4687.hdf5',
 'weigh

    *Remove all punctuation from the source text, and therefore from the models’ vocabulary.
    *Try a one hot encoded for the input sequences.
    *Train the model on padded sentences rather than random sequences of characters.
    *Increase the number of training epochs to 100 or many hundreds.
    *Add dropout to the visible input layer and consider tuning the dropout percentage.
    *Tune the batch size, try a batch size of 1 as a (very slow) baseline and larger sizes from there.
    *Add more memory units to the layers and/or more layers.