<a href="https://colab.research.google.com/github/chcorophyll/my_deeplearning_cookbook/blob/master/my__Sequence_to_sequence_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!git clone https://github.com/chcorophyll/deep_learning_cookbook.git

Cloning into 'deep_learning_cookbook'...
remote: Enumerating objects: 427, done.[K
remote: Total 427 (delta 0), reused 0 (delta 0), pack-reused 427[K
Receiving objects: 100% (427/427), 160.26 MiB | 33.75 MiB/s, done.
Resolving deltas: 100% (207/207), done.
Checking out files: 100% (86/86), done.


In [0]:
!ls

deep_learning_cookbook	sample_data


In [0]:
import os

path_org = os.getcwd()
data_path = os.path.join(path_org, "deep_learning_cookbook")
os.chdir(data_path)

In [0]:
!sudo apt-get install libdb++-dev
!export BERKELEYDB_DIR=/usr
!pip install gutenberg

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  libdb5.3 libdb5.3++ libdb5.3++-dev libdb5.3-dev
Suggested packages:
  db5.3-doc
The following NEW packages will be installed:
  libdb++-dev libdb5.3++ libdb5.3++-dev libdb5.3-dev
The following packages will be upgraded:
  libdb5.3
1 upgraded, 4 newly installed, 0 to remove and 15 not upgraded.
Need to get 2,918 kB of archives.
After this operation, 8,395 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libdb5.3 amd64 5.3.28-13.1ubuntu1.1 [672 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libdb5.3++ amd64 5.3.28-13.1ubuntu1.1 [703 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libdb5.3-dev amd64 5.3.

In [0]:
!pip install gutenberg



In [0]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

import nltk
from nltk.corpus import wordnet as wn
import inflect

from keras.models import Sequential
from keras import layers
import numpy as np
from collections import Counter, defaultdict

from gensim.utils import tokenize
from itertools import groupby

from keras.models import Input, Model
from keras.layers import Dense, Dropout
from keras.layers import LSTM, RepeatVector
from keras.layers.wrappers import TimeDistributed

Using TensorFlow backend.


In [0]:
p = inflect.engine()

In [0]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
pairs = {}
for synset in wn.all_synsets("n"):
    word = synset.name().split(".", 1)[0]
    if not word in pairs:
        pairs[word] = p.plural(word)
len(pairs)

67176

In [0]:
with open("data/plurals.txt", "w") as fout:
    for k in sorted(pairs):
        if "_" in k or "-" in k:
            continue
        if k.isdigit():
            continue
        fout.write("%s\t%s\n" % (k, pairs[k]))

In [0]:
class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        
    def encode(self, C, num_rows):
        """One hot encode given string C.
        # Arguments
            num_rows: Number of rows in the returned one hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x
    
    def decode(self, x, calc_argmax=True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return "".join(self.indices_char[x] for x in x)
            
    

In [0]:
# Parameters for the model and dataset
INVERT = True

In [0]:
questions = []
expected = []
seen = set()
with open("data/plurals.txt") as fin:
    for line in fin:
        en, de = line.strip().split("\t")
        questions.append(en)
        expected.append(de)
        
max_question_len = max(len(q) for q in questions)
max_expected_len = max(len(e) for e in expected)
questions =[" " * (max_question_len - len(q)) + q 
            for q in questions]
expected = [e + " " * (max_expected_len - len(e)) for e in expected]
if INVERT:
    questions = [q[::-1] for q in questions]
    
print("Total addition questions", len(questions))

Total addition questions 39929


In [0]:
chars = set(ch for k, v in zip(questions, expected) for ch in k+v)
ctable = CharacterTable(chars)
len(chars)

40

In [0]:
print("Vectorization...")
x = np.zeros((len(questions), max_question_len, len(chars)), dtype=np.bool)
y = np.zeros((len(questions), max_expected_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, max_question_len)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, max_expected_len)
print("done")

Vectorization...
done


In [0]:
# Shuffle (x, y) in unison as the later parts of x will almost all be larger
# digits.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) //10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print("Training Data:")
print(x_train.shape)
print(y_train.shape)

print("Validation Data")
print(x_val.shape)
print(y_val.shape)

Training Data:
(35937, 31, 40)
(35937, 32, 40)
Validation Data
(3992, 31, 40)
(3992, 32, 40)


In [0]:
# The below is taken from: https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py
RNN = layers.LSTM
HIDDEN_SIZE = 128
LAYERS = 1

print("Build model...")
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
# Note: In a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(max_question_len, len(chars))))
# As the decoder RNN's input, repeatedly provide with the last hidden state of
# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
# length of output, e.g., when DIGITS=3, max output is 999+999=1998.
#model.add(layers.Dropout(DROP_OUT))
model.add(layers.RepeatVector(max_expected_len))
# The decoder RNN could be multiple layers stacked or a single layer.
for _ in range(LAYERS):
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far in the form of (num_samples, timesteps,
    # output_dim). This is necessary as TimeDistributed in the below expects
    # the first dimension to be the timesteps.
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))
#     model.add(layers.Dropout(DROP_OUT))
# Apply a dense layer to the every temporal slice of an input. For each of step
# of the output sequence, decide which character should be chosen.
model.add(layers.TimeDistributed(layers.Dense(len(chars))))
model.add(layers.Activation("softmax"))
model.compile(loss="categorical_crossentropy", 
              optimizer="adam",
              metrics=["accuracy"])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               86528     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 32, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 32, 128)           131584    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 32, 40)            5160      
_________________________________________________________________
activation_2 (Activation)    (None, 32, 40)            0         
Total params: 223,272
Trainable params: 223,272
Non-trainable params: 0
_________________________________________________________________


In [0]:
BATCH_SIZE = 2048
# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 200):
    model.fit(x_train, y_train, 
              batch_size=BATCH_SIZE, 
              epochs=10, validation_data=(x_val, y_val))
    print()
    print("-" * 50)
    print("Iteration", iteration)
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print(q[::-1] if INVERT else q, '(%s)' % correct, '-', guess)

W0630 10:09:44.010850 140388729026432 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0630 10:09:45.631118 140388729026432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 1
                         grotto (grottoes                        ) - aaaaeess                        
                           saul (sauls                           ) - aaaess                          
                      immunogen (immunogens                      ) - aaaaiiieess                     
                        meniere (menieres                        ) - aaaaeees                        
                   transaminase (transaminases                   ) - aaaaiiiiiiess                   
                          shill (shills                          ) - aaaees                          
                    perciformes (perciforme                      ) - aaaaiiiiieess                   
                           airs (air                          

In [0]:
x_val[[0]].shape

(1, 31, 40)

In [0]:
x_val[0].shape

(31, 40)

**Import Gutenberg**

In [0]:
import requests
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from gutenberg.acquire.text import UnknownDownloadUriException
import re
from gensim.utils import tokenize
import random
import nltk
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import os
import glob
import json

In [0]:
with open("data/gutenberg_index.json") as fin:
    authors = json.load(fin)
recent = [x for x in authors if "birthdate" in x and x["birthdate"] > 1830]
[(x["name"], x["birthdate"], x["english_books"]) for x in recent[:5]]

[('Twain, Mark', 1835, 210),
 ('Ebers, Georg', 1837, 164),
 ('Parker, Gilbert', 1862, 135),
 ('Fenn, George Manville', 1831, 128),
 ('Jacobs, W. W. (William Wymark)', 1863, 112)]

In [0]:
recent[0]

{'about': '2009/agents/53',
 'alias': ['Twain, Mark (Samuel Clemens)', 'Clemens, Samuel Langhorne'],
 'birthdate': 1835,
 'books': [['10135', 'The Great English Short-Story Writers, Volume 1', 'en'],
  ['102', "The Tragedy of Pudd'nhead Wilson", 'en'],
  ['1044', "Extract from Captain Stormfield's Visit to Heaven", 'en'],
  ['1086', "A Horse's Tale", 'en'],
  ['10947', 'The Best American Humorous Short Stories', 'en'],
  ['11622', 'Plus fort que Sherlock Holmès', 'fr'],
  ['119', 'A Tramp Abroad', 'en'],
  ['1213', 'The Man That Corrupted Hadleyburg', 'en'],
  ['12711', 'On the Decay of the Art of Lying', 'en'],
  ['142', 'The $30,000 Bequest, and Other Stories', 'en'],
  ['17945', 'Mark Twain: Tri Noveloj', 'eo'],
  ['1837', 'The Prince and the Pauper', 'en'],
  ['18381', 'De Lotgevallen van Tom Sawyer', 'nl'],
  ['1892',
   "Extracts from Adam's Diary, translated from the original ms.",
   'en'],
  ['19484', 'Editorial Wild Oats', 'en'],
  ['19506', "A Connecticut Yankee in King Arth

In [0]:
PARAGRAPH_SPLIT_RE = re.compile(r'\n *\n+')

def extract_conversations(text, quote='"'):
    paragraphs = PARAGRAPH_SPLIT_RE.split(text.strip())
    conversations = [['']]
    for paragraph in paragraphs:
        chunks = paragraph.replace('\n', ' ').split(quote)
        for i in range((len(chunks) + 1) // 2):
            if (len(chunks[i * 2]) > 100 or len(chunks) == 1) and conversations[-1] != ['']:
                if conversations[-1][-1] == '':
                    del conversations[-1][-1]
                conversations.append([''])
            if i * 2 + 1 < len(chunks):
                chunk = chunks[i * 2 + 1]
                if chunk:
                    if conversations[-1][-1]:
                        if chunk[0] >= 'A' and chunk[0] <= 'Z':
                            if conversations[-1][-1].endswith(','):
                                conversations[-1][-1] = conversations[-1][-1][:-1]
                            conversations[-1][-1] += '.'
                        conversations[-1][-1] += ' '
                    conversations[-1][-1] += chunk
        if conversations[-1][-1]:
            conversations[-1].append('')

    return [x for x in conversations if len(x) > 1]


conversations = extract_conversations(strip_headers(load_etext(10008).strip()))
sum(len(x) for x in conversations)       
    

1126

In [0]:
strip_headers(load_etext(10008).strip())



In [0]:
paragraphs = PARAGRAPH_SPLIT_RE.split(strip_headers(load_etext(10008).strip()).strip())
paragraphs

['Elisa Williams, Tonya Allen and PG Distributed Proofreaders',
 'THE MYSTERY',
 'BY',
 'STEWART EDWARD WHITE',
 'AND',
 'SAMUEL HOPKINS ADAMS',
 '_Illustrations by Will Crawford_',
 '1907',
 'CONTENTS',
 'PART ONE',
 'THE SEA RIDDLE',
 'I.      DESERT SEAS',
 'II.     THE "LAUGHING LASS"',
 'III.    THE DEATH SHIP',
 'IV.     THE SECOND PRIZE CREW',
 'V.      THE DISAPPEARANCE',
 'VI.     THE CASTAWAYS',
 'VII.    THE FREE LANCE',
 'PART TWO',
 'THE BRASS BOUND CHEST',
 '_Being the story told by Ralph Slade, Free Lance, to the officers of\nthe United States Cruiser "Wolverine"_',
 'I.      THE BARBARY COAST',
 'II.     THE GRAVEN IMAGE',
 'III.    THE TWELVE REPEATING RIFLES',
 'IV.     THE STEEL CLAW',
 "V.      THE PHILOSOPHER'S STONE",
 'VI.     THE ISLAND',
 'VII.    CAPTAIN SELOVER LOSES HIS NERVE',
 'VIII.   WRECKING OF THE "GOLDEN HORN"',
 'IX.     THE EMPTY BRANDY BOTTLE',
 'X.      CHANGE OF MASTERS',
 'XI.     THE CORROSIVE',
 'XII.    "OLD SCRUBS" COMES ASHORE',
 'XIII.   I

In [0]:
paragraphs[600]

'"No, no! Not at all! It iss simply business of my own."'

In [0]:
chunks = paragraphs[600].replace('\n', ' ').split('"')
chunks

['', 'No, no! Not at all! It iss simply business of my own.', '']

In [0]:
paragraphs[700]

'  "\'Are you a man-o\'-war or a privateer,\' said he.\n    _Blow high, blow low, what care we!_\n  \'Oh, I am a jolly pirate, and I\'m sailing for my fee.\'\n    _Down on the coast of the high Barbare-e-e."_'

In [0]:
chunks = paragraphs[700].replace('\n', ' ').split('"')
chunks

['  ',
 "'Are you a man-o'-war or a privateer,' said he.     _Blow high, blow low, what care we!_   'Oh, I am a jolly pirate, and I'm sailing for my fee.'     _Down on the coast of the high Barbare-e-e.",
 '_']

In [0]:
conversations

[['Looks like a heavy job. These floaters that lie with deck almost awash will stand more hammering than a mud fort.',
  "Wish they'd let us put some six-inch shells into her. I'd like to see what they would do.",
  "Nothing but waste a few hundred dollars of your Uncle Sam's money. It takes placed charges inside and out for that kind of work.",
  "Barnett's the man for her then. He's no economist when it comes to getting results. There she goes!"],
 ['Good clean job, Barnett. She was a tough customer, too.',
  'What was she?',
  'The _Caroline Lemp_, three-masted schooner. Anyone know about her?'],
 ['What does the information bureau of the Seven Seas know about it?',
  "Lost three years ago--spring of 1901--got into ice field off the tip of the Aleutians. Some of the crew froze. Others got ashore. Part of survivors accounted for. Others not. Say they've turned native. Don't know myself.",
  'The Aleutians!. Great Cats! What a drift! How many thousand miles would that be?',
  'Not as 

In [0]:
LATIN_1_CHARS = (
    (u'\xe2\x80\x99', "'"),
    (u'\xc3\xa9', 'e'),
    (u'\xe2\x80\x90', '-'),
    (u'\xe2\x80\x91', '-'),
    (u'\xe2\x80\x92', '-'),
    (u'\xe2\x80\x93', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x98', "'"),
    (u'\xe2\x80\x9b', "'"),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9d', '"'),
    (u'\xe2\x80\x9e', '"'),
    (u'\xe2\x80\x9f', '"'),
    (u'\xe2\x80\xa6', '...'),
    (u'\xe2\x80\xb2', "'"),
    (u'\xe2\x80\xb3', "'"),
    (u'\xe2\x80\xb4', "'"),
    (u'\xe2\x80\xb5', "'"),
    (u'\xe2\x80\xb6', "'"),
    (u'\xe2\x80\xb7', "'"),
    (u'\xe2\x81\xba', "+"),
    (u'\xe2\x81\xbb', "-"),
    (u'\xe2\x81\xbc', "="),
    (u'\xe2\x81\xbd', "("),
    (u'\xe2\x81\xbe', ")")
)

books = 0
for author in recent[:1000]:
    for book in author["books"]:
        books += 1
        try:
            txt = strip_headers(load_etext(int(book[0]))).strip()
        except UnknownDownloadUriException:
            continue
        for ch1, ch2 in LATIN_1_CHARS:
            txt = txt.replace(ch1, ch2)
        conversations += extract_conversations(txt)
        
print(len(conversations), books)            

KeyboardInterrupt: ignored

In [0]:
with open('gutenberg.txt', 'w') as fout:
    for conv in conversations:
        fout.write('\n'.join(conv) + '\n\n')

In [0]:
RE_TOKEN = re.compile('(\w+|\?)', re.UNICODE)
token_counter = Counter()
with open('gutenberg.txt') as fin:
    for line in fin:
        line = line.lower().replace('_', ' ')
        token_counter.update(RE_TOKEN.findall(line))
with open('gutenberg.tok', 'w') as fout:
    for token, count in token_counter.items():
        fout.write('%s\t%d\n' % (token, count))

In [0]:
token_counter['?']

**Subword tokenizing**

In [0]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

import nltk
from nltk.corpus import wordnet as wn

from keras.models import Sequential
from keras import layers
import numpy as np
from collections import Counter, defaultdict

from gensim.utils import tokenize
from itertools import groupby, chain

In [0]:
# Parameters for the model and dataset.
NUM_DIGITS = 3
INVERT = True
# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
QUERY_LEN = NUM_DIGITS + 1 + NUM_DIGITS

In [0]:
questions = []
expected = []
seen = set()
print("Generating data...")
for a in pairs.keys():
    if "-" in a or "-" in a:
        continue
    if a.isdigit():
        continue
    if a in seen:
        continue
    seen.add(a)
    questions.append(a)
    expected.append(pairs[a])
    
max_question_len = max(len(q) for q in questions)
max_expected_len = max(len(e) for e in expected)
questions = [' ' * (max_question_len - len(q)) + q for q in questions]
expected = [e + ' ' * (max_expected_len - len(e)) for e in expected]
if INVERT:
    questions = [q[::-1] for q in questions]

print('Total addition questions:', len(questions))

Generating data...
Total addition questions: 65191


In [0]:
shakespeare = strip_headers(load_etext(100))
tokens = [tuple(word) for word in tokenize(shakespeare, to_lower=True)]
token_counts = Counter(tokens)
len(token_counts), len(tokens)

(26016, 989584)

In [0]:
tokens

[('t', 'h', 'e'),
 ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e'),
 ('w', 'o', 'r', 'k', 's'),
 ('o', 'f'),
 ('w', 'i', 'l', 'l', 'i', 'a', 'm'),
 ('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e'),
 ('b', 'y'),
 ('w', 'i', 'l', 'l', 'i', 'a', 'm'),
 ('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e'),
 ('c', 'o', 'n', 't', 'e', 'n', 't', 's'),
 ('t', 'h', 'e'),
 ('s', 'o', 'n', 'n', 'e', 't', 's'),
 ('a', 'l', 'l'),
 ('s',),
 ('w', 'e', 'l', 'l'),
 ('t', 'h', 'a', 't'),
 ('e', 'n', 'd', 's'),
 ('w', 'e', 'l', 'l'),
 ('t', 'h', 'e'),
 ('t', 'r', 'a', 'g', 'e', 'd', 'y'),
 ('o', 'f'),
 ('a', 'n', 't', 'o', 'n', 'y'),
 ('a', 'n', 'd'),
 ('c', 'l', 'e', 'o', 'p', 'a', 't', 'r', 'a'),
 ('a', 's'),
 ('y', 'o', 'u'),
 ('l', 'i', 'k', 'e'),
 ('i', 't'),
 ('t', 'h', 'e'),
 ('c', 'o', 'm', 'e', 'd', 'y'),
 ('o', 'f'),
 ('e', 'r', 'r', 'o', 'r', 's'),
 ('t', 'h', 'e'),
 ('t', 'r', 'a', 'g', 'e', 'd', 'y'),
 ('o', 'f'),
 ('c', 'o', 'r', 'i', 'o', 'l', 'a', 'n', 'u', 's'),
 ('c', 'y', 'm', 'b',

In [0]:
token_counts

Counter({('t', 'h', 'e'): 29956,
         ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e'): 21,
         ('w', 'o', 'r', 'k', 's'): 29,
         ('o', 'f'): 18712,
         ('w', 'i', 'l', 'l', 'i', 'a', 'm'): 92,
         ('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e'): 5,
         ('b', 'y'): 4112,
         ('c', 'o', 'n', 't', 'e', 'n', 't', 's'): 47,
         ('s', 'o', 'n', 'n', 'e', 't', 's'): 6,
         ('a', 'l', 'l'): 4281,
         ('s',): 8390,
         ('w', 'e', 'l', 'l'): 2392,
         ('t', 'h', 'a', 't'): 12241,
         ('e', 'n', 'd', 's'): 55,
         ('t', 'r', 'a', 'g', 'e', 'd', 'y'): 26,
         ('a', 'n', 't', 'o', 'n', 'y'): 522,
         ('a', 'n', 'd'): 28384,
         ('c', 'l', 'e', 'o', 'p', 'a', 't', 'r', 'a'): 281,
         ('a', 's'): 6190,
         ('y', 'o', 'u'): 14608,
         ('l', 'i', 'k', 'e'): 2025,
         ('i', 't'): 8222,
         ('c', 'o', 'm', 'e', 'd', 'y'): 12,
         ('e', 'r', 'r', 'o', 'r', 's'): 14,
         ('c', 'o', 'r', 

In [0]:
pairs = [(token[i], token[i + 1], token) for token in token_counts for i in range(len(token) - 1)]
len(pairs)

157805

In [0]:
pairs

[('t', 'h', ('t', 'h', 'e')),
 ('h', 'e', ('t', 'h', 'e')),
 ('c', 'o', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('o', 'm', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('m', 'p', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('p', 'l', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('l', 'e', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('e', 't', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('t', 'e', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('w', 'o', ('w', 'o', 'r', 'k', 's')),
 ('o', 'r', ('w', 'o', 'r', 'k', 's')),
 ('r', 'k', ('w', 'o', 'r', 'k', 's')),
 ('k', 's', ('w', 'o', 'r', 'k', 's')),
 ('o', 'f', ('o', 'f')),
 ('w', 'i', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('i', 'l', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('l', 'l', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('l', 'i', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('i', 'a', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('a', 'm', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('s', 'h', ('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e')),


In [0]:
list(groupby(sorted(pairs), key=lambda x:x[:2]))

[(('_', '_'), <itertools._grouper at 0x7f6ae8083828>),
 (('_', 'a'), <itertools._grouper at 0x7f6ae8083f98>),
 (('_', 'b'), <itertools._grouper at 0x7f6ae7f962e8>),
 (('_', 'c'), <itertools._grouper at 0x7f6ae7f966a0>),
 (('_', 'd'), <itertools._grouper at 0x7f6ae7f96cf8>),
 (('_', 'e'), <itertools._grouper at 0x7f6ae8172ac8>),
 (('_', 'f'), <itertools._grouper at 0x7f6ae8172550>),
 (('_', 'g'), <itertools._grouper at 0x7f6ae81725f8>),
 (('_', 'h'), <itertools._grouper at 0x7f6ae8172710>),
 (('_', 'i'), <itertools._grouper at 0x7f6ae81726a0>),
 (('_', 'j'), <itertools._grouper at 0x7f6ae8172f28>),
 (('_', 'k'), <itertools._grouper at 0x7f6ae8172080>),
 (('_', 'l'), <itertools._grouper at 0x7f6ae8172588>),
 (('_', 'm'), <itertools._grouper at 0x7f6ae809ceb8>),
 (('_', 'n'), <itertools._grouper at 0x7f6ae809c4a8>),
 (('_', 'o'), <itertools._grouper at 0x7f6ae809cfd0>),
 (('_', 'p'), <itertools._grouper at 0x7f6ae809c438>),
 (('_', 'q'), <itertools._grouper at 0x7f6ae809ca90>),
 (('_', 'r

In [0]:
pairs = [(pair, [x[-1] for x in token_ids]) for pair, token_ids in groupby(sorted(pairs), key=lambda x:x[:2])]

In [0]:
for pair, token_ids in groupby(sorted(pairs), key=lambda x:x[:2]):
    if pair == ('g', 'u'):
#         print("0"*50)
#         print(token_ids)
#         print("0"*50)
        for x in token_ids:
#             print(x)
            print(x[-1])
            print()


('g', 'u', ('_', 'g', 'u', 'a', 'r', 'd', 's'))
('_', 'g', 'u', 'a', 'r', 'd', 's')
('g', 'u', ('_', 'g', 'u', 'n', 's'))
('_', 'g', 'u', 'n', 's')
('g', 'u', ('a', 'g', 'u', 'e'))
('a', 'g', 'u', 'e')
('g', 'u', ('a', 'g', 'u', 'e', 'c', 'h', 'e', 'e', 'k'))
('a', 'g', 'u', 'e', 'c', 'h', 'e', 'e', 'k')
('g', 'u', ('a', 'g', 'u', 'e', 'd'))
('a', 'g', 'u', 'e', 'd')
('g', 'u', ('a', 'g', 'u', 'e', 'f', 'a', 'c', 'e'))
('a', 'g', 'u', 'e', 'f', 'a', 'c', 'e')
('g', 'u', ('a', 'g', 'u', 'e', 's'))
('a', 'g', 'u', 'e', 's')
('g', 'u', ('a', 'm', 'b', 'i', 'g', 'u', 'i', 't', 'i', 'e', 's'))
('a', 'm', 'b', 'i', 'g', 'u', 'i', 't', 'i', 'e', 's')
('g', 'u', ('a', 'm', 'b', 'i', 'g', 'u', 'o', 'u', 's'))
('a', 'm', 'b', 'i', 'g', 'u', 'o', 'u', 's')
('g', 'u', ('a', 'n', 'g', 'u', 'i', 's', 'h'))
('a', 'n', 'g', 'u', 'i', 's', 'h')
('g', 'u', ('a', 'n', 'g', 'u', 's'))
('a', 'n', 'g', 'u', 's')
('g', 'u', ('a', 'r', 'g', 'u'))
('a', 'r', 'g', 'u')
('g', 'u', ('a', 'r', 'g', 'u', 'e'))
('a'

In [0]:
pairs

[('t', 'h', ('t', 'h', 'e')),
 ('h', 'e', ('t', 'h', 'e')),
 ('c', 'o', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('o', 'm', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('m', 'p', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('p', 'l', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('l', 'e', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('e', 't', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('t', 'e', ('c', 'o', 'm', 'p', 'l', 'e', 't', 'e')),
 ('w', 'o', ('w', 'o', 'r', 'k', 's')),
 ('o', 'r', ('w', 'o', 'r', 'k', 's')),
 ('r', 'k', ('w', 'o', 'r', 'k', 's')),
 ('k', 's', ('w', 'o', 'r', 'k', 's')),
 ('o', 'f', ('o', 'f')),
 ('w', 'i', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('i', 'l', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('l', 'l', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('l', 'i', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('i', 'a', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('a', 'm', ('w', 'i', 'l', 'l', 'i', 'a', 'm')),
 ('s', 'h', ('s', 'h', 'a', 'k', 'e', 's', 'p', 'e', 'a', 'r', 'e')),


In [0]:
pairs[600]

('u', 'b', ('s', 'u', 'b', 's', 't', 'a', 'n', 't', 'i', 'a', 'l'))

In [0]:
pairs = [(pair, sum(token_counts[x] for x in lst), lst) for pair, lst in pairs]

ValueError: ignored

In [0]:
token_count = Counter(chain(*(p[0] for p in pairs)))

In [0]:
m = max(pairs, key=lambda p:p[1])
m[:2]

In [0]:
tokens = list(tokenize(shakespeare, to_lower=True))

In [0]:

idx_to_token = list(set(tokens))
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
one_hot = lambda token: [1 if i == token_to_idx[token] else 0 
                         for i in range(len(idx_to_token))]
encoded = np.asarray([one_hot(token) for token in tokens])