In [1]:
import time

import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

#Counts number of repetitions of elements in a list
from collections import Counter

#Imports utils. A customed module. 

#### Download the data from url as zip and store it and load it. If it is already downloaded then just load it. 

In [2]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

dataset_folder_path = 'data'
dataset_filename = 'text8.zip'
dataset_name = 'Text8 Dataset'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(dataset_filename):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'http://mattmahoney.net/dc/text8.zip',
            dataset_filename,
            pbar.hook)

if not isdir(dataset_folder_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)
        
with open('data/text8') as f:
    text = f.read()

In [3]:
def clean_text(text_string, min_num_appearances = 5):
    # Replace punctuation with tokens so we can use them in our model
    text = text_string.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    
    # Remove all words with  5 or fewer occurences
    word_counts = Counter(words)
    trimmed_words = [word for word in tqdm(words) if word_counts[word] > min_num_appearances]

    return trimmed_words

In [4]:
def fit_tokenizer(cleaned_text_list):
    #This function is just to eliminate the training of the Tokenizer object. Big drawbacks
    #we cannot change hyperparams.

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(cleaned_text_list)

    return tokenizer

In [5]:
def single_word_pair(text_list, index, window_size = 5):
    
    num_words = len(text_list)

    rand_window = np.random.randint(1, window_size +1)

    start_index = (index - rand_window) if (index - rand_window) > 0 else 0 

    end_index = (index + rand_window) if (index + rand_window) < num_words else num_words - 1

    word_list = set(text_list[start_index : index] + text_list[index + 1 : end_index])
    
    pairs = [(text_list[index], word) for word in word_list]

    return pairs

The following function would be better as a generator to help save space. Further adjustments. 

In [6]:
def word_pair(text_list, window_size = 5):

    word_pairs = []

    for word_index in tqdm(range(len(text_list))):

        word_pairs += single_word_pair(text_list, word_index, window_size = window_size)

    word_pairs = np.array(word_pairs)

    return word_pairs

#### Cleaning text

In [7]:
cleaned_text = clean_text(text)

100%|██████████| 17005207/17005207 [00:17<00:00, 985817.44it/s] 


#### Obtain word pairs

In [8]:
word_pairs = word_pair(cleaned_text, window_size = 5)

100%|██████████| 16680599/16680599 [04:25<00:00, 62827.94it/s]


In [13]:
display(word_pairs[:10,:])

array([['anarchism', 'originated'],
       ['originated', 'anarchism'],
       ['as', 'originated'],
       ['as', 'of'],
       ['as', 'term'],
       ['as', 'anarchism'],
       ['as', 'a'],
       ['as', 'abuse'],
       ['a', 'originated'],
       ['a', 'term']], dtype='<U29')

In [10]:
X = word_pairs[:, 0]

y = word_pairs[:, 1]

## Subsampling
Before the word tokenization each word will be discarted with a certian probability. This process is described in the second Mikolov article. 

In [21]:
fitted_tokenizer = fit_tokenizer(cleaned_text)

In [22]:
display(fitted_tokenizer.word_index)

{'the': 1,
 'of': 2,
 'and': 3,
 'one': 4,
 'in': 5,
 'a': 6,
 'to': 7,
 'zero': 8,
 'nine': 9,
 'two': 10,
 'is': 11,
 'as': 12,
 'eight': 13,
 'for': 14,
 's': 15,
 'five': 16,
 'three': 17,
 'was': 18,
 'by': 19,
 'that': 20,
 'four': 21,
 'six': 22,
 'seven': 23,
 'with': 24,
 'on': 25,
 'are': 26,
 'it': 27,
 'from': 28,
 'or': 29,
 'his': 30,
 'an': 31,
 'be': 32,
 'this': 33,
 'which': 34,
 'at': 35,
 'he': 36,
 'also': 37,
 'not': 38,
 'have': 39,
 'were': 40,
 'has': 41,
 'but': 42,
 'other': 43,
 'their': 44,
 'its': 45,
 'first': 46,
 'they': 47,
 'some': 48,
 'had': 49,
 'all': 50,
 'more': 51,
 'most': 52,
 'can': 53,
 'been': 54,
 'such': 55,
 'many': 56,
 'who': 57,
 'new': 58,
 'used': 59,
 'there': 60,
 'after': 61,
 'when': 62,
 'into': 63,
 'american': 64,
 'time': 65,
 'these': 66,
 'only': 67,
 'see': 68,
 'may': 69,
 'than': 70,
 'world': 71,
 'i': 72,
 'b': 73,
 'would': 74,
 'd': 75,
 'no': 76,
 'however': 77,
 'between': 78,
 'about': 79,
 'over': 80,
 'years':

In [26]:
display(len(fitted_tokenizer.word_index))

63641

In [9]:
text_list = text.split()

In [29]:
display(cleaned_text[0:50])

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the']