In [None]:
text = """Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks."""

In [None]:
import re


def tokenize(text):
    pattern = re.compile(r"[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*")
    return pattern.findall(text.lower())

In [None]:
tokens = tokenize(text)
tokens

In [None]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

In [10]:
word_to_id, id_to_word = mapping(tokens)
word_to_id

{'email': 0,
 'sample': 1,
 'needed': 2,
 'mathematical': 3,
 'variety': 4,
 'improve': 5,
 'computer': 6,
 'algorithms': 7,
 'build': 8,
 'subset': 9,
 'order': 10,
 'make': 11,
 'applications': 12,
 'tasks': 13,
 'in': 14,
 'filtering': 15,
 'based': 16,
 'to': 17,
 'do': 18,
 'experience': 19,
 'training': 20,
 'it': 21,
 'where': 22,
 'the': 23,
 'seen': 24,
 'without': 25,
 'wide': 26,
 'is': 27,
 'explicitly': 28,
 'decisions': 29,
 'predictions': 30,
 'vision': 31,
 'intelligence': 32,
 'machine': 33,
 'through': 34,
 'are': 35,
 'infeasible': 36,
 'as': 37,
 'of': 38,
 'data': 39,
 'on': 40,
 'known': 41,
 'learning': 42,
 'or': 43,
 'so': 44,
 'difficult': 45,
 'programmed': 46,
 'artificial': 47,
 'study': 48,
 'conventional': 49,
 'a': 50,
 'used': 51,
 'such': 52,
 'develop': 53,
 'model': 54,
 'and': 55,
 'that': 56,
 'automatically': 57,
 'perform': 58,
 'being': 59}

In [None]:
import numpy as np

np.random.seed(42)


def concat(*iterables):
    for iterable in iterables:
        yield from iterable


def one_hot_encode(id, vocab_size):
    res = [0] * vocab_size
    res[id] = 1
    return res


def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)

    for i in range(n_tokens):
        idx = concat(
            range(max(0, i - window), i), range(i, min(n_tokens, i + window + 1))
        )
        for j in idx:
            if i == j:
                continue
            X.append(one_hot_encode(word_to_id[tokens[i]], len(word_to_id)))
            y.append(one_hot_encode(word_to_id[tokens[j]], len(word_to_id)))

    return np.asarray(X), np.asarray(y)

In [9]:
X, y = generate_training_data(tokens, word_to_id, 2)
print(X.shape, y.shape)
print(len(word_to_id))

(330, 60) (330, 60)
60
