In [None]:
import numpy as np
# Toy corpus: the first three words form the context, the fourth is the
# word the network is trained to predict.
text = ["artificial", "intelligence", "changes", "everything"]

# Vocabulary lookups in both directions (word -> index and index -> word).
word_to_idx = dict(zip(text, range(len(text))))
idx_to_word = {position: token for token, position in word_to_idx.items()}

vocab_size = len(word_to_idx)

# Hyperparameters.
embedding_dim, hidden_dim = 10, 8
learning_rate, epochs = 0.01, 300

# Fix the global NumPy seed so every run draws the same initial parameters.
np.random.seed(42)

# Scale applied to every standard-normal draw below to keep the initial
# weights small.
init_scale = 0.01

# NOTE: the order of the random draws (Wxh, Whh, Why, embedding) determines
# the values each array receives from the seeded stream; do not reorder.
Wxh = init_scale * np.random.randn(hidden_dim, embedding_dim)   # input  -> hidden
Whh = init_scale * np.random.randn(hidden_dim, hidden_dim)      # hidden -> hidden
Why = init_scale * np.random.randn(vocab_size, hidden_dim)      # hidden -> output
embedding = init_scale * np.random.randn(vocab_size, embedding_dim)  # one row per word

# Biases start at zero and are kept as column vectors.
bh = np.zeros((hidden_dim, 1))
by = np.zeros((vocab_size, 1))

# The single training example: context word indices and the target index.
context_words = ["artificial", "intelligence", "changes"]
inputs = [word_to_idx[token] for token in context_words]
target = word_to_idx["everything"]

# Training loop: gradient descent on the single (context -> target) example.
# Each epoch runs a forward pass through the tanh RNN, computes the softmax
# cross-entropy loss from the final hidden state, back-propagates through
# time, clips every gradient to [-5, 5] and applies a plain SGD update.
for epoch in range(epochs):
    # ---- Forward pass ----
    # hs[t] is the hidden state after consuming inputs[t]; hs[-1] is the
    # all-zero initial state, so hs[t-1] is valid for t == 0 as well.
    hs = {}
    hs[-1] = np.zeros((hidden_dim, 1))

    for t in range(len(inputs)):
        xt = embedding[inputs[t]].reshape(-1, 1)
        Wxh_xt = np.dot(Wxh, xt)
        Whh_ht_minus1 = np.dot(Whh, hs[t-1])
        ht_before_tanh = Wxh_xt + Whh_ht_minus1 + bh
        ht = np.tanh(ht_before_tanh)
        hs[t] = ht

    # Only the final hidden state feeds the output layer.
    last_ht = hs[len(inputs)-1]
    y_linear = np.dot(Why, last_ht) + by

    # Softmax, with the max logit subtracted before exponentiating to avoid
    # overflow.
    exp_scores = np.exp(y_linear - np.max(y_linear))
    y_pred = exp_scores / np.sum(exp_scores)

    # Cross-entropy loss for the single target word.
    loss = -np.log(y_pred[target, 0])

    # ---- Backward pass ----
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    dembedding = np.zeros_like(embedding)

    # Gradient of the loss w.r.t. the logits: softmax output minus the
    # one-hot target. Work on a copy so y_pred keeps holding probabilities
    # instead of being silently overwritten through an alias.
    dy = y_pred.copy()
    dy[target] -= 1

    dWhy += np.dot(dy, last_ht.T)
    dby += dy

    # The loss depends on the hidden states only through last_ht, so the
    # output-layer gradient enters back-propagation exactly once, as the
    # gradient arriving at the final timestep. Earlier timesteps receive
    # only what flows back through Whh.
    dh_next = np.dot(Why.T, dy)

    for t in reversed(range(len(inputs))):
        dh = dh_next
        # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2, and hs[t] = tanh(z).
        dh_raw = (1 - hs[t]**2) * dh
        dbh += dh_raw
        dWxh += np.dot(dh_raw, embedding[inputs[t]].reshape(1, -1))
        dWhh += np.dot(dh_raw, hs[t-1].T)
        # Accumulate (+=) because the same word index may occur more than
        # once in the context.
        dembedding[inputs[t]] += np.dot(Wxh.T, dh_raw).flatten()
        # Gradient handed to the previous timestep's hidden state.
        dh_next = np.dot(Whh.T, dh_raw)

    # Element-wise clipping, done in place, to guard against exploding
    # gradients.
    for grad in [dWxh, dWhh, dWhy, dbh, dby, dembedding]:
        np.clip(grad, -5, 5, out=grad)

    # ---- SGD parameter update (in place) ----
    Wxh -= learning_rate * dWxh
    Whh -= learning_rate * dWhh
    Why -= learning_rate * dWhy
    bh  -= learning_rate * dbh
    by  -= learning_rate * dby
    embedding -= learning_rate * dembedding

    # The reported loss was computed before this epoch's update.
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Inference: replay the trained network over the context words and report
# the most probable next word.
print("\nTesting prediction...")

# Hidden-state trajectory, seeded with the zero state under key -1 so that
# hs[t - 1] is defined for the first step.
hs = {-1: np.zeros((hidden_dim, 1))}

for t, token_idx in enumerate(inputs):
    xt = embedding[token_idx].reshape(-1, 1)
    input_term = Wxh.dot(xt)
    recurrent_term = Whh.dot(hs[t - 1])
    hs[t] = np.tanh(input_term + recurrent_term + bh)

# Project the final hidden state onto the vocabulary.
last_ht = hs[len(inputs) - 1]
y_linear = Why.dot(last_ht) + by

# Softmax with the max logit subtracted before exponentiating.
y_pred = np.exp(y_linear - y_linear.max())
y_pred /= y_pred.sum()

# Highest-probability index, mapped back to its word.
predicted_idx = y_pred.argmax()
predicted_word = idx_to_word[predicted_idx]

print(f"Predicted next word: {predicted_word}")


Epoch 0, Loss: 1.3863
Epoch 50, Loss: 1.0567
Epoch 100, Loss: 0.8119
Epoch 150, Loss: 0.6346
Epoch 200, Loss: 0.5058
Epoch 250, Loss: 0.4103

Testing prediction...
Predicted next word: everything
