In [1]:
import numpy as np
# Toy next-word prediction with a single-layer tanh RNN written in plain NumPy.
# The network reads three words and is trained to predict the fourth.
words = ["artificial", "intelligence", "changes", "everything"]
word_to_ix = {w: i for i, w in enumerate(words)}
ix_to_word = {i: w for w, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

embedding_dim = 10
hidden_dim = 8
learning_rate = 0.01
epochs = 300

# Initialize weights. The draw order (Wxh, Whh, Why, embedding) is kept fixed
# so that the seeded run stays reproducible.
np.random.seed(1)
Wxh = np.random.randn(hidden_dim, embedding_dim) * 0.01  # input  -> hidden
Whh = np.random.randn(hidden_dim, hidden_dim) * 0.01     # hidden -> hidden
Why = np.random.randn(vocab_size, hidden_dim) * 0.01     # hidden -> logits

bh = np.zeros((hidden_dim, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias

embedding = np.random.randn(vocab_size, embedding_dim) * 0.01

inputs = [word_to_ix[w] for w in ["artificial", "intelligence", "changes"]]
target = word_to_ix["everything"]


def softmax(logits):
    """Return the softmax of *logits*, normalised over all of its entries.

    The maximum is subtracted before exponentiating so that large logits
    cannot overflow ``np.exp``; the result is mathematically unchanged.
    """
    exps = np.exp(logits - np.max(logits))
    return exps / np.sum(exps)


def forward(token_ids):
    """Run the RNN over *token_ids* using the module-level parameters.

    Returns ``(states, probs)``: ``states`` maps each time step ``t`` to its
    ``(hidden_dim, 1)`` hidden state (``states[-1]`` is the all-zero initial
    state) and ``probs`` is the ``(vocab_size, 1)`` softmax distribution
    computed from the final hidden state only.
    """
    states = {-1: np.zeros((hidden_dim, 1))}
    for t, ix in enumerate(token_ids):
        x = embedding[ix].reshape(-1, 1)
        states[t] = np.tanh(np.dot(Wxh, x) + np.dot(Whh, states[t - 1]) + bh)
    probs = softmax(np.dot(Why, states[len(token_ids) - 1]) + by)
    return states, probs


def backward(token_ids, target_ix, states, probs):
    """Back-propagate the cross-entropy loss of *probs* against *target_ix*.

    *states* and *probs* must come from ``forward(token_ids)``. Returns the
    gradients ``(dWxh, dWhh, dWhy, dbh, dby, dembedding)``, unclipped.
    """
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dbh = np.zeros_like(bh)
    dembedding = np.zeros_like(embedding)

    # Softmax + cross-entropy gradient w.r.t. the logits. Work on a copy so
    # the caller's probabilities are not modified.
    dlogits = probs.copy()
    dlogits[target_ix] -= 1

    dWhy = np.dot(dlogits, states[len(token_ids) - 1].T)
    dby = dlogits

    # The loss reads only the final hidden state, so the output-layer gradient
    # enters the recurrence exactly once; earlier steps receive only what
    # flows back through Whh.
    dh = np.dot(Why.T, dlogits)
    for t in reversed(range(len(token_ids))):
        dh_raw = (1 - states[t] * states[t]) * dh  # back through tanh

        dbh += dh_raw
        dWxh += np.dot(dh_raw, embedding[token_ids[t]].reshape(1, -1))
        dWhh += np.dot(dh_raw, states[t - 1].T)

        dembedding[token_ids[t]] += np.dot(Wxh.T, dh_raw).flatten()
        dh = np.dot(Whh.T, dh_raw)

    return dWxh, dWhh, dWhy, dbh, dby, dembedding


for epoch in range(epochs):
    hs, y_pred = forward(inputs)

    # Cross-entropy of the single target word.
    loss = -np.log(y_pred[target, 0])

    dWxh, dWhh, dWhy, dbh, dby, dembedding = backward(inputs, target, hs, y_pred)

    # Clip gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dembedding]:
        np.clip(dparam, -5, 5, out=dparam)

    # Plain SGD step; the in-place updates keep the arrays seen by the
    # helper functions current.
    Wxh -= learning_rate * dWxh
    Whh -= learning_rate * dWhh
    Why -= learning_rate * dWhy
    bh -= learning_rate * dbh
    by -= learning_rate * dby
    embedding -= learning_rate * dembedding

    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

print("\nTesting...")
hs, y_pred = forward(inputs)

predicted_word = ix_to_word[int(np.argmax(y_pred))]
print(f"Predicted next word: {predicted_word}")


Epoch 0, Loss: 1.3863
Epoch 50, Loss: 1.0556
Epoch 100, Loss: 0.8086
Epoch 150, Loss: 0.6269
Epoch 200, Loss: 0.4907
Epoch 250, Loss: 0.3843

Testing...
Predicted next word: everything
