In [12]:
# Question 1
import numpy as np

# Recurrence under study: h_t = tanh(0.8 * h_{t-1} + x_t), starting from h_0 = 0,
# driven by the input sequence x = [1.0, 0.5, -0.25, 0.0].

x = np.array([1.0, 0.5, -0.25, 0.0])
h = 0.0  # running hidden state, initialised to h_0
hs = []  # collects h_1, h_2, ... in order

print("Computing recurrence step by step:")
print(f"h_0 = {h}")

for t in range(1, len(x) + 1):
    xt = x[t - 1]
    h_prev = h
    # Argument of tanh, kept separately so the printed trace can show it.
    pre_activation = 0.8 * h_prev + xt
    h = np.tanh(pre_activation)
    hs.append(h)
    print(f"h_{t} = tanh(0.8 * {h_prev:.4f} + {xt}) = tanh({pre_activation:.4f}) = {h:.4f}")

print("\nFinal results: h_t for t = 1, ..., 4:")
print("hs =", np.round(hs, 4))

# Why this is an "unrolled graph": the single recurrent update is written out
# once per time step, so the temporal loop becomes an explicit chain of
# computations h_0 -> h_1 -> h_2 -> h_3 -> h_4.

Computing recurrence step by step:
h_0 = 0.0
h_1 = tanh(0.8 * 0.0000 + 1.0) = tanh(1.0000) = 0.7616
h_2 = tanh(0.8 * 0.7616 + 0.5) = tanh(1.1093) = 0.8038
h_3 = tanh(0.8 * 0.8038 + -0.25) = tanh(0.3930) = 0.3740
h_4 = tanh(0.8 * 0.3740 + 0.0) = tanh(0.2992) = 0.2906

Final results: h_t for t = 1, ..., 4:
hs = [0.7616 0.8038 0.374  0.2906]


In [13]:
#Question 2
def rnn_step(h_prev, x_t, p):
    """Advance the recurrent cell by one time step.

    Args:
        h_prev: hidden state from the previous step.
        x_t: input at the current step.
        p: dict of parameters stored under the keys 'b', 'W', 'U', 'V', 'c'.

    Returns:
        A tuple (h_t, y_t): the new hidden state tanh(b + W h_prev + U x_t)
        and the softmax of the output scores c + V h_t.
    """
    recurrent_term = np.dot(p['W'], h_prev)
    input_term = np.dot(p['U'], x_t)
    h_t = np.tanh(p['b'] + recurrent_term + input_term)

    scores = p['c'] + np.dot(p['V'], h_t)
    # Subtracting the maximum leaves the softmax value unchanged while keeping
    # the exponentials from overflowing.
    exp_scores = np.exp(scores - np.max(scores))
    y_t = exp_scores / np.sum(exp_scores)
    return h_t, y_t

# Parameters of the toy network, keyed the way rnn_step reads them.
p = {
    'b': np.array([1, 0]),
    'W': np.array([[1, 1], [0, 1]]),
    'U': np.array([[1, 1], [-1, 1]]),
    'V': np.array([[1, -1], [-1, 1]]),
    'c': np.array([0, 0]),
}

# Run three steps from a zero hidden state, feeding the same zero input each time.
h = np.array([0.0, 0.0])
x = np.array([0.0, 0.0])

for t in (1, 2, 3):
    h, y = rnn_step(h, x, p)
    if t != 1:
        print(f"t={t}: y_t={y}")
        continue
    # rnn_step only hands back h_t and y_t, so the first step's a_t and o_t are
    # recomputed here from the zero initial state purely for display.
    a_1 = p['b'] + np.dot(p['W'], np.array([0.0, 0.0])) + np.dot(p['U'], x)
    h_1 = np.tanh(a_1)
    o_1 = p['c'] + np.dot(p['V'], h_1)
    print(f"t=1: a_t={a_1}, h_t={h_1}, o_t={o_1}")
    print(f"t=1: y_t={y}, sum={np.sum(y):.6f}")

t=1: a_t=[1. 0.], h_t=[0.76159416 0.        ], o_t=[ 0.76159416 -0.76159416]
t=1: y_t=[0.8210075 0.1789925], sum=1.000000
t=2: y_t=[0.86822576 0.13177424]
t=3: y_t=[0.87208178 0.12791822]


In [14]:
#Question 3
def clip_grad(g, v):
    """Clip a gradient by its norm.

    Args:
        g: gradient array.
        v: norm threshold.

    Returns:
        `g` itself (the same object) when its norm is zero or does not exceed
        `v`; otherwise `g` rescaled so that its norm equals `v`.
    """
    magnitude = np.linalg.norm(g)
    # The zero check keeps the rescaling branch from ever dividing by zero.
    within_threshold = magnitude == 0 or magnitude <= v
    return g if within_threshold else v * g / magnitude

# Two demonstration cases sharing the threshold v = 2.0:
#   g1 = [3, 4] has norm 5, above the threshold, so it gets rescaled;
#   g2 = [0.1, 0.2] is already below the threshold and comes back unchanged.
v1 = 2.0
g1 = np.array([3.0, 4.0])
g2 = np.array([0.1, 0.2])

g1_clipped = clip_grad(g1, v1)
g2_clipped = clip_grad(g2, v1)

for original, clipped in ((g1, g1_clipped), (g2, g2_clipped)):
    print(f"g = {original}, norm = {np.linalg.norm(original):.4f}")
    print(f"clipped = {clipped}, norm = {np.linalg.norm(clipped):.4f}")

g = [3. 4.], norm = 5.0000
clipped = [1.2 1.6], norm = 2.0000
g = [0.1 0.2], norm = 0.2236
clipped = [0.1 0.2], norm = 0.2236


In [15]:
#Question 4
# Forget-factor experiment: iterate s_t = f * s_{t-1} + (1 - f) * 1 from
# s_0 = 0 for 20 steps and compare the final value across several f.

forget_factors = [0.2, 0.8, 0.95]

print("Task : Forget Factor Experiment")
print("Final values s_20:")

# Interpretation: with f = 0.95 the result s_20 = 0.641514 is still the one
# closest to the initial state s_0 = 0, because a high forget factor preserves
# more of the previous state at every step. That is the principle behind LSTM
# forget gates.
for f in forget_factors:
    s = 0.0  # s_0
    step = 0
    while step < 20:  # produces s_1 ... s_20
        s = f * s + (1 - f)
        step += 1
    print(f"f = {f}: s_20 = {s:.6f}")
    

Task : Forget Factor Experiment
Final values s_20:
f = 0.2: s_20 = 1.000000
f = 0.8: s_20 = 0.988471
f = 0.95: s_20 = 0.641514
