# DAEN 429 Homework 7
Fletcher Newman | fletcht13@tmau.edu | November 4th, 2025

In [1]:
import torch
import numpy as np

# RNN

In [2]:
def rnn(sequence):
    """
    Run a simple recurrent network with fixed (untrained) weights.

    Input dim: 4
    Hidden state: 3 tanh units followed by a constant bias entry of 1.

    sequence: iterable of 1-D tensors with 4 entries, one per token
              (e.g. the rows of a (T, 4) float32 tensor).

    Returns a list with one 0-dim tensor per token: the sigmoid output
    computed from the hidden state after that token.
    """
    # Hard-coded recurrent weights; each row acts on the concatenated
    # vector [token (4 entries), previous hidden state incl. bias (4 entries)].
    W = torch.tensor(
        [
            [1, 0, -1, 1, 1, 0, 1, 1],
            [0, 2, 1, 0, 0, 1, 1, -1],
            [1, 2, 1, 0, 0, 1, 0, -1],
        ],
        dtype=torch.float32,
    )

    # Read-out weights; the last entry multiplies the bias slot of the hidden state.
    W_y = torch.tensor([3, 0, -1, 3], dtype=torch.float32)

    # Initial hidden state: zero activations with the bias slot set to 1.
    hidden = torch.tensor([0, 0, 0, 1], dtype=torch.float32)

    ys = []
    for token in sequence:
        # One matrix product handles both the input and the recurrent term.
        pre_activation = W @ torch.cat((token, hidden))
        activated = torch.tanh(pre_activation)

        # Re-attach the constant bias entry so the state is 4-dimensional again.
        hidden = torch.cat((activated, torch.ones(1, dtype=activated.dtype)))

        # Scalar output for this step, squashed into (0, 1).
        ys.append(torch.sigmoid(W_y @ hidden))

    return ys


In [3]:
# Example sequence: four tokens, each a vector in R^4
x1, x2 = [2, -1, 0, 1], [1, 0, 1, 0]
x3, x4 = [1, 1, 0, 0], [1, 2, -1, 2]

# Stack the tokens as rows; rnn() walks over the rows one token at a time
X = torch.tensor((x1, x2, x3, x4), dtype=torch.float32)

Y = rnn(X)
print(Y)

[tensor(0.9988), tensor(0.9960), tensor(0.9945), tensor(0.9933)]


# LSTM

In [4]:
def LSTM(X):
    """
    Run a single LSTM cell with fixed (untrained) weights over a sequence.

    Input dim: 3
    Hidden state dim: 2

    Parameters
    ----------
    X : sequence of tokens, each with 3 entries -- e.g. a (T, 3) tensor of
        any numeric dtype, or a nested list. Every token is converted to
        float32 before use, so integer and float64 inputs are accepted.

    Returns
    -------
    list of T tensors of shape (2,): the hidden state after each token,
    without the constant bias entry.
    """

    # Every gate matrix is 2x6 and acts on [token (3), hidden (2), bias (1)].

    # Forget gate weights
    FG = torch.tensor([[0, -1, 0, 0, -2, 0],
                       [-2, 0, -2, -2, 0, 0]], dtype=torch.float32)

    # Candidate memory weights
    CM = torch.tensor([[0, 1, 0, 0, -1, 0],
                       [-1, 0, -1, -4, 0, 1]], dtype=torch.float32)

    # Input gate weights
    IG = torch.tensor([[2, 0, 3, 1, 0, 1],
                       [0, 1, 0, 0, 2, 0]], dtype=torch.float32)

    # Output gate weights
    OG = torch.tensor([[1, 1, 0, 4, 1, 0],
                       [0, 0, -1, -1, -2, 4]], dtype=torch.float32)

    # Initial hidden state (short-term memory) with the bias entry appended
    H = torch.tensor([0, 0, 1], dtype=torch.float32)

    # Initial long-term memory; float32 so it matches the gate activations
    # instead of relying on int -> float promotion on the first update
    LM = torch.zeros(2, dtype=torch.float32)

    # Hidden states collected per token
    output = []

    # Loop through tokens
    for x in X:
        # Cast the token to float32 so it matches the weights: matmul needs
        # operands of one dtype, and this also lets plain lists through
        x = torch.as_tensor(x, dtype=torch.float32)

        # Concatenate token with previous hidden state (incl. bias)
        x_H = torch.cat([x, H], dim=0)

        # Gate activations
        x_FG = torch.sigmoid(torch.matmul(FG, x_H))
        x_CM = torch.tanh(torch.matmul(CM, x_H))
        x_IG = torch.sigmoid(torch.matmul(IG, x_H))

        # Forget part of the old memory, then add the gated candidate
        LM = (LM * x_FG) + (x_CM * x_IG)

        # Squash the long-term memory with tanh
        LM_tanh = torch.tanh(LM)

        # Gate the squashed memory with the output gate to get the new hidden state
        x_OG = torch.sigmoid(torch.matmul(OG, x_H))
        H = LM_tanh * x_OG

        # Append the hidden state (before the bias entry is re-attached)
        output.append(H)

        # Add bias term back for the next step
        H = torch.cat([H, torch.ones(1, dtype=torch.float32)])

    return output

In [5]:
# Example sequence: four tokens, each a vector in R^3
x1 = [1, 2, 1]
x2 = [2, 1, 0]
x3 = [0, -1, 3]
x4 = [-1, 0, 2]

# Build the input as float32 explicitly (like the other example cells) rather
# than passing an int64 tensor and relying on implicit dtype promotion
X = torch.tensor([x1, x2, x3, x4], dtype=torch.float32)

Y = LSTM(X)
print(Y)

[tensor([ 0.7097, -0.5577]), tensor([ 0.8852, -0.4358]), tensor([ 0.5500, -0.0967]), tensor([ 0.3415, -0.3512])]


# Self-attention

In [6]:
def self_attention(X):
    """
    Single-head scaled dot-product self-attention with fixed weights.

    X: float32 tensor with 6 rows whose columns represent tokens
       (the example uses a 6x4 matrix, i.e. four tokens).

    Returns Z, the attention-weighted feature matrix with 3 rows and one
    column per token: column j is the weighted sum of all value vectors,
    using the attention weights of query token j.
    """
    # Query projection (3 x 6)
    W_Q = torch.tensor(
        [
            [1, 1, 0, 0, 0, 0],
            [0, 1, 0, 1, 0, 0],
            [0, 0, 1, 0, 1, 1],
        ],
        dtype=torch.float32,
    )

    # Key projection (3 x 6)
    W_K = torch.tensor(
        [
            [0, 0, 1, 0, 0, 0],
            [0, 1, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, -1],
        ],
        dtype=torch.float32,
    )

    # Value projection (3 x 6)
    W_V = torch.tensor(
        [
            [10, 0, 0, 0, 0, 0],
            [0, 0, 0, 10, 0, 0],
            [0, 10, 0, 0, 0, 0],
        ],
        dtype=torch.float32,
    )

    # Project every token (column) into query, key and value space
    queries = W_Q @ X
    keys = W_K @ X
    values = W_V @ X

    # Key dimension (3 for these weights), used for the sqrt scaling below
    d_k = W_K.size(0)

    # scores[i, j] = (key_i . query_j) / sqrt(d_k)
    scores = (keys.T @ queries) / np.sqrt(d_k)

    # Normalise over keys (dim 0) so each column of weights sums to 1
    weights = torch.softmax(scores, dim=0)

    # Mix the value vectors with the attention weights
    return values @ weights


In [7]:
# Example sequence: four tokens, each a vector in R^6
x1, x2 = [2, 0, 0, 0, 2, 1], [0, 1, 2, 0, 0, 0]
x3, x4 = [0, 0, 1, 1, 0, 1], [2, 0, 0, 1, 0, 1]

# Tokens are stacked as rows, then transposed so that the columns are tokens
# (6x4), which is the layout self_attention() expects
X = torch.tensor((x1, x2, x3, x4), dtype=torch.float32).t()

Z = self_attention(X)
print(Z)

tensor([[10.3076, 10.1055, 15.0336,  3.0608],
        [ 2.8328,  2.9733,  4.1317,  1.5304],
        [ 4.5903,  4.5003,  2.1099,  7.7044]])


# Problem 2
Compute perplexity:

$\exp\left(-\frac{1}{n}\sum_{t=1}^{n} \log P(x_t \mid x_{t-1}, \dots, x_1)\right)$

In this problem, we have:

$\exp\left(-\frac{1}{5}\left[\log(0.45)+\log(0.1)+\log(0.2)+\log(0.1)+\log(0.4)\right]\right)$

$=\exp\left(-\frac{1}{5}(-7.9294)\right)=\exp(1.5859)\approx 4.8836$

So we have

$Perplexity=4.8836$