In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from DataGenerator import DataGenerator
from AudioEncoder import AudioEncoder
from Decoder import Decoder

# For GPU-accelerated training on apple silicon
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    print(torch.ones(1, device=mps_device))
else: print ("MPS device not found.")

gen = DataGenerator(word_count=10000, batch_size=30)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/danieldager/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


tensor([1.], device='mps:0')
Function sample_words Took 1.3797 seconds
Function process_dataset Took 0.0317 seconds
Function clean_and_enrich_data Took 0.6202 seconds
Function process_dataset Took 0.0055 seconds
Function clean_and_enrich_data Took 0.6801 seconds
Function get_evaluation_data Took 1.3391 seconds


In [2]:
train_dl, eval_dl, SEQ_LENGTH, VOCAB_SIZE, index_to_phoneme = gen.get_phoneme_dataloaders()
print(f"SEQ_LENGTH: {SEQ_LENGTH}, VOCAB_SIZE: {VOCAB_SIZE}")

Function get_phoneme_dataloaders Took 0.1721 seconds
SEQ_LENGTH: 15, VOCAB_SIZE: 73


In [3]:
# Inspect elements of the dataloader
# Look for pad and stop tokens
for i, (x, y) in enumerate(train_dl):
    print(x.shape, y.shape)
    print(x[0])
    print(y[0])
    break


torch.Size([30, 15]) torch.Size([30, 15])
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15,  9,  5, 73])
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15,  9,  5, 73])


In [4]:
# Hyperparameters
NUM_EPOCHS = 10
BATCH_SIZE = 30
HIDDEN_SIZE = 256
DROPOUT = 0.0
NUM_LAYERS = 1
LEARNING_RATE = 1e-3
TEACHER_FORCING_RATIO = 0.5

# Initialize models, loss function, optimizer
encoder = AudioEncoder(
    input_size=VOCAB_SIZE, hidden_size=HIDDEN_SIZE, batch_size=BATCH_SIZE,
    num_layers=NUM_LAYERS, dropout=DROPOUT
)

decoder = Decoder(
    hidden_size=HIDDEN_SIZE, output_size=SEQ_LENGTH, batch_size=BATCH_SIZE,
    num_layers=NUM_LAYERS, dropout=DROPOUT
)

In [5]:
# Test Models
x = torch.zeros(BATCH_SIZE, SEQ_LENGTH, dtype=torch.int)
print(f"Input: {x.shape}")

hidden = encoder(x)
print(f"Encoder hidden: {hidden.shape}")

start = torch.zeros(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
output = decoder(start, hidden)
print(f"Decoder output: {output.shape}")
print(f"Decoder output argmax: {torch.argmax(output, dim=-1).shape}")
print(f"Decoder output: {output}")
print(f"Decoder output argmax: {torch.argmax(output, dim=-1)}")

Input: torch.Size([30, 15])
Encoder hidden: torch.Size([1, 15, 256])
Decoder output: torch.Size([30, 15, 15])
Decoder output argmax: torch.Size([30, 15])
Decoder output: tensor([[[ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492],
         [ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492],
         [ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492],
         ...,
         [ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492],
         [ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492],
         [ 0.1285, -0.0785,  0.0774,  ...,  0.1036, -0.0314, -0.1492]],

        [[ 0.0706, -0.0925,  0.0699,  ...,  0.0679, -0.0006, -0.0701],
         [ 0.0706, -0.0925,  0.0699,  ...,  0.0679, -0.0006, -0.0701],
         [ 0.0706, -0.0925,  0.0699,  ...,  0.0679, -0.0006, -0.0701],
         ...,
         [ 0.0706, -0.0925,  0.0699,  ...,  0.0679, -0.0006, -0.0701],
         [ 0.0706, -0.0925,  0.0699,  ...,  0.0679, -0.0006, -0.0701],
         [ 0.0706, 

In [11]:
loss_fn = nn.CrossEntropyLoss() # might want to try focal loss to deal with class imbalance
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=LEARNING_RATE)

# Training loop
for epoch in range(NUM_EPOCHS):
    for inputs, targets in train_dl:

        # Zero gradients from previous step
        optimizer.zero_grad()

        print(f"Inputs: {inputs.shape}")

        # Encoder forward pass
        encoder_hidden = encoder(inputs) # [num_layers, seq_length (not batch_size?), hidden_size]

        # Initialize decoder input
        decoder_input = torch.zeros(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
        
        # Decoder forward pass
        decoder_output = decoder(None, encoder_hidden)

        # Compute loss and backpropagate
        loss = loss_fn(decoder_output.squeeze(1), targets.float())
        loss.backward()
        optimizer.step()

        # If we want to use teacher forcing, we need to iterate through the target sequence
        # Initialize decoder hidden state as encoder's final hidden state
        # decoder_hidden = encoder_hidden
        # for t in range(targets.size(1)):  # for each time step
        #     # Decoder forward pass (at each time step)
        #     decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        #     # Compute loss (comparing decoder output with the true target at this time step)
        #     loss += loss_fn(decoder_output.squeeze(1), targets[:, t])

        #     # Optionally use teacher forcing (use the true target as the next input)
        #     teacher_force = random.random() < TEACHER_FORCING_RATIO
        #     decoder_input = targets[:, t].unsqueeze(1) if teacher_force else decoder_output.argmax(dim=2)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss.item()/targets.size(1)}")

Inputs: torch.Size([30, 18])
torch.Size([30, 18])


RuntimeError: Expected target size [30, 73], got [30, 18]

In [None]:
# Evaluation loop

for inputs, targets in train_dl:

    # Encoder forward pass
    encoder_hidden = encoder(inputs)

    # Initialize decoder input
    decoder_input = torch.zeros(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
    
    # Decoder forward pass
    decoder_output = decoder(decoder_input, encoder_hidden)

    # Compute loss and backpropagate
    loss = loss_fn(decoder_output.squeeze(1), targets.float())
    loss.backward()
    optimizer.step()

print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss.item()/targets.size(1)}")