# Language Modeling


Whether for transcribing spoken utterances as correct word sequences or generating coherent human-like text, language models are extremely useful.

In this assignment, you will be building your own language models powered by n-grams and RNNs.

In [1]:
# !unzip data.zip

In [2]:
from utils import *

### Step 2: RNN Language Model


#### Preparing the Data
The following Python code is used for loading and processing [GloVe (Global Vectors for Word Representation) embeddings](https://nlp.stanford.edu/projects/glove/). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. These embeddings can be used in various natural language processing and machine learning tasks.

The `load_glove_embeddings(path)` function is used to load the GloVe embeddings from a file. The function takes a file path as an argument, reads the file line by line, and for each line, it splits the line into words and their corresponding embeddings, and stores them in a dictionary. The dictionary, embeddings_dict, maps words to their corresponding vector representations.

The `create_embedding_matrix(word_to_ix, embeddings_dict, embedding_dim)` function is used to create an embedding matrix from the loaded GloVe embeddings. This function takes a dictionary mapping words to their indices (`word_to_ix`), the dictionary of GloVe embeddings (`embeddings_dict`), and the dimension of the embeddings (`embedding_dim`) as arguments. It creates a zero matrix of size (vocab_size, embedding_dim) and then for each word in  `word_to_ix`, it checks if the word is in `embeddings_dict`. If it is, it assigns the corresponding GloVe vector to the word's index in the embedding matrix. If the word is not in the embeddings_dict, it assigns a random vector to the word's index in the embedding matrix.

The `glove_path` variable is the path to the GloVe file, and `glove_embeddings` is the dictionary of GloVe embeddings loaded using the `load_glove_embeddings` function. The `embedding_dim` variable is the dimension of the embeddings, and `embedding_matrix` is the embedding matrix created using the create_embedding_matrix function.

In [3]:
# Load the data
# vocab, word_to_ix, ix_to_word, dataloader = loadfile("data/sample.txt")
vocab, word_to_ix, ix_to_word, dataloader = loadfile("data/lyrics/ed_sheeran.txt")

In [4]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]], dtype=torch.float)
            embeddings_dict[word] = vector
    return embeddings_dict

# Path to the GloVe file
glove_path = 'glove.6B.50d.txt'  # Update this path
glove_embeddings = load_glove_embeddings(glove_path)

def create_embedding_matrix(word_to_ix, embeddings_dict, embedding_dim):
    vocab_size = len(word_to_ix)
    embedding_matrix = torch.zeros((vocab_size, embedding_dim))
    for word, ix in word_to_ix.items():
        if word in embeddings_dict:
            embedding_matrix[ix] = embeddings_dict[word]
        else:
            embedding_matrix[ix] = torch.rand(embedding_dim)  # Random initialization for words not in GloVe
    return embedding_matrix

# Create the embedding matrix
embedding_dim = 50
embedding_matrix = create_embedding_matrix(word_to_ix, glove_embeddings, embedding_dim)

#### TO DO: Defining the RNN Model

In [5]:
#######################################
# TODO: RNNLanguageModel()
#######################################

import math
import torch
import numpy as np
import torch.nn as nn
from collections import Counter
from torch.utils.data import DataLoader, Dataset

class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        """
        RNN language model con GRU y embeddings GloVe.
        """
        super().__init__()
        self.device = torch.device(
            "mps" if torch.backends.mps.is_available()
            else "cuda" if torch.cuda.is_available()
            else "cpu"
        )
        print(f"Using device: {self.device}")

        # Embedding inicializado con GloVe
        # embedding_matrix: torch.Tensor [vocab_size, embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        with torch.no_grad():
            self.embedding.weight.copy_(embedding_matrix)
        self.embedding.weight.requires_grad = False  # congelar embeddings

        # GRU unidireccional
        self.hidden_dim = hidden_dim
        self.rnn = nn.GRU(input_size=embedding_dim,
                          hidden_size=hidden_dim,
                          num_layers=1,
                          batch_first=True)
        
        self.layer_norm = nn.LayerNorm(hidden_dim)

        self.dropout = nn.Dropout(0.2)

        # Capa final a vocab
        self.fc = nn.Linear(hidden_dim, vocab_size)

        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

        self.to(self.device)

    def forward(self, x, hidden=None):
        """
        x: [B, T] índices
        hidden: [1, B, H] opcional
        retorna: logits [B, T, V], hidden
        """
        x = x.to(self.device)
        if hidden is not None:
            hidden = hidden.to(self.device)

        emb = self.embedding(x)            # [B, T, D]
        emb = self.dropout(emb)
        out, hidden = self.rnn(emb, hidden)  # out: [B, T, H]
        out = self.layer_norm(out)  
        out = self.dropout(out)
        logits = self.fc(out)              # [B, T, V]
        return logits, hidden

    @torch.no_grad()
    def generate_sentence(self, sequence, word_to_ix, ix_to_word, num_words, mode='max'):
        """
        Autoregresivo desde la secuencia dada.
        Usa último token como condición y mantiene el hidden.
        """
        self.eval()

        # tokens iniciales
        tokens = sequence.strip().split()
        # map a ids con UNK si no está
        unk = UNK if 'UNK' in globals() else '<unk>'
        start_ids = [word_to_ix.get(w, word_to_ix.get(unk, 0)) for w in tokens]
        if len(start_ids) == 0:
            # si vacío, inicia con <s> si existe
            start_ids = [word_to_ix.get(START, 0)]

        # construir estado inicial ejecutando la secuencia
        x = torch.tensor(start_ids, dtype=torch.long, device=self.device).unsqueeze(0)  # [1, T]
        logits, hidden = self.forward(x)  # oculto después de la secuencia

        generated = []
        last_id = x[0, -1].unsqueeze(0).unsqueeze(0)  # [1,1]

        for _ in range(num_words):
            logits, hidden = self.forward(last_id, hidden)  # [1,1,V]
            probs = torch.softmax(logits[0, -1], dim=-1)

            if mode == 'multinomial':
                next_id = torch.multinomial(probs, num_samples=1)
            else:
                next_id = torch.argmax(probs, dim=-1, keepdim=True)

            wid = next_id.item()
            word = ix_to_word.get(wid, unk)
            generated.append(word)

            # parar si EOS
            if word == EOS:
                break

            last_id = next_id.view(1, 1)

        return generated


#### Training the Model
The following code snippet provided is responsible for training the RNN language model.

In [6]:
#######################################
# TEST: RNNLanguageModel() and training
#######################################
torch.manual_seed(11411)
# Hyperparameters
vocab_size = len(vocab)
embedding_dim = 50
hidden_dim = 32
num_epochs = 50

# Initialize the model, loss function, and optimizer
RNN = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, embedding_matrix)
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(RNN.parameters(), lr=0.005)
optimizer = torch.optim.Adam(RNN.parameters(), lr=0.001)

lines = ""
# Training loop
for epoch in range(num_epochs):
    RNN.train()
    total_loss = 0.0
    num_batches = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(RNN.device)
        targets = targets.to(RNN.device)

        RNN.zero_grad()
        output, _ = RNN(inputs)
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches
    line = f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}, Perplexity: {np.exp(avg_loss)}'
    lines += line + "\n"
    print(line)

Using device: cuda


Epoch 1/50, Loss: 3.93922800560455, Perplexity: 51.378921746668304
Epoch 2/50, Loss: 3.634114541187875, Perplexity: 37.868307221347365
Epoch 3/50, Loss: 3.574944833012208, Perplexity: 35.69265170526054
Epoch 4/50, Loss: 3.5450234194314785, Perplexity: 34.640496593497645
Epoch 5/50, Loss: 3.523637490110477, Perplexity: 33.90754276605612
Epoch 6/50, Loss: 3.509207688482624, Perplexity: 33.42177683207238
Epoch 7/50, Loss: 3.494498620317437, Perplexity: 32.93377148894612
Epoch 8/50, Loss: 3.4836063036604847, Perplexity: 32.57699302062646
Epoch 9/50, Loss: 3.476397206437741, Perplexity: 32.342986810936445
Epoch 10/50, Loss: 3.470231407952658, Perplexity: 32.144180003049264
Epoch 11/50, Loss: 3.4652247427151623, Perplexity: 31.983647057135837
Epoch 12/50, Loss: 3.4590095724175787, Perplexity: 31.785479703768175
Epoch 13/50, Loss: 3.4562750355419523, Perplexity: 31.69867987017016
Epoch 14/50, Loss: 3.4535726746328606, Perplexity: 31.61313423646732
Epoch 15/50, Loss: 3.449323920436473, Perplex

In [17]:
# Set the model to evaluation mode
RNN.eval()

# Example starting sequences
start_sequences = [
    "I love to",
    "The sun is",
    "When I was",
    "She said",
    ""  # Empty string (<s> token)
]

for sequence in start_sequences:
    print(f"\nStarting sequence: '{sequence}'")
    
    # Generate with greedy decoding (max probability)
    generated_max = RNN.generate_sentence(
        sequence=sequence,
        word_to_ix=word_to_ix,
        ix_to_word=ix_to_word,
        num_words=15,
        mode='max'
    )
    
    # Generate with sampling (multinomial)
    generated_sample = RNN.generate_sentence(
        sequence=sequence,
        word_to_ix=word_to_ix,
        ix_to_word=ix_to_word,
        num_words=15,
        mode='multinomial'
    )
    
    # Combine starting sequence with generated words
    if sequence.strip():
        full_text_max = sequence + " " + " ".join(generated_max)
        full_text_sample = sequence + " " + " ".join(generated_sample)
    else:
        full_text_max = " ".join(generated_max)
        full_text_sample = " ".join(generated_sample)
    
    print(f"Mode max: \t{full_text_max}")
    print(f"Mode multinomial: \t{full_text_sample}")



Starting sequence: 'I love to'
Mode max: 	I love to the sky </s>
Mode multinomial: 	I love to be </s>

Starting sequence: 'The sun is'
Mode max: 	The sun is </s>
Mode multinomial: 	The sun is something you want do </s>

Starting sequence: 'When I was'
Mode max: 	When I was </s>
Mode multinomial: 	When I was flicking </s>

Starting sequence: 'She said'
Mode max: 	She said </s>
Mode multinomial: 	She said now s now to my have i </s>

Starting sequence: ''
Mode max: 	and i m just in the way </s>
Mode multinomial: 	man taking in i take me </s>
