In [26]:
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Penn Treebank (text-only) corpus, fetched through `datasets.load_dataset`.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# own loading script run locally -- confirm that this is acceptable here.
ptb = load_dataset('ptb-text-only/ptb_text_only', trust_remote_code=True)

# Pull the three splits out of the returned mapping in a single unpacking.
train, val, test = (ptb[split_name] for split_name in ('train', 'validation', 'test'))

In [27]:
def load_glove_embeddings(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to look like ``word v1 v2 ... vN``
    (whitespace separated). The first field is used as the key and the
    remaining fields are parsed as a ``float32`` NumPy array.

    Args:
        glove_file: Path to the GloVe text file.

    Returns:
        dict mapping each word (str) to its 1-D ``np.float32`` vector.

    Raises:
        FileNotFoundError: If ``glove_file`` does not exist.
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove_embeddings = {}

    # Read as UTF-8 explicitly so non-ASCII tokens do not depend on the
    # platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            word = values[0]
            # The original passed `dtypre=`, which raised TypeError on the first line.
            glove_embeddings[word] = np.asarray(values[1:], dtype=np.float32)

    return glove_embeddings

In [28]:
def preprocess_text(text, glove_embeddings, unknown_token='UNK', stopwords=None):
    """Turn a raw string into a list of embedding vectors, one per token.

    The text is lower-cased and split with ``word_tokenize``. When
    ``stopwords`` is given, tokens contained in it are dropped. Every
    remaining token is looked up in ``glove_embeddings``; a token that is
    missing falls back to the vector stored under ``unknown_token``, or to
    an all-zeros array shaped like the first vector in ``glove_embeddings``
    when ``unknown_token`` is missing as well.

    Args:
        text: Input string.
        glove_embeddings: Mapping from token to vector.
        unknown_token: Key used for out-of-vocabulary tokens.
        stopwords: Optional container of tokens to discard.

    Returns:
        list of vectors, in token order.
    """
    words = word_tokenize(text.lower())
    if stopwords is not None:
        words = [w for w in words if w not in stopwords]

    vectors = []
    for w in words:
        if w in glove_embeddings:
            vec = glove_embeddings[w]
        else:
            # Zero vector shaped like the first stored embedding; used only
            # when `unknown_token` itself has no entry.
            zero_fallback = np.zeros_like(next(iter(glove_embeddings.values())))
            vec = glove_embeddings.get(unknown_token, zero_fallback)
        vectors.append(vec)

    return vectors

In [29]:
class TextDataset(Dataset):
    """Dataset that converts each text item into a tensor of GloVe vectors.

    Args:
        text_data: Indexable collection of items. An item may be a plain
            string, or a dict row from which the text is read under
            ``text_field``.
        glove_embeddings: Mapping from token to embedding vector, passed to
            ``preprocess_text``.
        text_field: Key used to extract the text when an item is a dict.
            NOTE(review): the PTB text-only dataset is presumed to store its
            text under 'sentence' -- confirm against the dataset card.
    """

    def __init__(self, text_data, glove_embeddings, text_field='sentence'):
        self.text_data = text_data
        self.glove_embeddings = glove_embeddings
        self.text_field = text_field

    def __len__(self):
        """Return the number of items in the underlying collection."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return the embedding sequence for item ``idx`` as a float32 tensor."""
        item = self.text_data[idx]
        # Dict rows carry the text under a column name; plain strings are
        # used as-is so list-of-str inputs keep working.
        text = item[self.text_field] if isinstance(item, dict) else item
        # The original read `self.glove_embedings` (typo), which raised
        # AttributeError on the first access.
        embedding_sequence = preprocess_text(text, self.glove_embeddings)
        # Build one contiguous array first: torch.tensor() on a Python list of
        # ndarrays converts element by element and is very slow.
        return torch.from_numpy(np.asarray(embedding_sequence, dtype=np.float32))

In [30]:
class LearnedPositionalEncoding(nn.Module):
    """Adds a trainable per-position vector to every sequence in a batch.

    Args:
        d_model: Size of each position vector (must match the last dim of
            the input).
        max_len: Number of distinct positions the embedding table holds.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the position vectors for indices ``0..seq_len-1``.

        ``x`` is indexed as (batch, seq_len, d_model); the (1, seq_len, d_model)
        position table slice broadcasts over the batch dimension.
        """
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        return x + self.pos_embedding(positions)

In [31]:
class TransformerLM(nn.Module):
    """Causally masked Transformer language model over integer token ids.

    Pipeline: token embedding -> learned positional encoding -> stack of
    Transformer encoder layers with a square subsequent (causal) mask ->
    linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            output logits.
        d_model: Embedding / hidden size.
        n_head: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside each encoder layer.
        dim_feedforward: Hidden size of each layer's feed-forward sublayer.
    """

    def __init__(self, vocab_size, d_model, n_head, num_layers, dropout=0.1,
                 dim_feedforward=2048):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = LearnedPositionalEncoding(d_model)

        # One layer template; nn.TransformerEncoder clones it `num_layers` times.
        # batch_first=True is required because forward() feeds
        # (batch, seq_len, d_model) tensors and sizes the mask from dim 1.
        # (The original also built a second, unused layer; it is dropped.)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )

        # Output layer: hidden states -> vocabulary logits.
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.vocab_size = vocab_size

    def forward(self, x):   # x: (batch, seq_len) integer token ids
        """Return next-token logits of shape (batch, seq_len, vocab_size)."""
        # Embedding and positional encoding
        x = self.embedding(x)     # (batch, seq_len, d_model)
        x = self.pos_encoding(x)  # (batch, seq_len, d_model)

        # Causal mask so position i can only attend to positions <= i.
        mask = nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)   # (seq_len, seq_len)
        out = self.transformer_encoder(x, mask=mask)    # (batch, seq_len, d_model)

        # The original returned `out` directly, leaving fc_out unused and
        # handing d_model-sized hidden states to the loss instead of logits.
        return self.fc_out(out)   # (batch, seq_len, vocab_size)

In [32]:
def compute_perplexity(model, dataloader, criterion, device):
    """Compute next-token perplexity of ``model`` over ``dataloader``.

    For every batch of token ids shaped (batch, seq_len), the model output at
    position ``t`` is scored against the token at position ``t + 1``. The
    per-batch losses returned by ``criterion`` are averaged over batches and
    exponentiated.

    Args:
        model: Module mapping (batch, seq_len) token ids to
            (batch, seq_len, num_classes) scores.
        dataloader: Iterable yielding token-id tensors.
        criterion: Loss called as ``criterion(scores_2d, targets_1d)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``exp(mean batch loss)`` as a NumPy float.

    Raises:
        ValueError: If no batch contained at least two tokens per sequence,
            so there was nothing to score.

    Side effects:
        Leaves ``model`` in eval mode.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            if batch.size(1) < 2:
                # A single-token sequence has no "next token" to predict.
                continue
            output = model(batch)
            # Shift by one: scoring position t against token t (as the original
            # did) lets a causal model simply copy its own input.
            # reshape (not view) because the sliced tensors are non-contiguous.
            scores = output[:, :-1].reshape(-1, output.shape[-1])
            targets = batch[:, 1:].reshape(-1)
            loss = criterion(scores, targets)
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # The original divided by zero here.
        raise ValueError("compute_perplexity: dataloader yielded no scorable batches")

    perplexity = np.exp(total_loss / num_batches)
    return perplexity

In [33]:
# NOTE(review): load_glove_embeddings() opens this value as a file path, and the
# recorded run of this cell ended in FileNotFoundError. Point it at an actual
# GloVe text file on disk.
glove_path = 'glove-twitter-25'
glove_embeddings = load_glove_embeddings(glove_path)

dataset = TextDataset(train, glove_embeddings)
# NOTE(review): items are per-sentence sequences; with batch_size=2 the default
# collate presumably needs equal-length items -- a padding collate_fn is likely
# required. Confirm before relying on this loader.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

input_dim = len(glove_embeddings)
model_dim = 100
n_heads = 2
num_layers = 2
n_classes = len(glove_embeddings)

# Resolve the device once instead of rebuilding it at every use
# (the original also misspelled `torch.cuda` as `torch.cude` here).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `n_classes` used to be passed as the 5th positional argument, which is
# `dropout` in TransformerLM's signature; the model sizes its output from
# `vocab_size` (= input_dim), so it is not passed at all.
model = TransformerLM(input_dim, model_dim, n_heads, num_layers)
model.to(device)

# `optim` was never imported; reach the optimizer through the `torch` module.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# NOTE(review): TextDataset yields float32 embedding vectors, while TransformerLM
# feeds its input to nn.Embedding and the loss needs integer class targets. The
# loop below is written for (batch, seq_len) integer token-id batches; the
# dataset must be changed to produce those before this can train.

#Train the model
model.train()
for epoch in range(2):
    total_loss = 0
    for batch in dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        output = model(batch)
        # Next-token objective: score position t against token t+1.
        # (The original also had a misplaced parenthesis that passed the
        # targets into .view() instead of into criterion.)
        loss = criterion(
            output[:, :-1].reshape(-1, output.shape[-1]),
            batch[:, 1:].reshape(-1),
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

perplexity = compute_perplexity(model, dataloader, criterion, device)
print(f"Perplexity: {perplexity}")


FileNotFoundError: [Errno 2] No such file or directory: 'glove-twitter-25'