In [1]:
!pip install geoopt

Collecting geoopt
  Downloading geoopt-0.5.0-py3-none-any.whl.metadata (6.7 kB)
Downloading geoopt-0.5.0-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geoopt
Successfully installed geoopt-0.5.0


In [2]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
import torch
import torch.nn as nn
import geoopt
from nltk.corpus import treebank
from nltk.tree import Tree
from collections import Counter

def extract_sequences(tree):
    words = tree.leaves()
    labels = []

    def traverse(t):
        if isinstance(t, Tree):
            if len(t) == 1 and isinstance(t[0], str):
                labels.append(t.label())
            else:
                for child in t:
                    traverse(child)

    traverse(tree)
    return words, labels

trees = treebank.parsed_sents()
word_sequences, label_sequences = [], []

for tree in trees:
    words, labels = extract_sequences(tree)
    word_sequences.append(words)
    label_sequences.append(labels)

word_vocab = Counter(word for words in word_sequences for word in words)
label_vocab = Counter(label for labels in label_sequences for label in labels)

for words, labels in zip(word_sequences, label_sequences):
    assert len(words) == len(labels), f"Length mismatch: {len(words)} words, {len(labels)} labels"

word2idx = {word: i for i, word in enumerate(word_vocab.keys(), start=1)}
label2idx = {label: i for i, label in enumerate(label_vocab.keys(), start=1)}

word2idx['<PAD>'] = 0
word2idx['<UNK>'] = len(word2idx)
label2idx['<PAD>'] = 0
label2idx['<UNK>'] = len(label2idx)

def convert_to_indices(sequences, vocab):
    return [[vocab.get(token, vocab['<UNK>']) for token in seq] for seq in sequences]

word_sequences_idx = convert_to_indices(word_sequences, word2idx)
label_sequences_idx = convert_to_indices(label_sequences, label2idx)


In [4]:
class HyperbolicEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, curvature=1.0):
        super(HyperbolicEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.manifold = geoopt.PoincareBall(c=curvature)

    def forward(self, x):
        euclidean_embeddings = self.embedding(x)
        hyperbolic_embeddings = self.manifold.expmap0(euclidean_embeddings)
        return hyperbolic_embeddings

class HyperbolicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, curvature=1.0):
        super(HyperbolicRNN, self).__init__()
        self.hyperbolic_embedding = HyperbolicEmbedding(vocab_size, embedding_dim, curvature)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.hyperbolic_embedding(x)
        rnn_out, hidden = self.rnn(embedded)
        output = self.fc(rnn_out)
        return output

vocab_size = len(word2idx)
label_vocab_size = len(label2idx)
embedding_dim = 16
hidden_dim = 32
curvature = 1.0

model = HyperbolicRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=label_vocab_size,
    curvature=curvature
)

def pad_sequences(sequences, max_len, pad_value=0):
    return [seq + [pad_value] * (max_len - len(seq)) for seq in sequences]

max_len = max(len(seq) for seq in word_sequences_idx)
word_sequences_padded = torch.tensor(pad_sequences(word_sequences_idx, max_len), dtype=torch.long)
label_sequences_padded = torch.tensor(pad_sequences(label_sequences_idx, max_len), dtype=torch.long)

train_data = list(zip(word_sequences_padded, label_sequences_padded))

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [6]:
epochs = 50
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for input_seq, target_seq in train_data:
        optimizer.zero_grad()
        output = model(input_seq.unsqueeze(0))
        loss = criterion(output.view(-1, label_vocab_size), target_seq.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/50, Loss: 4148.1001
Epoch 2/50, Loss: 3154.3222
Epoch 3/50, Loss: 2528.5448
Epoch 4/50, Loss: 2092.5789
Epoch 5/50, Loss: 1766.1781
Epoch 6/50, Loss: 1509.7136
Epoch 7/50, Loss: 1302.3645
Epoch 8/50, Loss: 1132.3349
Epoch 9/50, Loss: 990.5759
Epoch 10/50, Loss: 870.2556
Epoch 11/50, Loss: 766.6083
Epoch 12/50, Loss: 676.2856
Epoch 13/50, Loss: 596.9731
Epoch 14/50, Loss: 526.9695
Epoch 15/50, Loss: 464.7245
Epoch 16/50, Loss: 409.4416
Epoch 17/50, Loss: 360.3025
Epoch 18/50, Loss: 316.8306
Epoch 19/50, Loss: 278.5631
Epoch 20/50, Loss: 245.1176
Epoch 21/50, Loss: 215.8396
Epoch 22/50, Loss: 189.8643
Epoch 23/50, Loss: 167.1349
Epoch 24/50, Loss: 146.9950
Epoch 25/50, Loss: 130.1513
Epoch 26/50, Loss: 115.1602
Epoch 27/50, Loss: 101.2121
Epoch 28/50, Loss: 90.7339
Epoch 29/50, Loss: 80.5238
Epoch 30/50, Loss: 74.3471
Epoch 31/50, Loss: 67.1306
Epoch 32/50, Loss: 60.6303
Epoch 33/50, Loss: 56.9719
Epoch 34/50, Loss: 52.6460
Epoch 35/50, Loss: 47.9382
Epoch 36/50, Loss: 46.5309
Ep

In [14]:
def evaluate(model, input_seq):
    model.eval()
    with torch.no_grad():
        output = model(input_seq.unsqueeze(0))
        predicted = output.argmax(dim=-1)
    return predicted

In [16]:
custom_sequence = "Christian are running in the park"
custom_sequence = custom_sequence.split(" ")
input_indices = [word2idx[word] for word in custom_sequence if word in word2idx]

sequence_length = 6

input_indices_padded = input_indices + [word2idx["<PAD>"]] * (sequence_length - len(input_indices))

input_indices_padded = input_indices_padded[:sequence_length]

input_tensor = torch.tensor(input_indices_padded)

predicted_labels = evaluate(model, input_tensor)

print("Input Words:", custom_sequence)
print("Predicted Labels:", [label for label, idx in label2idx.items() if idx in predicted_labels[0]])

Input Words: ['Christian', 'are', 'running', 'in', 'the', 'park']
Predicted Labels: ['NNP', 'DT', 'NN', 'IN', 'VBG', 'PRP$']
