In [3]:
!pip install spacy torch numpy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:

import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Load the small English spaCy pipeline; its dependency parser supplies the
# labels the RNN is trained to reproduce.
nlp = spacy.load("en_core_web_sm")

sentences = [
    "I love machine learning",
    "Natural language processing is fun",
    "The cat sat on the mat",
    "Dogs are loyal animals"
]

vocab = set()
dep_relations = set()

tokenized_sentences = []
dependency_labels = []

for sentence in sentences:
    doc = nlp(sentence)
    # Extract each list once and reuse it for both the per-sentence record
    # and the global inventories.
    tokens = [token.text for token in doc]
    deps = [token.dep_ for token in doc]
    tokenized_sentences.append(tokens)
    dependency_labels.append(deps)
    vocab.update(tokens)
    dep_relations.update(deps)

# Enumerate in sorted order: iterating a set of strings directly yields an
# order that can change between interpreter runs (string hash randomization),
# which would make the id assignment differ on every kernel restart.
word2idx = {word: i for i, word in enumerate(sorted(vocab))}
dep2idx = {dep: i for i, dep in enumerate(sorted(dep_relations))}

# The padding symbol takes the last id in each mapping.
word2idx['<PAD>'] = len(word2idx)
dep2idx['<PAD>'] = len(dep2idx)

class DependencyRNN(nn.Module):
    """Per-token dependency-label tagger: embedding -> RNN -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        dep_size: Number of output classes (dependency labels).
        embed_size: Dimensionality of the token embeddings.
        hidden_size: Dimensionality of the RNN hidden state.
        padding_idx: Optional token id whose embedding is kept at zero and
            excluded from gradient updates. ``None`` (the default) treats
            every id as a regular token, matching the previous behavior.
    """

    def __init__(self, vocab_size, dep_size, embed_size=50, hidden_size=128,
                 padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature).
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, dep_size)

    def forward(self, x):
        """Return unnormalized label scores for every position.

        Args:
            x: Integer tensor of token ids, laid out (batch, seq).

        Returns:
            Float tensor of shape (batch, seq, dep_size).
        """
        x = self.embedding(x)
        out, _ = self.rnn(x)  # the final hidden state is not needed
        out = self.fc(out)
        return out

# Fix the RNG before any weights are created so that initialization, and
# therefore the whole training run, is reproducible across kernel restarts.
torch.manual_seed(42)

vocab_size = len(word2idx)
dep_size = len(dep2idx)

model = DependencyRNN(vocab_size, dep_size)
# Positions whose target is the padding label contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=dep2idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

def prepare_data(sentences, labels, word2idx, dep2idx):
    """Encode tokens and labels as ids and pad them into two batch tensors.

    Args:
        sentences: Iterable of token sequences.
        labels: Iterable of label sequences, parallel to ``sentences``.
        word2idx: Mapping token -> id; must contain ``'<PAD>'``.
        dep2idx: Mapping label -> id; must contain ``'<PAD>'``.

    Returns:
        ``(X_padded, y_padded)``: two ``torch.long`` tensors laid out
        (batch, longest sequence), filled with the respective ``'<PAD>'`` id.

    Raises:
        KeyError: If a token or label is missing from its mapping.
    """
    def encode(sequences, mapping):
        # One 1-D long tensor per input sequence.
        encoded = []
        for seq in sequences:
            ids = [mapping[item] for item in seq]
            encoded.append(torch.tensor(ids, dtype=torch.long))
        return encoded

    token_tensors = encode(sentences, word2idx)
    label_tensors = encode(labels, dep2idx)

    return (
        pad_sequence(token_tensors, batch_first=True, padding_value=word2idx['<PAD>']),
        pad_sequence(label_tensors, batch_first=True, padding_value=dep2idx['<PAD>']),
    )

# Encode the whole corpus as a single padded batch.
X, y = prepare_data(tokenized_sentences, dependency_labels, word2idx, dep2idx)

epochs = 100

# Targets never change, so flatten them once up front.
y_flat = y.reshape(-1)

# Nothing inside the loop leaves training mode, so enabling it once suffices.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()

    # Collapse (batch, seq, dep_size) -> (batch * seq, dep_size) so each token
    # position is scored as an independent classification example.
    output = model(X).reshape(-1, dep_size)

    loss = criterion(output, y_flat)
    loss.backward()
    optimizer.step()

    # Report every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

def predict(sentence, model, word2idx, dep2idx):
    """Predict one dependency label per whitespace-separated word.

    Args:
        sentence: Raw text; split on whitespace to obtain tokens.
        model: Module mapping a (1, seq) id tensor to (1, seq, n_labels) scores.
        word2idx: Mapping token -> id; must contain ``'<PAD>'``.
        dep2idx: Mapping label -> id.

    Returns:
        List of label strings, one per word. Empty list when ``sentence``
        contains no words.

    Note:
        Words missing from ``word2idx`` are mapped to the ``'<PAD>'`` id.
        Tokenization here is a plain ``str.split``, whereas the training
        tokens in this notebook come from spaCy, so punctuation or
        contractions may not line up with the vocabulary.
    """
    model.eval()

    words = sentence.split()
    if not words:
        # A zero-length sequence cannot be fed through the recurrent layer.
        return []

    pad_id = word2idx['<PAD>']
    tokens = [word2idx.get(word, pad_id) for word in words]
    X = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # Batch of 1

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(X)
    predicted = output.argmax(dim=2).squeeze(0).tolist()

    idx2dep = {i: dep for dep, i in dep2idx.items()}
    predicted_deps = [idx2dep.get(idx, '<PAD>') for idx in predicted]

    return predicted_deps

# Smoke-test the trained tagger on a short sentence; "enjoy" is not in the
# training sentences defined above.
test_sentence = "I enjoy learning"
predicted_dependencies = predict(test_sentence, model, word2idx, dep2idx)

print(
    f"Sentence: {test_sentence}",
    f"Predicted Dependencies: {predicted_dependencies}",
    sep="\n",
)


Epoch [10/100], Loss: 1.3056
Epoch [20/100], Loss: 0.5207
Epoch [30/100], Loss: 0.1925
Epoch [40/100], Loss: 0.0800
Epoch [50/100], Loss: 0.0419
Epoch [60/100], Loss: 0.0271
Epoch [70/100], Loss: 0.0200
Epoch [80/100], Loss: 0.0159
Epoch [90/100], Loss: 0.0132
Epoch [100/100], Loss: 0.0113
Sentence: I enjoy learning
Predicted Dependencies: ['nsubj', 'ROOT', 'compound']
