<a href="https://colab.research.google.com/github/ghmaria-ch/fsopart4/blob/main/RecursiveNN_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install nltk




In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
import numpy as np
import random

from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize


In [10]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.raw(fileid), category))

random.shuffle(documents)

print("Total samples:", len(documents))


Total samples: 2000


In [12]:
all_words = []

for text, _ in documents:
    words = word_tokenize(text.lower())
    all_words.extend(words)

all_words = sorted(set(all_words))

word_to_ix = {word: i for i, word in enumerate(all_words)}

print("Vocabulary size:", len(word_to_ix))



Vocabulary size: 46462


In [13]:
class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None


In [14]:
def build_tree(words):
    if len(words) == 1:
        index = word_to_ix.get(words[0], 0)
        tensor = torch.zeros(1, len(word_to_ix))
        tensor[0][index] = 1
        return TreeNode(tensor)

    mid = len(words) // 2

    node = TreeNode(None)
    node.left = build_tree(words[:mid])
    node.right = build_tree(words[mid:])
    return node


In [15]:
class RecursiveNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size):
        super(RecursiveNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Linear(vocab_size, hidden_size)
        self.combine = nn.Linear(2 * hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, output_size)

    def forward(self, node):
        if node.left is None and node.right is None:
            return torch.tanh(self.embedding(node.value))

        left_hidden = self.forward(node.left)
        right_hidden = self.forward(node.right)

        combined = torch.cat((left_hidden, right_hidden), dim=1)
        return torch.tanh(self.combine(combined))


In [16]:
vocab_size = len(word_to_ix)
hidden_size = 128
output_size = 2

model = RecursiveNN(vocab_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [17]:
for epoch in range(3):
    total_loss = 0

    for text, label in documents[:100]:
        words = word_tokenize(text.lower())[:10]  # limit words

        tree = build_tree(words)

        optimizer.zero_grad()

        hidden = model(tree)
        output = model.classifier(hidden)

        target = torch.tensor([0 if label == 'neg' else 1])

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 70.1044
Epoch 2, Loss: 38.3297
Epoch 3, Loss: 2.0921


In [21]:
sample_text = "This movie was absolutely fantastic and amazing"
words = word_tokenize(sample_text.lower())

tree = build_tree(words)
hidden = model(tree)
output = model.classifier(hidden)

prediction = torch.argmax(output)

print("Prediction:", "Positive" if prediction.item() == 1 else "Negative")


Prediction: Positive


In [20]:
sample_text = "This movie was absolutely terrible, boring, and a complete waste of time."
words = word_tokenize(sample_text.lower())

tree = build_tree(words)
hidden = model(tree)
output = model.classifier(hidden)

prediction = torch.argmax(output)

print("Prediction:", "Positive" if prediction.item() == 1 else "Negative")


Prediction: Negative
