In [30]:
# Program for sentiment analysis of synthetic Rotten Tomatoes reviews for The Matrix
# Uses generated dataset of 50 reviews (48 movie reviews + 2 reference texts)
# Implements: tokenization, token embeddings, sentiment prediction with frozen BERT and custom layer
# Requirements: pip install transformers torch pandas

# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import csv
from sklearn.model_selection import train_test_split

In [31]:
# Load dataset
df = pd.read_csv('matrix_reviews.csv', encoding='utf-8')
df[:5]

Unnamed: 0,id,phrase,sentiment
0,1,"The Matrix is great, revolutionary sci-fi that...",positive
1,2,"Terrible movie, The Matrix’s plot is so confus...",negative
2,3,"The Matrix was okay, entertaining but not life...",neutral
3,4,Great visuals and action in The Matrix make it...,positive
4,5,Hated The Matrix; terrible pacing and a story ...,negative


In [32]:
# Filter out reference texts (id 49, 50) for sentiment prediction
df_reviews = df[df['id'] <= 48].copy()
texts = df['phrase'].tolist()  # All texts for tokenization/embeddings
labels = df_reviews['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2}).values  # Encode labels

In [33]:
# Initialize BERT tokenizer and model (frozen)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # Load tokenizer
model = AutoModel.from_pretrained('bert-base-uncased')          # Load model for embeddings
model.eval()  # Set to evaluation mode (no training)

# Step 1: Tokenization - Process all texts and store tokens
all_tokens = []
for text in texts[:5]:  # Show first 5 for brevity
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)  # Tokenize
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])              # Get tokens
    all_tokens.append(tokens)
    print(f"\nTokens for '{text}':")
    print(tokens)
    print(f"Token length {len(tokens)}")


Tokens for 'The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown':
['[CLS]', 'the', 'matrix', 'is', 'great', ',', 'revolutionary', 'sci', '-', 'fi', 'that', 'red', '##efined', 'action', 'films', '!', '#', 'mind', '##bl', '##own', '[SEP]']
Token length 21

Tokens for 'Terrible movie, The Matrix’s plot is so confusing and overrated. #disappointed':
['[CLS]', 'terrible', 'movie', ',', 'the', 'matrix', '’', 's', 'plot', 'is', 'so', 'confusing', 'and', 'over', '##rated', '.', '#', 'disappointed', '[SEP]']
Token length 19

Tokens for 'The Matrix was okay, entertaining but not life-changing. #movies':
['[CLS]', 'the', 'matrix', 'was', 'okay', ',', 'entertaining', 'but', 'not', 'life', '-', 'changing', '.', '#', 'movies', '[SEP]']
Token length 16

Tokens for 'Great visuals and action in The Matrix make it a must-watch classic. #scifi':
['[CLS]', 'great', 'visuals', 'and', 'action', 'in', 'the', 'matrix', 'make', 'it', 'a', 'must', '-', 'watch', 'classic', '.', '#'

In [34]:
# Step 2: Token Embeddings - Generate embeddings for all texts
all_embeddings = []
for text in texts[:5]:  # Show first 5 for brevity
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)  # Tokenize
    with torch.no_grad():                                                        # Frozen BERT
        outputs = model(**inputs)                                                # Get embeddings
    embeddings = outputs.last_hidden_state[0]                                     # Extract vectors
    all_embeddings.append(embeddings)
    print(f"\nEmbeddings for '{text}' (first token, 5 numbers):")
    print(embeddings[1][:5].numpy())


Embeddings for 'The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown' (first token, 5 numbers):
[ 0.2202626  -0.18178469 -0.46809724  0.1393926   0.39181736]

Embeddings for 'Terrible movie, The Matrix’s plot is so confusing and overrated. #disappointed' (first token, 5 numbers):
[0.7884245  0.652363   0.05890564 0.18900512 0.04291685]

Embeddings for 'The Matrix was okay, entertaining but not life-changing. #movies' (first token, 5 numbers):
[ 0.16382633 -0.20111704 -0.42153656  0.16307226 -0.13568835]

Embeddings for 'Great visuals and action in The Matrix make it a must-watch classic. #scifi' (first token, 5 numbers):
[ 0.5706272   0.07817388 -0.06764057  0.08270969  0.17585659]

Embeddings for 'Hated The Matrix; terrible pacing and a story that drags on forever. #fail' (first token, 5 numbers):
[ 0.57143813  0.5018263   0.7289898  -0.03643154 -0.18432716]


In [35]:
all_embeddings[1].shape

torch.Size([19, 768])

In [36]:
# Step 3: Sentiment Prediction - Train custom layer on frozen BERT embeddings
# Custom classifier model
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim=768, num_classes=3):
        super(SentimentClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)  # Single dense layer
        self.softmax = nn.Softmax(dim=1) # each column adds to 1

    def forward(self, x):
        x = self.fc(x)
        x = self.softmax(x)
        return x

### Sentences and 3D dimension. Assume
- 3 sentences, 
- 2 words, 
- each word has 5 features, 

![shapes](https://www.tensorflow.org/static/guide/images/tensor/3-axis_front.png)

#### What is dimension of sentence embeddings?
- (3,5)

`nn.mean(data, dim=1)`

In [None]:
# Batch all phrases together
inputs = tokenizer(
    list(df_reviews['phrase']),  # all texts at once
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128
)

with torch.no_grad():
    outputs = model(**inputs)

# outputs.last_hidden_state: (batch_size, seq_len, hidden_dim)
# Mean-pool over tokens (dim=1)
review_embeddings = torch.mean(outputs.last_hidden_state, dim=1)  # (batch_size, 768)

# Convert labels to tensor
review_labels = torch.tensor(labels, dtype=torch.long)


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Split data into train and test sets
train_emb, test_emb, train_labels, test_labels, train_texts, test_texts = train_test_split(
    review_embeddings, review_labels, df_reviews['phrase'].tolist(),
    test_size=0.2, random_state=42
)

# Initialize custom classifier
classifier = SentimentClassifier()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10
classifier.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = classifier(train_emb)  # Forward pass
    loss = criterion(outputs, train_labels)  # Compute loss
    loss.backward()  # Backpropagate
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Predict sentiments for test set
classifier.eval()
with torch.no_grad():
    test_outputs = classifier(test_emb)
    y_pred = torch.argmax(test_outputs, dim=1).numpy()

# Map numeric labels back to text
label_map = {1: 'positive', 0: 'negative', 2: 'neutral'}
y_test_text = [label_map[y.item()] for y in test_labels]
y_pred_text = [label_map[y] for y in y_pred]

# Print prediction results
print("\nSentiment Prediction Results (Test Set):")
print("ID | Review Text                              | Actual    | Predicted")
print("---|-----------------------------------------|-----------|----------")
test_indices = df_reviews.index[df_reviews['phrase'].isin(test_texts)].tolist()
for idx, actual, pred, text in zip(test_indices, y_test_text, y_pred_text, test_texts):
    print(f"{idx+1:<2} | {text:<40} | {actual:<9} | {pred}")

Epoch 1, Loss: 1.1128
Epoch 2, Loss: 1.0926
Epoch 3, Loss: 1.0726
Epoch 4, Loss: 1.0530
Epoch 5, Loss: 1.0337
Epoch 6, Loss: 1.0149
Epoch 7, Loss: 0.9966
Epoch 8, Loss: 0.9793
Epoch 9, Loss: 0.9629
Epoch 10, Loss: 0.9476

Sentiment Prediction Results (Test Set):
ID | Review Text                              | Actual    | Predicted
---|-----------------------------------------|-----------|----------
5  | Watched The Matrix, it’s fine, nothing special. #cinema | neutral   | positive
13 | The Matrix is awesome, iconic and thrilling! #movies | positive  | positive
20 | The Matrix is terrible, overly complicated and dull. #disappointed | negative  | negative
25 | Great performances, The Matrix is a sci-fi triumph! #scifi | positive  | positive
26 | Terrible pacing, The Matrix drags in the middle. #boring | negative  | negative
27 | Saw The Matrix, neutral, it’s alright. #film | neutral   | positive
28 | The Matrix is fine, good action but confusing plot. #cinema | neutral   | positive
38 | 