In [1]:
# Import required packages
import numpy as np
import torch
# Set random seeds
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x2b073ab5870>

## Question 1: Create a Python list of the first 10 square numbers $(1, 4, 9, ..., 100)$. Convert this list to a NumPy array and reshape it into a $2\times5$ matrix. Print out the matrix as a 2-dimensional array. 


In [2]:
list = [x * x for x in range(1, 11)]
print(list) 

arr = np.array(list)
arr = np.reshape(arr, (2,5))
print(arr)

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
[[  1   4   9  16  25]
 [ 36  49  64  81 100]]


## PyTorch Tensors and Operations (9 pts). 
## Convert the NumPy array from Question 1 into a PyTorch tensor and perform the following operations. The following operations must be done using PyTorch interfaces. 
(a) Multiply all elements by 2 and print out the sum of all elements. (3 pts) 
(b) Create a new 5 × 2 tensor by transposing the original tensor. Print out the transposed tensor. (3 pts) 
(c) Perform matrix multiplication between the original tensor and the transposed tensor. Print out the result. (3 pts) 

In [3]:
tensor = torch.from_numpy(arr)
tensor_a = tensor * 2
print(torch.sum(tensor_a))

tensor_b = torch.transpose(tensor, 0,1)
print(tensor_b)

result = torch.matmul(tensor, tensor_b)
print(result)

tensor(770)
tensor([[  1,  36],
        [  4,  49],
        [  9,  64],
        [ 16,  81],
        [ 25, 100]], dtype=torch.int32)
tensor([[  979,  4604],
        [ 4604, 24354]], dtype=torch.int32)


In [4]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

np.random.seed(42)
torch.manual_seed(42)

def preprocess_text(text):
    """
    Function that preprocesses the string
    """
    preprocessed_text = text.lower().replace("<br />", "")
    return preprocessed_text

def read_file(file_name, label):
    """
    Function that reads a file
    and return the raw text, preprocessed text, and label
    """
    try:
        with open(file_name, "r", encoding='utf-8') as f:
            raw_text = f.read()
    except UnicodeDecodeError:
        # If UTF-8 fails, try with 'latin-1' encoding
        with open(file_name, "r", encoding='latin-1') as f:
            raw_text = f.read()
    
    preprocessed_text = preprocess_text(raw_text)
    return raw_text, preprocessed_text, label

def construct_dataset(dataset_dir):
    """
    Function that loads a dataset
    """
    pos_dir = os.path.join(dataset_dir, "pos")
    neg_dir = os.path.join(dataset_dir, "neg")
    dir_list = [neg_dir, pos_dir]
    raw_contents, contents, labels = [], [], []
    all_pos = os.listdir(pos_dir)
    all_neg = os.listdir(neg_dir)
    data_size = len(all_neg)
    for i in range(data_size):
        for lbl, dataset in enumerate([all_neg, all_pos]):
            cur_path = os.path.join(dir_list[lbl], dataset[i])
            if not os.path.isdir(cur_path):
                raw_content, content, label = read_file(cur_path, lbl)
                contents.append(content)
                raw_contents.append(raw_content)
                labels.append(label)
    return raw_contents, contents, labels

# Load training and testing data
train_dir = './aclImdb/train'
test_dir = './aclImdb/test'
sentiments = ["Negative", "Positive"]

train_raw_contents, train_contents, train_labels = construct_dataset(train_dir)
test_raw_contents, test_contents, test_labels = construct_dataset(test_dir)

# Split train data into training and validation sets
train_contents, val_contents, train_labels, val_labels = train_test_split(
    train_contents, train_labels, test_size=0.2, random_state=42)

print(f"Train size: {len(train_contents)}, "
      f"\nVal size: {len(val_contents)}, "
      f"\nTest size: {len(test_contents)}")

# show the first review and its sentiment label
print("Review: ", train_raw_contents[0])
print("Sentiment: ", sentiments[train_labels[0]])

Train size: 20000, 
Val size: 5000, 
Test size: 25000
Review:  Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.
Sentiment:  Positive


In [5]:
# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_contents).toarray()
X_val = vectorizer.transform(val_contents).toarray()
X_test = vectorizer.transform(test_contents).toarray()

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create DataLoader objects
train_dataset = IMDBDataset(X_train, y_train)
val_dataset = IMDBDataset(X_val, y_val)
test_dataset = IMDBDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## TODO: Implement a simple two-layer neural network using PyTorch's nn.Module for binary classification. The network should have:

Then initialize the model where input_dim equals the shape of the data in
X_train, the Binary Entropy loss function, and the Adam optimizer with 0.001
learning rate

In [12]:
import torch.optim as optim  # Add this import statement

# TODO: Define a simple 2-layer neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.hidden = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()                        # ReLU activation function
        self.output = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()                  # Sigmoid activation function

    def forward(self, x):
        # Forward pass through the network
        x = self.hidden(x)
        x = self.relu(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x

input_dim = X_train.shape[1]
model = SimpleNN(input_dim)
criterion = nn.BCELoss()                             # Binary entropy loss function 
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam optimizer -> 0.001 lr



## Train the neural network for 10 epochs, and print out the training loss and accuracy on the training set at the end of each epoch.

In [13]:
## https://pytorch.org/tutorials/beginner/introyt/trainingyt.html#the-training-loop

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    # TODO: Complete the training loop, update the training loss and accuracy
    for i, (texts, labels) in enumerate(train_loader):
        batch_size = texts.size(0)
        outputs = model(texts)
        optimizer.zero_grad()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * batch_size
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum()
        
        train_loss /= len(train_loader)
        train_acc = correct / total

    print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Train Acc: {train_acc:.4f}, ")

Epoch 1/10, Train Loss: 0.0147, Train Acc: 0.8634, 
Epoch 2/10, Train Loss: 0.0072, Train Acc: 0.9292, 
Epoch 3/10, Train Loss: 0.0107, Train Acc: 0.9492, 
Epoch 4/10, Train Loss: 0.0056, Train Acc: 0.9628, 
Epoch 5/10, Train Loss: 0.0027, Train Acc: 0.9734, 
Epoch 6/10, Train Loss: 0.0002, Train Acc: 0.9854, 
Epoch 7/10, Train Loss: 0.0002, Train Acc: 0.9867, 
Epoch 8/10, Train Loss: 0.0014, Train Acc: 0.9952, 
Epoch 9/10, Train Loss: 0.0003, Train Acc: 0.9988, 
Epoch 10/10, Train Loss: 0.0001, Train Acc: 0.9998, 


In [14]:
# TODO: Evaluate the model on the test set
model.eval()
test_correct = 0
with torch.no_grad():
    for i, (texts, labels) in enumerate(test_loader):
        outputs = model(texts)
        predicted = (outputs > 0.5).float()
        test_correct += (predicted == labels).sum()

test_acc = test_correct / len(test_loader.dataset)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.8510
