In [2]:
from preprocess_functions import *
import torch

# PARAMETRY
current_dir = os.getcwd()
INPUT_FILE = os.path.join(current_dir, "sentiment.txt")
OUTPUT_DIR = current_dir

In [3]:
X_text , Y = get_data(INPUT_FILE)
token_idx,num_tokens = tokenize_text(X_text)
max_tokens = get_max(X_text)

input_sequences = create_sequences(X_text, token_idx,max_tokens)
input_sequences, type(input_sequences)

(array([[  0,   0,   0, ...,  91, 120,  70],
        [  0,   0,   0, ...,  91, 120,  70],
        [  0,   0,   0, ...,  15,  20, 286],
        ...,
        [  0,   0,   0, ..., 360, 190,  69],
        [  0,   0,   0, ..., 351, 363, 428],
        [  0,   0,   0, ...,  91, 120,  70]]),
 numpy.ndarray)

In [4]:
input_sequences = torch.from_numpy(input_sequences)
input_sequences, type(input_sequences)

(tensor([[  0,   0,   0,  ...,  91, 120,  70],
         [  0,   0,   0,  ...,  91, 120,  70],
         [  0,   0,   0,  ...,  15,  20, 286],
         ...,
         [  0,   0,   0,  ..., 360, 190,  69],
         [  0,   0,   0,  ..., 351, 363, 428],
         [  0,   0,   0,  ...,  91, 120,  70]], dtype=torch.int32),
 torch.Tensor)

In [5]:
input_sequences.shape

torch.Size([7086, 40])

In [24]:
labels = np.array(Y)
labels = torch.tensor(labels).float()#.unsqueeze(1).float()
labels.shape

torch.Size([7086])

In [25]:
input_sequences[1], labels[1]

(tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0, 202,  91, 120,  70],
        dtype=torch.int32),
 tensor(1.))

In [26]:
X_text[1] , Y[1] 

('I love Brokeback Mountain....', 1)

In [59]:
import torch.nn as nn
import torch.optim as optim


class FeedForward(nn.Module):
    def __init__(self, num_tokens, max_tokens):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_tokens, embedding_dim=8, max_norm=max_tokens)
        self.gru1 = nn.GRU(input_size=8, hidden_size=16, batch_first = True)
        self.gru2 = nn.GRU(input_size=16, hidden_size=8, batch_first = True)
        self.gru3 = nn.GRU(input_size=8, hidden_size=4, batch_first = True)
        self.linear = nn.Linear(in_features=4, out_features=1)
        self.sigmoid = nn.Sigmoid()


    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.gru1(x)
        x, _ = self.gru2(x)
        x, _ = self.gru3(x)
        x = self.linear(x[:, -1, :])
        x = self.sigmoid(x)
        return x

In [60]:
model = FeedForward(num_tokens, max_tokens)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [61]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [62]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_sequences, labels, test_size=0.33, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([4747, 40]),
 torch.Size([2339, 40]),
 torch.Size([4747]),
 torch.Size([2339]))

In [65]:
torch.manual_seed(42)

epochs = 100

for epoch in range(epochs):
    model.train()

    y_logits = model(X_train).squeeze() # squeeze to remove extra `1` dimensions, this won't work unless model and data are on same device 
    y_pred = torch.round(y_logits) # turn logits -> pred probs -> pred labls
    loss = criterion(y_logits, y_train) 
    acc = accuracy_fn(y_true=y_train, y_pred=y_pred)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    ### Testing
    model.eval()
    with torch.inference_mode():
        test_logits = model(X_test).squeeze() 
        test_pred = torch.round(test_logits)
        test_loss = criterion(test_logits,y_test)
        test_acc = accuracy_fn(y_true=y_test,y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.19963, Accuracy: 97.79% | Test loss: 0.22067, Test acc: 96.24%
Epoch: 10 | Loss: 0.18643, Accuracy: 97.96% | Test loss: 0.20797, Test acc: 96.45%
Epoch: 20 | Loss: 0.17498, Accuracy: 98.08% | Test loss: 0.19802, Test acc: 96.58%
Epoch: 30 | Loss: 0.16491, Accuracy: 98.29% | Test loss: 0.18898, Test acc: 96.84%
Epoch: 40 | Loss: 0.15489, Accuracy: 98.48% | Test loss: 0.18167, Test acc: 96.84%
Epoch: 50 | Loss: 0.14647, Accuracy: 98.61% | Test loss: 0.17502, Test acc: 96.75%
Epoch: 60 | Loss: 0.13916, Accuracy: 98.69% | Test loss: 0.17060, Test acc: 96.75%
Epoch: 70 | Loss: 0.13265, Accuracy: 98.74% | Test loss: 0.16601, Test acc: 96.71%
Epoch: 80 | Loss: 0.12645, Accuracy: 98.82% | Test loss: 0.16222, Test acc: 96.79%
Epoch: 90 | Loss: 0.12123, Accuracy: 98.93% | Test loss: 0.15819, Test acc: 96.79%
