First, we allow Google Collab access to our drive so we can access the Kaggle data.

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Next, we use the provided code to actually load in our data.

In [48]:
import numpy as np

def load_npz(file_path):
    with np.load(file_path) as data:
        return {key: data[key] for key in data}

train_data = load_npz('/content/drive/MyDrive/movie-review-preference-analysis/train.npz')
test_data = load_npz('/content/drive/MyDrive/movie-review-preference-analysis/test.npz')
train_emb1, train_emb2, train_labels = train_data['emb1'], train_data['emb2'], train_data['preference']
test_emb1, test_emb2 = test_data['emb1'], test_data['emb2']

print(test_emb1.shape)
print(train_emb1.shape)

(6250, 384)
(18750, 384)


Now, we want to actually load in our model. We will first try a simple Linear model with Leaky Relu passes, which is implemented here.

In [55]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.leaky_relu(self.fc3(F.leaky_relu(self.fc2(F.leaky_relu(self.fc1(x), 1)),1)),1)
        return x

Next, we need to train our model, we use the training loop from P8.

In [63]:
import torch.optim as optim

def train_classification_model(train_loader, model, loss_fn, num_epochs, lr=1e-2):
  """Train loop for a neural network model. Please use the SGD optimizer, optim.SGD.

  Input:
      train_loader:    Data loader for the train set.
                        Enumerate through to train with each batch.
      model:           nn.Model to be trained
      num_epochs:      number of epochs to train the model for
      lr:              learning rate for the optimizer
      print_freq:      frequency to display the loss

  Output:
      model:   nn.Module trained model
  """
  optimizer = optim.SGD(model.parameters(), lr=lr)  # create an SGD optimizer for the model parameters

  for epoch in range(num_epochs):
    # Iterate through the dataloader for each epoch
    for index, (batch, labels) in enumerate(train_loader):
      # batch (torch.Tensor):    batch of r0 and r1
      # labels (torch.Tensor):  batch labels corresponding to the inputs

      # Implement the training loop using batch, labels, and cross entropy loss
      optimizer.zero_grad()
      preds = model(batch).view(len(labels))

      loss = loss_fn(preds, labels)
      loss.backward()

      optimizer.step()

    print("Loss at Epoch ", epoch, ": ", loss.item())

  return model  # return trained model

Finally, let's load our data and train our model! We normalize our data before training, as well as duplicating our data and re-ordering it with corresponding labels to add more training points.

In [87]:
import torch.utils.data as data

n,d = train_emb1.shape
print(n,d)

print(train_emb1.shape)
print(train_labels.shape)

train_emb1_tensor = torch.Tensor(np.concatenate((train_emb1, train_emb2), 0))
train_emb2_tensor = torch.Tensor(np.concatenate((train_emb2, train_emb1), 0))
train_emb_combined = torch.cat((train_emb1_tensor, train_emb2_tensor), 1)
train_emb_combined_normalized = F.normalize(train_emb_combined)

train_label_tensor = torch.Tensor(np.concatenate((train_labels, (train_labels == 0) * 1)))

print(train_emb_combined.shape)
print(train_label_tensor.shape)

loader = data.DataLoader(data.TensorDataset(train_emb_combined_normalized, train_label_tensor), shuffle=True, batch_size=7000)

loss = nn.BCEWithLogitsLoss()

lstm_model = Model(d * 2, 70, 1)
lstm_model = train_classification_model(loader, lstm_model, loss, 1000, lr=0.05)

18750 384
(18750, 384)
(18750,)
torch.Size([37500, 768])
torch.Size([37500])
Loss at Epoch  0 :  0.6939914226531982
Loss at Epoch  1 :  0.6948995590209961
Loss at Epoch  2 :  0.6927148103713989
Loss at Epoch  3 :  0.6943604350090027
Loss at Epoch  4 :  0.6939060091972351
Loss at Epoch  5 :  0.6936665177345276
Loss at Epoch  6 :  0.6928666830062866
Loss at Epoch  7 :  0.6933032870292664
Loss at Epoch  8 :  0.6931398510932922
Loss at Epoch  9 :  0.6928767561912537
Loss at Epoch  10 :  0.6933670043945312
Loss at Epoch  11 :  0.6929283738136292
Loss at Epoch  12 :  0.6923529505729675
Loss at Epoch  13 :  0.6926950216293335
Loss at Epoch  14 :  0.6926024556159973
Loss at Epoch  15 :  0.69264155626297
Loss at Epoch  16 :  0.692297637462616
Loss at Epoch  17 :  0.6921402215957642
Loss at Epoch  18 :  0.6922172904014587
Loss at Epoch  19 :  0.6921641826629639
Loss at Epoch  20 :  0.692075252532959
Loss at Epoch  21 :  0.6918979287147522
Loss at Epoch  22 :  0.6920588612556458
Loss at Epoch  23

Now, we compute the accuracy of our model. This is copied from P8.

In [88]:
def test_classification_model(test_loader, model):
    """Tests the accuracy of the model.

    Input:
        test_loader:      Data loader for the test set.
                          Enumerate through to test each example.
        model:            nn.Module model being evaluate.

    Output:
        accuracy:         Accuracy of the model on the test set.
    """
    # Compute the model accuracy
    total_batches = 0
    classified_batches = 0

    for index, (batch, labels) in enumerate(test_loader):
        preds = ((model(batch) >= 0.5) * 1).view(len(labels))
        total_batches += len(labels)

        print(preds)
        print(labels)
        print(preds == labels)

        classified_batches += torch.sum(preds == labels).item()

    return classified_batches / total_batches

print(test_classification_model(loader, lstm_model))

tensor([1, 0, 0,  ..., 1, 0, 1])
tensor([1., 0., 0.,  ..., 1., 0., 1.])
tensor([True, True, True,  ..., True, True, True])
tensor([1, 1, 1,  ..., 0, 0, 1])
tensor([1., 1., 1.,  ..., 0., 0., 1.])
tensor([True, True, True,  ..., True, True, True])
tensor([0, 1, 0,  ..., 0, 0, 0])
tensor([1., 1., 0.,  ..., 0., 0., 1.])
tensor([False,  True,  True,  ...,  True,  True, False])
tensor([1, 1, 0,  ..., 1, 1, 0])
tensor([1., 1., 0.,  ..., 1., 1., 0.])
tensor([True, True, True,  ..., True, True, True])
tensor([0, 0, 1,  ..., 1, 0, 1])
tensor([0., 0., 1.,  ..., 1., 0., 1.])
tensor([True, True, True,  ..., True, True, True])
tensor([0, 0, 1,  ..., 0, 1, 1])
tensor([0., 0., 1.,  ..., 0., 1., 1.])
tensor([True, True, True,  ..., True, True, True])
0.89488


Finally, it's time to see what our predictions are for the test data, which we save in a CSV file with the corresponding UID's and predictions.

In [89]:
import pandas as pd
from datetime import datetime

test_emb1_tensor = torch.Tensor(test_emb1)
test_emb2_tensor = torch.Tensor(test_emb2)
test_emb_combined = torch.cat((test_emb1_tensor, test_emb2_tensor), 1)
test_emb_combined_normalized = F.normalize(test_emb_combined)

uids = np.arange(18750,25000,1)
print(uids.shape)

test_predictions = ((lstm_model(test_emb_combined_normalized) >= 0.5) * 1).detach().numpy().reshape(-1)
print(test_predictions.shape)

df = pd.DataFrame({ 'uid' : uids,
    'preference' : test_predictions})
print(df)

now = datetime.now()
df.to_csv('/content/drive/My Drive/mydata' + now.strftime("%H:%M:%S") + '.csv', index=False)

(6250,)
(6250,)
        uid  preference
0     18750           0
1     18751           0
2     18752           1
3     18753           0
4     18754           1
...     ...         ...
6245  24995           1
6246  24996           0
6247  24997           1
6248  24998           0
6249  24999           0

[6250 rows x 2 columns]
