# Task 4
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the Jupyter notebook version of the template. For the Python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Add any other imports you need here
from transformers import AlbertTokenizer, AlbertModel
from pathlib import Path
import torch.nn.functional as F

# Seed every random number generator this notebook touches (PyTorch, NumPy,
# and all CUDA devices) from one shared value so that runs are repeatable.
rseed = 42
for seed_rng in (torch.manual_seed, np.random.seed, torch.cuda.manual_seed_all):
    seed_rng(rseed)

Depending on your approach, you might need to adapt the structure of this template or parts not marked by TODOs.
It is not necessary to completely follow this template. Feel free to add more code and delete any parts that are not required.

In [6]:
# Use the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

# Training hyperparameters.
BATCH_SIZE = 256  # samples per optimisation step; trade-off between speed and memory
NUM_EPOCHS = 100  # number of full passes over the training set

# Raw data tables, read from the current working directory.
train_val = pd.read_csv("train.csv")
test_val = pd.read_csv("test_no_score.csv")

cuda:0


In [7]:
# TODO: Fill out ReviewDataset
class ReviewDataset(Dataset):
    """Reviews represented by pre-computed ALBERT embeddings.

    Every item is the title embedding concatenated with the sentence embedding
    (each one is the ``pooler_output`` of ``albert-base-v2``). When the data
    frame has a ``score`` column, items are ``(embedding, score)`` pairs;
    otherwise only the embedding is returned.

    Embeddings are cached as ``.npy`` files in the working directory
    (``*_train.npy`` when scores are present, ``*_test.npy`` otherwise).
    """

    def __init__(self, data_frame):
        """Load cached embeddings for ``data_frame`` or compute and cache them.

        Args:
            data_frame: pandas DataFrame with ``title`` and ``sentence``
                columns and, for labelled data, a ``score`` column.
        """
        if "score" in data_frame.columns:
            self.labels = data_frame["score"].to_list()
            split = "train"
        else:
            self.labels = None
            split = "test"
        t_emb_path = Path(f"title_embeddings_{split}.npy")
        s_emb_path = Path(f"sentence_embeddings_{split}.npy")

        title_embeddings = sentence_embeddings = None
        if t_emb_path.exists() and s_emb_path.exists():
            # REMINDER: delete the files if you changed the embedding code!!
            # Only the row count of the cache is validated below.
            print("Files already exist. Loading them ...")
            title_embeddings = np.load(t_emb_path)
            sentence_embeddings = np.load(s_emb_path)
            if not (len(title_embeddings) == len(sentence_embeddings) == len(data_frame)):
                # A cache built from a different data frame would silently
                # misalign embeddings and labels, so discard it.
                print("Cached embeddings do not match the data. Recreating them ...")
                title_embeddings = sentence_embeddings = None
        else:
            print("Files don't exist. Creating them ...")

        if title_embeddings is None:
            title_embeddings, sentence_embeddings = self._compute_embeddings(data_frame)
            np.save(t_emb_path, title_embeddings)
            np.save(s_emb_path, sentence_embeddings)
            print("Files successfully saved!")

        # One (num_rows, title_dim + sentence_dim) tensor; indexing it yields
        # the same 1-D row that a per-row hstack would produce.
        self.embeddings = torch.from_numpy(
            np.hstack([title_embeddings, sentence_embeddings])
        )

    @staticmethod
    def _compute_embeddings(data_frame):
        """Return ``(title_embeddings, sentence_embeddings)`` as 2-D numpy arrays."""
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        encoder = AlbertModel.from_pretrained('albert-base-v2').to(DEVICE)
        encoder.eval()  # inference only

        def embed(texts):
            """Encode texts one at a time and stack their pooled outputs."""
            pooled = []
            with torch.no_grad():
                for text in texts:
                    # truncation=True caps the input at the tokenizer's
                    # maximum model length so over-long texts cannot crash
                    # the encoder.
                    tokens = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
                    pooled.append(encoder(**tokens).pooler_output.cpu().numpy())
            return np.concatenate(pooled)

        return embed(data_frame["title"]), embed(data_frame["sentence"])

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.embeddings)

    def __getitem__(self, index):
        """Return ``(embedding, score)`` for labelled data, else just the embedding."""
        if self.labels is not None:
            return self.embeddings[index], self.labels[index]
        return self.embeddings[index]

In [8]:
# Build the datasets; embeddings are loaded from cache or computed at this point.
train_dataset = ReviewDataset(train_val)
test_dataset = ReviewDataset(test_val)

# Both loaders share every option except shuffling: training batches are
# shuffled, test batches keep the file order.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=1, pin_memory=True)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **_loader_options)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **_loader_options)
# Additional code if needed

Files don't exist. Creating them ...
Files successfully saved!
Files don't exist. Creating them ...
Files successfully saved!


In [9]:
# TODO: Fill out MyModule
class MyModule(nn.Module):
    """Three-layer fully connected regressor with a bounded output.

    The network applies two leaky-ReLU hidden layers followed by a single
    output unit whose sigmoid is scaled to the open interval ``(0, max_score)``.

    Args:
        in_features: width of each input vector (default 1536, the width used
            in this notebook).
        hidden_features: pair ``(first, second)`` with the widths of the two
            hidden layers (default ``(480, 60)``).
        max_score: upper bound of the predicted value (default 10).
    """

    def __init__(self, in_features=1536, hidden_features=(480, 60), max_score=10.0):
        super().__init__()
        first_hidden, second_hidden = hidden_features
        # Attribute names and creation order are kept so state_dict keys and
        # seeded initialisation stay the same as with the fixed-size layers.
        self.fc1 = nn.Linear(in_features, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, 1)
        self.max_score = max_score

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` scores."""
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        # Sigmoid squashes to (0, 1); scaling bounds predictions by max_score.
        return torch.sigmoid(self.fc3(x)) * self.max_score


# Instantiate the network with its default sizes and move it to DEVICE.
model = MyModule().to(DEVICE)

In [10]:
# TODO: Setup loss function, optimiser, and scheduler
# Mean-squared-error regression, optimised with Adam; the learning rate is
# multiplied by 0.95 after every epoch.
criterion = torch.nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.002)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimiser, gamma=0.95)

for epoch in range(NUM_EPOCHS):
    # Training
    model.train()
    running_loss = 0.0
    num_samples = 0
    progress = tqdm(train_loader, total=len(train_loader),
                    desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for X, y in progress:
        X = X.to(DEVICE)
        y = y.float().to(DEVICE)
        optimiser.zero_grad()
        # Drop only the trailing feature axis: a bare squeeze() would also
        # collapse a final batch of size 1 to a 0-d tensor, which no longer
        # matches the (1,)-shaped target.
        outputs = model(X).squeeze(-1)
        loss = criterion(outputs, y)
        loss.backward()
        optimiser.step()

        # Track the sample-weighted mean loss and show it on the progress bar.
        running_loss += loss.item() * y.size(0)
        num_samples += y.size(0)
        progress.set_postfix(loss=running_loss / num_samples)

    scheduler.step()


100%|██████████| 49/49 [00:00<00:00, 84.81it/s] 
100%|██████████| 49/49 [00:00<00:00, 181.91it/s]
100%|██████████| 49/49 [00:00<00:00, 180.39it/s]
100%|██████████| 49/49 [00:00<00:00, 184.15it/s]
100%|██████████| 49/49 [00:00<00:00, 183.10it/s]
100%|██████████| 49/49 [00:00<00:00, 182.89it/s]
100%|██████████| 49/49 [00:00<00:00, 182.95it/s]
100%|██████████| 49/49 [00:00<00:00, 183.04it/s]
100%|██████████| 49/49 [00:00<00:00, 186.98it/s]
100%|██████████| 49/49 [00:00<00:00, 184.87it/s]
100%|██████████| 49/49 [00:00<00:00, 182.78it/s]
100%|██████████| 49/49 [00:00<00:00, 183.38it/s]
100%|██████████| 49/49 [00:00<00:00, 185.52it/s]
100%|██████████| 49/49 [00:00<00:00, 182.49it/s]
100%|██████████| 49/49 [00:00<00:00, 187.68it/s]
100%|██████████| 49/49 [00:00<00:00, 175.20it/s]
100%|██████████| 49/49 [00:00<00:00, 169.00it/s]
100%|██████████| 49/49 [00:00<00:00, 172.80it/s]
100%|██████████| 49/49 [00:00<00:00, 176.03it/s]
100%|██████████| 49/49 [00:00<00:00, 173.94it/s]
100%|██████████| 49/

In [11]:
# Predict scores for the test set and write them to result.txt, one per line.
model.eval()
results = []
with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = batch.to(DEVICE)
        # Remove only the trailing feature axis so every batch stays 1-D.
        # A bare squeeze() turns a final batch of size 1 into a 0-d array,
        # which np.concatenate refuses to join.
        results.append(model(batch).squeeze(-1).cpu().numpy())

predictions = np.concatenate(results)
with open("result.txt", "w") as f:
    for val in predictions:
        f.write(f"{val}\n")
print("Results saved!")

100%|██████████| 4/4 [00:00<00:00, 71.25it/s]

Results saved!



