# Project 4

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AlbertConfig, AlbertTokenizer, AlbertModel

In [2]:
# Prefer Apple's Metal backend (the device this notebook was developed on), but
# fall back to CUDA or the CPU so the notebook still runs on other machines.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
BATCH_SIZE = 20  # batch size passed to every DataLoader in this notebook
NUM_EPOCHS = 50  # number of training epochs for the prediction network
# Read the training table and the table to be scored, in that order.
train_val, test_val = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test_no_score.csv")
)

#### Loading ALBERT transformer

In [3]:
# One checkpoint name shared by the config, the tokenizer and the encoder weights.
albert_checkpoint = "albert-xxlarge-v2"

model_config = AlbertConfig.from_pretrained(albert_checkpoint)
tokenizer = AlbertTokenizer.from_pretrained(albert_checkpoint)
model = AlbertModel.from_pretrained(albert_checkpoint, config=model_config)

# Pad on the right and reuse the end-of-sequence token as the padding token,
# mirroring that choice in the model config.
# NOTE(review): ALBERT tokenizers normally ship a dedicated pad token - confirm
# this override is intended.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

model.to(DEVICE)
print('Model loaded to `%s`'%DEVICE)

Model loaded to `mps`


#### Preparing text data

In [4]:
class ReviewDataset(Dataset):
    """
    Dataset that yields one review text per sample, built by joining the
    "title" and "sentence" columns with a single space.
    """
    def __init__(self, data_frame):
        """
        The constructor of the dataset.
        input: data_frame: pandas.DataFrame, must contain the string columns
               "title" and "sentence"
        """
        combined = data_frame["title"].str.cat(data_frame["sentence"], sep=" ")
        # Drop the frame's own index so samples are addressed purely by position;
        # a filtered or shuffled frame would otherwise break 0..len-1 lookups.
        self.text = combined.reset_index(drop=True)

    def __len__(self):
        """
        output: int, the number of samples
        """
        return len(self.text)

    def __getitem__(self, index):
        """
        input: index: int, the position of the sample
        output: dict, {"text": str} for the requested sample
        """
        # Positional access: raises IndexError past the end, as sequences should.
        return {"text": self.text.iloc[index]}

In [5]:
# Both text loaders share the same settings: file order is kept (no shuffling)
# and batches are produced in the main process.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

train_dataset = ReviewDataset(train_val)
print("Created train_dataset with %d samples" %len(train_dataset))
train_loader = DataLoader(dataset=train_dataset, **loader_options)
print("Created train_loader with %d batches" %len(train_loader))

print()

test_dataset = ReviewDataset(test_val)
print("Created test_dataset with %d samples" %len(test_dataset))
test_loader = DataLoader(dataset=test_dataset, **loader_options)
print("Created test_loader with %d batches" %len(test_loader))

Created train_dataset with 12500 samples
Created train_loader with 625 batches

Created test_dataset with 1000 samples
Created test_loader with 50 batches


#### Extracting text embeddings

In [6]:
embedding_size = 4096
embeddings_test = np.zeros((len(test_dataset), embedding_size))
model.eval()
with torch.no_grad():
    for batch_index, batch in enumerate(tqdm(test_loader, total=len(test_loader))):
        # Each batch is {"text": [str, ...]}; tokenize the whole list at once.
        # Inputs longer than the encoder's position limit are truncated rather
        # than being passed through at a length the model cannot embed.
        encoded = tokenizer(
            batch["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=model_config.max_position_embeddings,
        )
        encoded = encoded.to(DEVICE)
        mask = encoded.attention_mask.unsqueeze(-1).float()
        hidden = model(**encoded).last_hidden_state
        # Masked mean pooling: sum the non-padding token vectors and divide by
        # the number of non-padding tokens in each sequence.
        sentence_embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
        # Size the slice from the batch itself so a short final batch is handled.
        start = BATCH_SIZE * batch_index
        embeddings_test[start : start + sentence_embeddings.shape[0]] = sentence_embeddings.cpu().numpy()
        # Release device tensors before the next batch is processed.
        del encoded, mask, hidden, sentence_embeddings
np.save("embeddings_test.npy", embeddings_test)

100%|██████████| 50/50 [06:15<00:00,  7.51s/it]


In [7]:
embeddings_train = np.zeros((len(train_dataset), embedding_size))
model.eval()
with torch.no_grad():
    for batch_index, batch in enumerate(tqdm(train_loader, total=len(train_loader))):
        # Each batch is {"text": [str, ...]}; tokenize the whole list at once.
        # Inputs longer than the encoder's position limit are truncated rather
        # than being passed through at a length the model cannot embed.
        encoded = tokenizer(
            batch["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=model_config.max_position_embeddings,
        )
        encoded = encoded.to(DEVICE)
        mask = encoded.attention_mask.unsqueeze(-1).float()
        hidden = model(**encoded).last_hidden_state
        # Masked mean pooling: sum the non-padding token vectors and divide by
        # the number of non-padding tokens in each sequence.
        sentence_embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
        # Size the slice from the batch itself so a short final batch is handled.
        start = BATCH_SIZE * batch_index
        embeddings_train[start : start + sentence_embeddings.shape[0]] = sentence_embeddings.cpu().numpy()
        # Release device tensors before the next batch is processed.
        del encoded, mask, hidden, sentence_embeddings
np.save("embeddings_train.npy", embeddings_train)

100%|██████████| 625/625 [1:00:39<00:00,  5.82s/it]


#### Preparing training data

In [8]:
def create_loader_from_np(X, y=None, train=True, batch_size=BATCH_SIZE, shuffle=True, num_workers=10):
    """
    Wraps numpy arrays in a DataLoader of float tensors.
    input: X: np.ndarray, the features
           y: np.ndarray, the targets; only read when train is True
           train: bool, whether batches are (X, y) pairs or contain X alone
           batch_size, shuffle, num_workers: forwarded to the DataLoader
    output: loader: torch.utils.data.DataLoader over a TensorDataset,
            created with pin_memory=True
    """
    arrays = (X, y) if train else (X,)
    tensors = [torch.from_numpy(array).type(torch.float) for array in arrays]
    return DataLoader(
        dataset=TensorDataset(*tensors),
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=True,
        num_workers=num_workers,
    )

In [9]:
embeddings_train = np.load("embeddings_train.npy")
X = embeddings_train
y = train_val["score"].to_numpy()
assert X.shape[0] == len(y), "embeddings and scores must have the same number of rows"
# A single cut point keeps the two subsets disjoint: the first 80% of the rows
# are used for training and the remaining 20% for validation.
split_at = round(0.8 * X.shape[0])
train_loader = create_loader_from_np(X[:split_at], y[:split_at], train=True, batch_size=BATCH_SIZE)
valid_loader = create_loader_from_np(X[split_at:], y[split_at:], train=True, batch_size=BATCH_SIZE)
# The final model is trained on all rows.
train_loader_final = create_loader_from_np(X, y, train=True, batch_size=BATCH_SIZE)
del X
del y

#### Defining neural network for prediction

In [10]:
layer1_size, layer2_size = 500, 500

class Net(nn.Module):
    """
    Fully connected network that maps an embedding vector to a single output value.
    """
    def __init__(self):
        """
        Creates the three linear layers:
        embedding_size -> layer1_size -> layer2_size -> 1.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=embedding_size, out_features=layer1_size)
        self.fc2 = nn.Linear(in_features=layer1_size, out_features=layer2_size)
        self.fc3 = nn.Linear(in_features=layer2_size, out_features=1)

    def forward(self, x):
        """
        Applies the linear layers in order, with an ELU after each of the first two.
        input: x: torch.Tensor, the input to the model
        output: torch.Tensor, the output of the last linear layer
        """
        hidden1 = F.elu(self.fc1(x))
        hidden2 = F.elu(self.fc2(hidden1))
        return self.fc3(hidden2)

#### Training and evaluating neural network

In [None]:
model = Net()
model.to(DEVICE)

n_epochs = NUM_EPOCHS
learn_rate = 0.0003
# Squared errors are summed per batch and divided by the dataset size after
# each pass, which yields the mean squared error per sample.
loss_function = nn.MSELoss(reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

for epoch in range(n_epochs):

    # Optimisation pass over the training subset.
    model.train()
    train_loss = 0.0
    for batch_id, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        loss = loss_function(model(inputs).flatten(), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_id % 100 == 0:
            print("Epoch {}, Batch {}".format(epoch+1, batch_id), end=" -- ")
    epoch_train_loss = train_loss / len(train_loader.dataset)
    print("Epoch {}, training loss {}".format(epoch+1, epoch_train_loss))

    # Evaluation pass over the validation subset; no gradients are tracked.
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for inputs, targets in valid_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            valid_loss += loss_function(model(inputs).flatten(), targets).item()
    epoch_valid_loss = valid_loss / len(valid_loader.dataset)
    print("Epoch {}, validation loss {}".format(epoch+1, epoch_valid_loss))

#### Training final neural network

In [None]:
# Fresh network trained on every labelled row; reuses n_epochs, learn_rate and
# loss_function from the training/evaluation cell.
model = Net()
model.to(DEVICE)
model.train()
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

for epoch in range(n_epochs):
    train_loss = 0.0
    for batch_id, (inputs, targets) in enumerate(train_loader_final):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        loss = loss_function(model(inputs).flatten(), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_id % 100 == 0:
            print("Epoch {}, Batch {}".format(epoch+1, batch_id), end=" -- ")
    # Summed squared error divided by the sample count gives the per-sample loss.
    epoch_train_loss = train_loss / len(train_loader_final.dataset)
    print("Epoch {}, training loss {}".format(epoch+1, epoch_train_loss))

#### Making predictions on test data

In [13]:
# Reload the cached test embeddings and serve them unshuffled, so predictions
# come out in the same order as the rows of the saved array.
embeddings_test = np.load("embeddings_test.npy")
test_loader = create_loader_from_np(
    X=embeddings_test,
    train=False,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [14]:
# One path drives both the write and the confirmation message, so the message
# always names the file that was actually written.
output_path = "result.txt"

model.eval()
batch_predictions = []
with torch.no_grad():
    for [x_batch] in test_loader:
        scores = torch.flatten(model(x_batch.to(DEVICE))).cpu().numpy()
        # Clamp predictions to the range [0, 10] and store them as a column.
        batch_predictions.append(np.clip(scores, a_min=0, a_max=10)[:, np.newaxis])
results = np.vstack(batch_predictions)

# One prediction per line, in loader order.
with open(output_path, "w") as f:
    for val in results.ravel():
        f.write(f"{val}\n")
print(f"Results saved to {output_path}")

Results saved to results.txt
