# Project 4

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
# from transformers import GPT2Config, GPT2Tokenizer, GPT2Model
from transformers import AlbertConfig, AlbertTokenizer, AlbertModel

In [2]:
DEVICE = torch.device("mps")
BATCH_SIZE = 20  # TODO: Set the batch size according to both training performance and available memory
NUM_EPOCHS = 50  # TODO: Set the number of epochs
train_val = pd.read_csv("train.csv")
test_val = pd.read_csv("test_no_score.csv")

In [3]:
print('Loading configuration...')
model_config = AlbertConfig.from_pretrained(pretrained_model_name_or_path="albert-xxlarge-v2")
print('Loading tokenizer...')
tokenizer = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path="albert-xxlarge-v2")
# default to right padding
tokenizer.padding_side = "right"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token
# Get the actual model.
print('Loading model...')
model = AlbertModel.from_pretrained(pretrained_model_name_or_path="albert-xxlarge-v2", config=model_config)
# resize model embedding to match new tokenizer
# model.resize_token_embeddings(len(tokenizer))
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id
# Load model to defined device.
model.to(DEVICE)
print('Model loaded to `%s`'%DEVICE)

Loading configuration...


config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Loading model...


model.safetensors:   0%|          | 0.00/893M [00:00<?, ?B/s]

Model loaded to `mps`


In [4]:
# TODO: Fill out ReviewDataset
class ReviewDataset(Dataset):
    def __init__(self, data_frame):
        self.text = data_frame["title"].str.cat(data_frame["sentence"], sep = " ")

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        return {"text": self.text[index]}

# class ReviewDataset(Dataset):
#     def __init__(self, data_frame):
#         self.title = data_frame["title"]
#         self.sentence = data_frame["sentence"]

#     def __len__(self):
#         return len(self.title)

#     def __getitem__(self, index):
#         return {"title": self.title[index], "sentence": self.sentence[index]}

In [5]:
print('Dealing with Train...')
# Create pytorch dataset.
train_dataset = ReviewDataset(train_val)
print('Created `train_dataset` with %d samples'%len(train_dataset))

# Move pytorch dataset into dataloader.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=0, pin_memory=True)
print('Created `train_loader` with %d batches'%len(train_loader))

print()

print('Dealing with Test...')
# Create pytorch dataset.
test_dataset = ReviewDataset(test_val)
print('Created `test_dataset` with %d samples'%len(test_dataset))

# Move pytorch dataset into dataloader.
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False, num_workers=0, pin_memory=True)
print('Created `test_loader` with %d batches'%len(test_loader))
# Additional code if needed
embedding_size = 4096

Dealing with Train...
Created `train_dataset` with 12500 samples
Created `train_loader` with 625 batches

Dealing with Test...
Created `test_dataset` with 1000 samples
Created `test_loader` with 50 batches


In [None]:
# embeddings_test = np.zeros((len(test_dataset), 2*embedding_size))
# i = 0
# with torch.no_grad():
#     model.eval()
#     for batch in tqdm(test_loader, total=len(test_loader)):
#         data = list(batch.values())
#         encoded_titles = tokenizer(data[0], return_tensors='pt', padding=True, truncation=False)
#         encoded_sentences = tokenizer(data[1], return_tensors='pt', padding=True, truncation=False)
#         del data
#         encoded_titles.to(DEVICE)
#         encoded_sentences.to(DEVICE)
#         outputs_titles = model(**encoded_titles)
#         outputs_sentences = model(**encoded_sentences)
#         word_embeddings_titles = outputs_titles.last_hidden_state * encoded_titles.attention_mask.unsqueeze(-1).float()
#         word_embeddings_sentences = outputs_sentences.last_hidden_state * encoded_sentences.attention_mask.unsqueeze(-1).float()
#         del outputs_titles
#         del outputs_sentences
#         sentence_embeddings_titles = word_embeddings_titles.sum(dim=1)
#         sentence_embeddings_sentences = word_embeddings_sentences.sum(dim=1)
#         del word_embeddings_titles
#         del word_embeddings_sentences
#         sentence_embeddings_titles /= encoded_titles.attention_mask.sum(dim=1, keepdim=True).float()
#         sentence_embeddings_sentences /= encoded_sentences.attention_mask.sum(dim=1, keepdim=True).float()
#         del encoded_titles
#         del encoded_sentences
#         embeddings_test[BATCH_SIZE*i : BATCH_SIZE*(i+1)] = np.hstack([sentence_embeddings_titles.cpu().numpy(), sentence_embeddings_sentences.cpu().numpy()])
#         del sentence_embeddings_titles
#         del sentence_embeddings_sentences
#         i +=1
# np.save('embeddings_test.npy', embeddings_test)

In [None]:
# embeddings_train = np.zeros((len(train_dataset), 2*embedding_size))
# i = 0
# with torch.no_grad():
#     model.eval()
#     for batch in tqdm(train_loader, total=len(train_loader)):
#         data = list(batch.values())
#         encoded_titles = tokenizer(data[0], return_tensors='pt', padding=True, truncation=False)
#         encoded_sentences = tokenizer(data[1], return_tensors='pt', padding=True, truncation=False)
#         del data
#         encoded_titles.to(DEVICE)
#         encoded_sentences.to(DEVICE)
#         outputs_titles = model(**encoded_titles)
#         outputs_sentences = model(**encoded_sentences)
#         word_embeddings_titles = outputs_titles.last_hidden_state * encoded_titles.attention_mask.unsqueeze(-1).float()
#         word_embeddings_sentences = outputs_sentences.last_hidden_state * encoded_sentences.attention_mask.unsqueeze(-1).float()
#         del outputs_titles
#         del outputs_sentences
#         sentence_embeddings_titles = word_embeddings_titles.sum(dim=1)
#         sentence_embeddings_sentences = word_embeddings_sentences.sum(dim=1)
#         del word_embeddings_titles
#         del word_embeddings_sentences
#         sentence_embeddings_titles /= encoded_titles.attention_mask.sum(dim=1, keepdim=True).float()
#         sentence_embeddings_sentences /= encoded_sentences.attention_mask.sum(dim=1, keepdim=True).float()
#         del encoded_titles
#         del encoded_sentences
#         embeddings_train[BATCH_SIZE*i : BATCH_SIZE*(i+1)] = np.hstack([sentence_embeddings_titles.cpu().numpy(), sentence_embeddings_sentences.cpu().numpy()])
#         del sentence_embeddings_titles
#         del sentence_embeddings_sentences
#         i +=1
# np.save('embeddings_train.npy', embeddings_train)

In [6]:
embeddings_test = np.zeros((len(test_dataset), embedding_size))
i = 0
with torch.no_grad():
    model.eval()
    for batch in tqdm(test_loader, total=len(test_loader)):
        data = list(batch.values())
        encoded = tokenizer(*data, return_tensors='pt', padding=True, truncation=False)
        del data
        encoded.to(DEVICE)
        outputs = model(**encoded)
        word_embeddings = outputs.last_hidden_state * encoded.attention_mask.unsqueeze(-1).float()
        del outputs
        sentence_embeddings = word_embeddings.sum(dim=1)
        del word_embeddings
        sentence_embeddings /= encoded.attention_mask.sum(dim=1, keepdim=True).float()
        del encoded
        embeddings_test[BATCH_SIZE*i : BATCH_SIZE*(i+1)] = sentence_embeddings.cpu().numpy()
        del sentence_embeddings
        i +=1
np.save('embeddings_test.npy', embeddings_test)

100%|██████████| 50/50 [06:15<00:00,  7.51s/it]


In [7]:
embeddings_train = np.zeros((len(train_dataset), embedding_size))
i = 0
with torch.no_grad():
    model.eval()
    for batch in tqdm(train_loader, total=len(train_loader)):
        data = list(batch.values())
        encoded = tokenizer(*data, return_tensors='pt', padding=True, truncation=False)
        del data
        encoded.to(DEVICE)
        outputs = model(**encoded)
        word_embeddings = outputs.last_hidden_state * encoded.attention_mask.unsqueeze(-1).float()
        del outputs
        sentence_embeddings = word_embeddings.sum(dim=1)
        del word_embeddings
        sentence_embeddings /= encoded.attention_mask.sum(dim=1, keepdim=True).float()
        del encoded
        embeddings_train[BATCH_SIZE*i : BATCH_SIZE*(i+1)] = sentence_embeddings.cpu().numpy()
        del sentence_embeddings
        i +=1
np.save('embeddings_train.npy', embeddings_train)

100%|██████████| 625/625 [1:00:39<00:00,  5.82s/it]


In [8]:
def create_loader_from_np(X, y=None, train=True, batch_size=BATCH_SIZE, shuffle=True, num_workers=10):
    if train:
        dataset = TensorDataset(torch.from_numpy(X).type(torch.float), torch.from_numpy(y).type(torch.float))
    else:
        dataset = TensorDataset(torch.from_numpy(X).type(torch.float))
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=True, num_workers=num_workers)
    return loader

In [9]:
embeddings_train = np.load('embeddings_train.npy')
# embeddings_train = (embeddings_train - np.mean(embeddings_train, axis=1)[:, np.newaxis]) / np.std(embeddings_train, axis=1)[:, np.newaxis]

X = embeddings_train
y = train_val["score"].to_numpy()
train_loader = create_loader_from_np(X[0:round(0.8*X.shape[0])], y[0:round(0.8*len(y))], train=True, batch_size=BATCH_SIZE)
valid_loader = create_loader_from_np(X[round(0.2*X.shape[0]):], y[round(0.2*len(y)):], train=True, batch_size=BATCH_SIZE)
train_loader_final = create_loader_from_np(X, y, train=True, batch_size=BATCH_SIZE)
del X
del y

In [10]:
# TODO: Fill out MyModule
layer1_size = 500
layer2_size = 500
dropout_prop = 0

class Net(nn.Module):
    """
    The model class, which defines our classifier.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        self.fc1 = nn.Linear(embedding_size, layer1_size)
        self.dropout1 = nn.Dropout(dropout_prop)
        self.fc2 = nn.Linear(layer1_size, layer2_size)
        self.dropout2 = nn.Dropout(dropout_prop)
        # self.fc3 = nn.Linear(layer2_size, layer3_size)
        # self.dropout3 = nn.Dropout(dropout_prop)
        self.fc3 = nn.Linear(layer2_size, 1)

    def forward(self, x):
        """
        The forward pass of the model.
        input: x: torch.Tensor, the input to the model
        output: x: torch.Tensor, the output of the model
        """
        x = self.fc1(x)
        x = F.elu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = F.elu(x)
        x = self.dropout2(x)
        # x = self.fc3(x)
        # x = F.elu(x)
        # x = self.dropout3(x)
        x = self.fc3(x)
        return x

In [11]:
model = Net()
model.train()
model.to(DEVICE)

n_epochs = 100
patience = 100
min_delta = 0.001
best_val_loss = float('inf')
epochs_no_improve = 0
loss_function = nn.MSELoss(reduction="sum")
learn_rate = 0.0003
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

for epoch in range(n_epochs):

    train_loss = 0.0
    for batch_id, (X, y) in enumerate(train_loader):
        model.train()
        X = X.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        output = model(X)
        loss = loss_function(torch.flatten(output), y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_id % 100 == 0:
            print('Epoch {}, Batch {}'.format(epoch+1, batch_id), end=" -- ")
    epoch_train_loss = train_loss / len(train_loader.dataset)
    print('Epoch {}, training loss {}'.format(epoch+1, epoch_train_loss))

    valid_loss = 0.0
    with torch.no_grad():
        for X, y in valid_loader: 
            model.eval()
            X = X.to(DEVICE)   
            y = y.to(DEVICE)
            output_valid = model(X)
            loss_valid = loss_function(torch.flatten(output_valid), y)
            valid_loss += loss_valid.item()
    epoch_valid_loss = valid_loss / len(valid_loader.dataset)
    print('Epoch {}, validation loss {}'.format(epoch+1, epoch_valid_loss))

    if epoch_valid_loss < best_val_loss - min_delta:
        best_val_loss = epoch_valid_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f'Early stopping after {epoch+1} epochs.')
        break

Epoch 1, Batch 0 -- Epoch 1, Batch 100 -- Epoch 1, Batch 200 -- Epoch 1, Batch 300 -- Epoch 1, Batch 400 -- Epoch 1, training loss 5.774501124382019
Epoch 1, validation loss 3.2438979597091673
Epoch 2, Batch 0 -- Epoch 2, Batch 100 -- Epoch 2, Batch 200 -- Epoch 2, Batch 300 -- Epoch 2, Batch 400 -- Epoch 2, training loss 3.2635896881103514
Epoch 2, validation loss 3.0698712236404417
Epoch 3, Batch 0 -- Epoch 3, Batch 100 -- Epoch 3, Batch 200 -- Epoch 3, Batch 300 -- Epoch 3, Batch 400 -- Epoch 3, training loss 2.7518372490882874
Epoch 3, validation loss 2.3403093088150024
Epoch 4, Batch 0 -- Epoch 4, Batch 100 -- Epoch 4, Batch 200 -- Epoch 4, Batch 300 -- Epoch 4, Batch 400 -- Epoch 4, training loss 2.567378203868866
Epoch 4, validation loss 2.2591030093193054
Epoch 5, Batch 0 -- Epoch 5, Batch 100 -- Epoch 5, Batch 200 -- Epoch 5, Batch 300 -- Epoch 5, Batch 400 -- Epoch 5, training loss 2.2264364428520205
Epoch 5, validation loss 2.2433243871688844
Epoch 6, Batch 0 -- Epoch 6, Bat

KeyboardInterrupt: 

In [12]:
model = Net()
model.train()
model.to(DEVICE)
n_epochs = 50
optimizer = optim.Adam(model.parameters(), lr=learn_rate)
for epoch in range(n_epochs):
    train_loss = 0.0
    for batch_id, (X, y) in enumerate(train_loader_final):
        X = X.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        output = model(X)
        loss = loss_function(torch.flatten(output), y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_id % 100 == 0:
            print('Epoch {}, Batch {}'.format(epoch+1, batch_id), end=" -- ")
    epoch_train_loss = train_loss / len(train_loader_final.dataset)
    print('Epoch {}, training loss {}'.format(epoch+1, epoch_train_loss))  

Epoch 1, Batch 0 -- Epoch 1, Batch 100 -- Epoch 1, Batch 200 -- Epoch 1, Batch 300 -- Epoch 1, Batch 400 -- Epoch 1, Batch 500 -- Epoch 1, Batch 600 -- Epoch 1, training loss 5.362310281066894
Epoch 2, Batch 0 -- Epoch 2, Batch 100 -- Epoch 2, Batch 200 -- Epoch 2, Batch 300 -- Epoch 2, Batch 400 -- Epoch 2, Batch 500 -- Epoch 2, Batch 600 -- Epoch 2, training loss 3.082162197189331
Epoch 3, Batch 0 -- Epoch 3, Batch 100 -- Epoch 3, Batch 200 -- Epoch 3, Batch 300 -- Epoch 3, Batch 400 -- Epoch 3, Batch 500 -- Epoch 3, Batch 600 -- Epoch 3, training loss 2.6571406590270996
Epoch 4, Batch 0 -- Epoch 4, Batch 100 -- Epoch 4, Batch 200 -- Epoch 4, Batch 300 -- Epoch 4, Batch 400 -- Epoch 4, Batch 500 -- Epoch 4, Batch 600 -- Epoch 4, training loss 2.4696303201293945
Epoch 5, Batch 0 -- Epoch 5, Batch 100 -- Epoch 5, Batch 200 -- Epoch 5, Batch 300 -- Epoch 5, Batch 400 -- Epoch 5, Batch 500 -- Epoch 5, Batch 600 -- Epoch 5, training loss 2.0968327576828
Epoch 6, Batch 0 -- Epoch 6, Batch 

In [13]:
embeddings_test = np.load('embeddings_test.npy')
# embeddings_test = (embeddings_test - np.mean(embeddings_test, axis=1)[:, np.newaxis]) / np.std(embeddings_test, axis=1)[:, np.newaxis]

X_test = embeddings_test
test_loader = create_loader_from_np(X_test, train=False, batch_size=BATCH_SIZE, shuffle=False)
del X_test

In [14]:
model.eval()
results = []
with torch.no_grad():
    for [x_batch] in test_loader:
        x_batch = x_batch.to(DEVICE)
        output = torch.flatten(model(x_batch))
        output = output.cpu().numpy()[:, np.newaxis]
        results.append(np.clip(output, a_min=0, a_max=10))
    results = np.vstack(results)

    with open("result.txt", "w") as f:
        for val in np.concatenate(results):
            f.write(f"{val}\n")
print("Results saved to results.txt")

Results saved to results.txt
