In [1]:
import torch, torchtext
from torchtext.functional import to_tensor
from torchtext.transforms import SentencePieceTokenizer
xlmr_base = torchtext.models.XLMR_BASE_ENCODER
model = xlmr_base.get_model()
transform = xlmr_base.transform()
input_batch = ["Hello world", "How are you!"]
model_input = to_tensor(transform(input_batch), padding_value=1)
output = model(model_input)
output.shape

torch.Size([2, 6, 768])

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm

from models import TextClassificationModel, zip_ssl
from datasets import AGNEWS
from utils import train, eval, pearson, acc, nomean, pearson_delta

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
EPOCHS = 30
LR = 1e-3
BATCH_SIZE = 2048
EMBED_DIM = 64

In [5]:
ds = AGNEWS()

In [6]:
criterion = nn.MSELoss()
num_classes = 1
LR=1e-5

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaModel
# from utils import train

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model.to(device)

# Define the optimizer for the whole model (both RoBERTa and the classification head)
optimizer = optim.AdamW(list(roberta_model.parameters()), lr=LR)

# Function to get RoBERTa embeddings for a list of sentences
def model(sentences):
    # Tokenize the sentences
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the device

    # Get the embeddings (without using torch.no_grad())
    outputs = roberta_model(**inputs, output_hidden_states=True)
    del inputs
    torch.cuda.empty_cache()
    embeddings = outputs.hidden_states[-1]  # Last layer embeddings

    # Average the embeddings along the max_sequence_length dimension
    embeddings_avg = embeddings.mean(dim=1)

    return embeddings_avg

# Assuming you have defined the AGNEWS class and its methods correctly
ds = AGNEWS()
BATCH_SIZE = 16
train_loader, val_loader, test_loader = ds.loader(BATCH_SIZE, ssl=True)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def pearson(delta, ncd):
    vx = delta - torch.mean(delta)
    vy = ncd - torch.mean(ncd)

    return F.cosine_similarity(vx, vy, dim = 0)

In [57]:
from torch.functional import F
from tqdm import tqdm
def train(model, loader, criterion, optimizer, ssl = False):
    cost = 0
    corr = 0
    # for idx, (label, text, offsets, classes, ncd) in tqdm(enumerate(loader)):
    idx_batch = np.arange(loader.batch_size)
    for idx, (label, text, offsets, classes, ncd) in (enumerate(loader)):
        if ssl:
            # np.random.shuffle(idx_batch)
            # btsz = len(offsets) // 2
            h = model(label)
            delta = F.pairwise_distance(h[:btsz], h[btsz:]) / 10
            # delta = F.pairwise_distance(h[idx_batch[:btsz]], h[idx_batch[btsz:]]) / 10
            # print(idx, delta.mean())
            loss = criterion(delta, ncd)
        else:
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)

        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        cost += loss.item()
        corr += pearson(delta, ncd).cpu()
        print(delta.mean(), loss.item(), pearson(delta, ncd).cpu())
    return cost / loader.dataset.__len__(), corr / loader.dataset.__len__()

In [9]:
# maiorpossivel = 0
# for idx, (label, text, offsets, classes, ncd) in tqdm(enumerate(train_loader)):
#     btsz = len(offsets) // 2
#     h = model(label)
#     delta = F.pairwise_distance(h[:btsz], h[btsz:])
#     if maiorpossivel < delta.max().cpu():
#         maiorpossivel = delta.max().cpu()

In [58]:
loss_train, corr_train = train(model, train_loader, criterion, optimizer, ssl = True)

tensor(0.3402, device='cuda:0', grad_fn=<MeanBackward0>) 0.0003534742572810501 tensor(0.8724, grad_fn=<ToCopyBackward0>)
tensor(0.3202, device='cuda:0', grad_fn=<MeanBackward0>) 0.0005003104452043772 tensor(0.8751, grad_fn=<ToCopyBackward0>)
tensor(0.3263, device='cuda:0', grad_fn=<MeanBackward0>) 0.0002805616823025048 tensor(0.9579, grad_fn=<ToCopyBackward0>)
tensor(0.3068, device='cuda:0', grad_fn=<MeanBackward0>) 0.00202717212960124 tensor(0.8139, grad_fn=<ToCopyBackward0>)
tensor(0.3167, device='cuda:0', grad_fn=<MeanBackward0>) 0.0002652944822330028 tensor(0.9770, grad_fn=<ToCopyBackward0>)
tensor(0.3144, device='cuda:0', grad_fn=<MeanBackward0>) 0.005711662117391825 tensor(0.8773, grad_fn=<ToCopyBackward0>)
tensor(0.2996, device='cuda:0', grad_fn=<MeanBackward0>) 0.0002805099938996136 tensor(0.9108, grad_fn=<ToCopyBackward0>)
tensor(0.3239, device='cuda:0', grad_fn=<MeanBackward0>) 0.0008652654942125082 tensor(0.7735, grad_fn=<ToCopyBackward0>)
tensor(0.2942, device='cuda:0', gra

KeyboardInterrupt: 

In [None]:
loss_train, corr_train

In [40]:
D = torch.empty(size = [0])
NCD = torch.empty(size = [0])

with torch.no_grad():
    for idx, (label, text, offsets, classes, ncd) in tqdm(enumerate(test_loader)):
        btsz = len(offsets) // 2
        h = model(label)
        delta = F.pairwise_distance(h[:btsz], h[btsz:]) / 10
        D = torch.cat((D, delta.cpu()))
        NCD = torch.cat((NCD, ncd.cpu()))

152it [00:05, 27.99it/s]


KeyboardInterrupt: 

In [42]:
delta, ncd

(tensor([0.3380, 0.3194, 0.3309, 0.3197, 0.3222, 0.3132, 0.3106, 0.3014],
        device='cuda:0'),
 tensor([0.3371, 0.3243, 0.3650, 0.2964, 0.3482, 0.3010, 0.3021, 0.2563],
        device='cuda:0'))

In [48]:
pearson(delta, ncd)

tensor(0.8399, device='cuda:0')

In [None]:
plt.scatter(D, NCD);

In [None]:
# for i in roberta_model.parameters():
#     print (i)

In [None]:
# for i in roberta_model.parameters():
#     print (i)

# Test

In [None]:
import torch
import torchtext
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

# Pass the inputs through the RoBERTa model
outputs = model(**inputs)

sentence_embeddings = outputs.last_hidden_state[:, 0, :]

# You can also take the average of all the token embeddings to get sentence embeddings
# sentence_embeddings = outputs.last_hidden_state.mean(dim=1)

sentences = ["This is a sample sentence.", "Another sentence goes here.", "And one more sentence."]

# Get the sentence embeddings for the batch of sentences
sentence_embeddings = get_sentence_embeddings(sentences)

print(sentence_embeddings.shape)  # Should print: torch.Size([3, 768]) for 3 sentences in the batch with RoBERTa-base


In [None]:
# Fine-tune the RoBERTa model and get the embeddings for each sentence in each batch
#for batch in train_loader:  # You can change this to val_loader or test_loader if needed
#    raw_list, _, _, _ = batch  # Use only the raw texts (list of sentences)
#
#    # Get RoBERTa embeddings for the sentences in the batch
#    embeddings = get_roberta_embeddings(raw_list)
#    
#
#
#
#    # Perform your downstream task using the embeddings
#    # For example, if you have a classification head, you can do the following:
#    #logits = classification_head(embeddings)
#
#    # Detach logits from the computation graph to avoid backpropagating through them
#    #logits_detach = logits.detach()
#    target_labels = torch.randn(embeddings.shape).to(device)  # Replace with your actual target labels
#    
#    # Compute the loss (you need to define your own loss function based on your task)
#    # For example, if it's a classification task, you can use CrossEntropyLoss:
#    loss = criterion(embeddings, target_labels)  # Replace target_labels with your actual target labels
#
#    # Perform backward pass and update the weights
#    optimizer.zero_grad()
#    loss.backward()
#    optimizer.step()
#    break
#
#    # Now 'embeddings' contains the embeddings of each sentence in the batch.
#    # The shape of 'embeddings' will be (num_sentences_in_batch, embedding_size).
#    # You can further process these embeddings or use them for your fine-tuning task.
