In [5]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForSequenceClassification
from subset_active_learning.subset_classifier import *

In [6]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Settings handed to the project's OptimalSubsetClassifierConfig.
config = OptimalSubsetClassifierConfig(
    max_length=66,
    debug=False,
    model_name="albert-base-v2",
    batch_size=8,
    max_steps=20000,
)

In [7]:
# Load the saved results DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle("./results/sst_results_df.pkl")
# Project helper; presumably extracts the dataset indices of the best subset
# recorded in df -- confirm against subset_classifier.
optimal_subset_data_indices = get_optimal_subset_data_indices(df)
# Build the four dataset splits from those indices and the config.
# debug_ds is not used by any of the cells shown here.
train_ds, valid_ds, test_ds, debug_ds = create_train_valid_test_debug_ds(optimal_subset_data_indices, config)

100%|██████████| 3/3 [00:00<00:00, 413.83it/s]
INFO:src.subset_classifier:dataset      num positive examples    num negative examples
---------  -----------------------  -----------------------
train                           77                      723
valid                            9                       91
test                            10                       90


In [8]:
def get_preds(sampling_model, original_train_ds, batch_size=8):
    """Score every example of a dataset with ``sampling_model``.

    The dataset is iterated in order (``shuffle=False``), so row ``i`` of the
    result corresponds to example ``i`` of ``original_train_ds``.

    Args:
        sampling_model: Model invoked as ``sampling_model(**batch)`` whose
            output exposes a ``logits`` attribute.
        original_train_ds: Dataset whose collated batches are dicts; every
            value is moved to ``device`` with ``.to`` before the forward pass.
        batch_size: Number of examples per forward pass. Defaults to 8, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        Tensor of softmax probabilities taken over the last logits dimension,
        with the per-batch results concatenated in dataset order.

    Side effects:
        Switches ``sampling_model`` to eval mode and moves it to the
        module-level ``device``; neither change is undone on return.
    """
    dl = DataLoader(original_train_ds, batch_size=batch_size, shuffle=False)
    sampling_model.eval()
    sampling_model.to(device)
    # Inference only: no autograd graph is needed for scoring.
    with torch.no_grad():
        batch_logits = []
        for batch in tqdm(dl):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_logits.append(sampling_model(**batch).logits)
        logits = torch.cat(batch_logits)
        probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs

In [9]:
# Load the pretrained checkpoint named in config with a two-label classification head.
# NOTE: the load warning printed below reports classifier.weight / classifier.bias as
# newly initialized, so scores from this model come from an untrained head until it
# is fine-tuned.
sampling_model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [10]:
# Score every training example; get_preds does not shuffle, so row i of preds
# lines up with train_ds[i].
preds = get_preds(sampling_model, train_ds)

100%|██████████| 100/100 [02:08<00:00,  1.29s/it]


In [13]:
# Inspect the training split: feature names and row count.
train_ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [12]:
# Sanity check: expect one row per training example and one column per label.
preds.shape

torch.Size([800, 2])

In [19]:
# Keep only column 1 of the probabilities: one score per example.
# select(1, 1) picks index 1 along dim 1.
pos_scores = preds.select(1, 1)

In [24]:
# The 8 largest scores together with their positions in pos_scores.
top_pos_scores = pos_scores.topk(k=8)

In [25]:
# Show the selected scores and their indices into pos_scores.
top_pos_scores

torch.return_types.topk(
values=tensor([0.6708, 0.6193, 0.6001, 0.5921, 0.5507, 0.5373, 0.5274, 0.5272]),
indices=tensor([331, 360, 379, 364, 734, 696, 205, 600]))

In [None]:
def subset_sampling(n_samples=8, model=None, dataset=None):
    """Return the indices of the examples with the highest column-1 score.

    Column 1 of the softmax output is treated here as the positive class.

    Args:
        n_samples: Number of indices to return. If it exceeds the number of
            scored examples it is clamped to that number instead of letting
            ``torch.topk`` raise.
        model: Model to score with. Defaults to the module-level
            ``sampling_model``, which is what the function always used before.
        dataset: Dataset to score. Defaults to the module-level ``train_ds``,
            which is what the function always used before.

    Returns:
        List of integer row indices into ``dataset``, ordered from the highest
        score to the lowest.
    """
    # Fall back to the notebook globals so existing zero-argument calls still work.
    model = sampling_model if model is None else model
    dataset = train_ds if dataset is None else dataset
    preds = get_preds(model, dataset)
    pos_scores = preds[:, 1]
    # topk rejects k larger than the dimension size, so cap it.
    k = min(n_samples, pos_scores.shape[0])
    return torch.topk(pos_scores, k=k).indices.tolist()