In [1]:
%load_ext autoreload  
%autoreload 2 

In [2]:
from src.subset_classifier import *
import pandas as pd
from datasets import load_metric
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification
import uuid
import numpy as np
import torch
import wandb

wandb.init(project="subset_selection_active_learning", entity="johnny-gary")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# DB_PATH = "./subset_selection/sst_results.db"
# df = get_df_from_db(DB_PATH)

In [4]:
# read df from pickle file
df = pd.read_pickle("./results/sst_results_df.pkl")

In [5]:
subset_unique_counts = get_subset_unique_counts(df); print(subset_unique_counts)

Counter({96: 3164, 93: 1815, 94: 1793, 95: 876, 92: 661, 97: 262})


In [6]:
optimal_subset_data_indices = get_optimal_subset_data_indices(df)

In [7]:
config = OptimalSubsetClassifierConfig(max_length=66, debug=False, model_name="albert-base-v2", batch_size=8, max_steps=20000)

In [8]:
train_ds, valid_ds, test_ds, debug_ds = create_train_valid_test_debug_ds(optimal_subset_data_indices, config)

print(len(train_ds), len(valid_ds), len(test_ds), len(debug_ds))

100%|██████████| 3/3 [00:00<00:00, 198.90it/s]
INFO:src.subset_classifier:dataset      num positive examples    num negative examples
---------  -----------------------  -----------------------
train                           77                      723
valid                            9                       91
test                            10                       90


800 100 100 12


In [10]:
metric = load_metric("roc_auc")
def compute_metrics(self, eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return self.metric.compute(predictions=predictions, references=labels)

In [11]:
config.debug

False

In [12]:
# config.debug = True; print(config)
training_args = TrainingArguments(
            output_dir="./subset_classifier_results",
            max_steps=config.max_steps if not config.debug else 500,
            evaluation_strategy="steps",
            report_to="wandb", 
            run_name=f"subset_classifier_result_{uuid.uuid4().hex}",
            logging_steps=300,
            eval_steps=300,
            learning_rate=1e-5,
            adam_epsilon=1e-6,
            warmup_ratio=0.1,
            weight_decay=0.01,
            load_best_model_at_end = True,
            metric_for_best_model = 'f1'
        )

In [14]:
import torch
print(training_args.run_name, training_args.max_steps, training_args.report_to, torch.cuda.is_available(), config)

subset_classifier_result_f6f0c318b21e4216a86c663ba141dd1e 20000 ['wandb'] True OptimalSubsetClassifierConfig(max_length=66, debug=False, model_name='albert-base-v2', batch_size=8, max_steps=20000)


In [15]:
model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds if not config.debug else debug_ds,
    eval_dataset=valid_ds if not config.debug else debug_ds,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [None]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 200
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 20000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgarylai[0m ([33mjohnny-gary[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
300,0.3397,0.323195,0.91
600,0.2986,0.369262,0.91
900,0.1872,0.48169,0.91
1200,0.0615,0.769368,0.88
1500,0.0235,0.848982,0.9
1800,0.008,0.930836,0.9
2100,0.0002,0.99009,0.9
2400,0.0001,1.030447,0.9
2700,0.0001,1.064963,0.89
3000,0.0001,1.092522,0.89


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to ./subset_classifier_results\checkpoint-500
Configuration saved in ./subset_classifier_results\checkpoint-500\config.json
Model weights saved in ./subset_classifier_results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to ./subset_classifier_results\checkpoint-1000
Configuration saved in ./subset_classifier_results\checkpoint-1000\config.json
Model weights saved in ./subset_classifier_results\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to ./subset_classifier_results\checkpoint-1500
Configuration saved in ./subset_classifier_results\checkpoint-1500\config.json
Model weights saved in ./subset_classi

Configuration saved in ./subset_classifier_results\checkpoint-12000\config.json
Model weights saved in ./subset_classifier_results\checkpoint-12000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to ./subset_classifier_results\checkpoint-12500
Configuration saved in ./subset_classifier_results\checkpoint-12500\config.json
Model weights saved in ./subset_classifier_results\checkpoint-12500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to ./subset_classifier_results\checkpoint-13000
Configuration saved in ./subset_classifier_results\checkpoint-13000\config.json
Model weights saved in ./subset_classifier_results\checkpoint-13000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model 