<a href="https://colab.research.google.com/github/dineshgumray/ActiveLearning/blob/main/BreakingTies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install small-text[transformers]  # use "small-text" without "[transformers]" if you want to work on the CPU only
%pip install datasets

Collecting small-text[transformers]
  Downloading small_text-1.0.0b2-py3-none-any.whl (121 kB)
[?25l[K     |██▊                             | 10 kB 16.7 MB/s eta 0:00:01[K     |█████▍                          | 20 kB 22.2 MB/s eta 0:00:01[K     |████████                        | 30 kB 24.8 MB/s eta 0:00:01[K     |██████████▉                     | 40 kB 11.8 MB/s eta 0:00:01[K     |█████████████▌                  | 51 kB 11.2 MB/s eta 0:00:01[K     |████████████████▏               | 61 kB 12.9 MB/s eta 0:00:01[K     |███████████████████             | 71 kB 10.7 MB/s eta 0:00:01[K     |█████████████████████▋          | 81 kB 11.7 MB/s eta 0:00:01[K     |████████████████████████▎       | 92 kB 12.9 MB/s eta 0:00:01[K     |███████████████████████████     | 102 kB 12.3 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 112 kB 12.3 MB/s eta 0:00:01[K     |████████████████████████████████| 121 kB 12.3 MB/s 
Collecting transformers>=4.0.0
  Downloading transform

In [2]:
import datasets
import logging
import numpy as np


# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

raw_dataset = datasets.load_dataset('banking77')
num_classes = np.unique(raw_dataset['train']['label']).shape[0]

print('First 10 training samples:\n')
for i in range(10):
    print(raw_dataset['train']['label'][i], ' ', raw_dataset['train']['text'][i])

Using custom data configuration default


Downloading and preparing dataset banking77/default (download: 1.03 MiB, generated: 897.51 KiB, post-processed: Unknown size, total: 1.91 MiB) to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b...
Dataset banking77 downloaded and prepared to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b. Subsequent calls will reuse this data.
First 10 training samples:

11   I am still waiting on my card?
11   What can I do if my card still hasn't arrived after 2 weeks?
11   I have been waiting over a week. Is the card still coming?
11   Can I track my card while it is in the process of delivery?
11   How do I know if I will get my card, or if it is lost?
11   When did you send me my new card?
11   Do you have info about the card on delivery?
11   What do I do if I still have not received my new card?
11   Does the package with my card have tracking?
11

In [3]:
import transformers
from transformers import AutoTokenizer

transformers.logging.get_verbosity = lambda: logging.NOTSET


transformer_model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [17]:
from small_text.integrations.transformers.datasets import TransformersDataset


def get_transformers_dataset(tokenizer, data, labels, max_length=60):

    data_out = []

    for i, doc in enumerate(data):
        encoded_dict = tokenizer.encode_plus(
            doc,
            add_special_tokens=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation='longest_first'
        )

        data_out.append((encoded_dict['input_ids'],
                        encoded_dict['attention_mask'],
                        labels[i]))

    return TransformersDataset(data_out)


train = get_transformers_dataset(tokenizer, raw_dataset['train']['text'], raw_dataset['train']['label'])
test = get_transformers_dataset(tokenizer, raw_dataset['test']['text'], raw_dataset['test']['label'])

In [19]:
from small_text.active_learner import PoolBasedActiveLearner

from small_text.initialization import random_initialization_balanced
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import BreakingTies
from small_text.integrations.transformers import TransformerModelArguments


# simulates an initial labeling to warm-start the active learning process
def initialize_active_learner(active_learner, y_train):
    

    x_indices_initial = random_initialization_balanced(y_train, n_samples=1000)
    y_initial = y_train[x_indices_initial]

    active_learner.initialize_data(x_indices_initial, y_initial)

    return x_indices_initial

transformer_model = TransformerModelArguments(transformer_model_name)
clf_factory = TransformerBasedClassificationFactory(transformer_model, 
                                                    num_classes, 
                                                    kwargs=dict({
                                                                 'mini_batch_size': 30,
                                                                 'early_stopping_no_improvement': -1
                                                                }))
query_strategy = BreakingTies()

active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
labeled_indices = initialize_active_learner(active_learner, train.y)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [22]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
f = open("/content/drive/MyDrive/BreakingTies_log.txt", "a")

In [24]:
import sys
np.set_printoptions(threshold=sys.maxsize)

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import pandas as pd

num_queries = 8


def evaluate(active_learner, train, test):
    y_pred = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)
    test_acc = accuracy_score(y_pred_test, test.y)
    f.write("\n")
    f.write(classification_report(train.y, y_pred))
    f.write("\n")
    f.write("Train accuracy: ")
    f.write(str(accuracy_score(y_pred, train.y)))
    f.write("\n")
    f.write("Test accuracy: ")
    f.write(str(test_acc))

    return test_acc


results = []
results.append(evaluate(active_learner, train[labeled_indices], test))

    
for i in range(num_queries):
    # ...where each iteration consists of labelling 1000 samples
    q_indices = active_learner.query(num_samples=1000)

    # Simulate user interaction here. Replace this for real-world usage.
    y = train.y[q_indices]

    # Return the labels for the current query to the active learner.
    active_learner.update(y)

    labeled_indices = np.concatenate([q_indices, labeled_indices])

    print('Iteration #{:d} ({} samples)'.format(i, len(labeled_indices)))
    f.write("#################################################################")
    f.write("Iteration ")
    f.write(str(i+1))
    f.write("\t Samples ")
    f.write(str(len(labeled_indices)))
    results.append(evaluate(active_learner, train[labeled_indices], test))

Iteration #0 (2000 samples)
Iteration #1 (3000 samples)
Iteration #2 (4000 samples)
Iteration #3 (5000 samples)
Iteration #4 (6000 samples)
Iteration #5 (7000 samples)
Iteration #6 (8000 samples)
Iteration #7 (9000 samples)


In [28]:

f.close()