<a href="https://colab.research.google.com/github/dineshgumray/ActiveLearning/blob/main/AL_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install small-text[transformers]  # use "small-text" without "[transformers]" if you want to work on the CPU only
%pip install datasets

In [None]:
import datasets
import logging
import numpy as np


# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

# Load the banking77 dataset from the Hugging Face hub (or the local cache).
raw_dataset = datasets.load_dataset('banking77')
# Number of distinct label ids that occur in the training split.
num_classes = np.unique(raw_dataset['train']['label']).shape[0]

# Slice the split once instead of indexing the full 'label' / 'text' columns
# on every loop iteration; the slice is a dict mapping column name -> list.
preview = raw_dataset['train'][:10]

print('First 10 training samples:\n')
for label, text in zip(preview['label'], preview['text']):
    print(label, ' ', text)

In [None]:
import transformers
from transformers import AutoTokenizer

# Make transformers' logging module report NOTSET as its verbosity.
transformers.logging.get_verbosity = lambda: logging.NOTSET

# Checkpoint name; it is reused further down when the classifier factory is built.
transformer_model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [4]:
from small_text.integrations.transformers.datasets import TransformersDataset


def get_transformers_dataset(tokenizer, data, labels, max_length=60):
    """Tokenize ``data`` and wrap the encodings in a ``TransformersDataset``.

    Args:
        tokenizer: Callable tokenizer; it is invoked once per text and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``.
        data: Sized iterable of input texts.
        labels: Sized iterable of labels, aligned with ``data`` by position.
        max_length: Length every text is padded / truncated to.

    Returns:
        A ``TransformersDataset`` built from a list of
        ``(input_ids, attention_mask, label)`` tuples, one per text.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """
    if len(data) != len(labels):
        raise ValueError(
            'data and labels must have the same length, got {} and {}'.format(
                len(data), len(labels)))

    data_out = []

    for doc, label in zip(data, labels):
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; the keyword arguments are unchanged.
        encoded_dict = tokenizer(
            doc,
            add_special_tokens=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation='longest_first'
        )

        data_out.append((encoded_dict['input_ids'], encoded_dict['attention_mask'], label))

    return TransformersDataset(data_out)


train = get_transformers_dataset(tokenizer, raw_dataset['train']['text'], raw_dataset['train']['label'])
test = get_transformers_dataset(tokenizer, raw_dataset['test']['text'], raw_dataset['test']['label'])

In [29]:
from small_text.active_learner import PoolBasedActiveLearner

from small_text.initialization import random_initialization_balanced
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import PredictionEntropy
from small_text.query_strategies import LeastConfidence
from small_text.query_strategies import BreakingTies
from small_text.integrations.transformers import TransformerModelArguments


# simulates an initial labeling to warm-start the active learning process
def initialize_active_learner(active_learner, y_train, n_samples=1000):
    """Seed ``active_learner`` with an initial labeled subset of the pool.

    Args:
        active_learner: Object exposing ``initialize_data(indices, labels)``.
        y_train: Labels of the full pool; must support indexing with the
            index array returned by ``random_initialization_balanced``.
        n_samples: Size of the initial labeled set (default 1000, the value
            that was previously hard-coded).

    Returns:
        The indices chosen by ``random_initialization_balanced``.
    """
    x_indices_initial = random_initialization_balanced(y_train, n_samples=n_samples)
    y_initial = y_train[x_indices_initial]

    active_learner.initialize_data(x_indices_initial, y_initial)

    return x_indices_initial

# Describe the transformer checkpoint and build the factory the active learner
# receives for creating classifiers.
transformer_model = TransformerModelArguments(transformer_model_name)
clf_factory = TransformerBasedClassificationFactory(
    transformer_model,
    num_classes,
    # NOTE(review): -1 presumably disables early stopping; confirm against the small-text docs.
    kwargs={
        'mini_batch_size': 32,
        'early_stopping_no_improvement': -1,
    },
)
query_strategy = BreakingTies()

# Create the pool-based learner over the tokenized training set and seed it
# with the simulated initial labeling.
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
labeled_indices = initialize_active_learner(active_learner, train.y)

In [31]:
# Keep a handle on the warm-start indices so a later cell can reset
# `labeled_indices` back to them. This binds a second name to the same
# object; no copy is made.
initial_labeled_indices = labeled_indices

In [None]:
# Mount Google Drive at /content/drive (Colab only); the log file and the
# saved learners in the following cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [13]:
# Open the run log in append mode. `evaluate` and the query loop write to this
# module-level handle.
# NOTE(review): the only `f.close()` sits after the query loop, so the handle
# stays open across cells and is not closed if an earlier cell fails.
f = open("/content/drive/MyDrive/log1.txt", "a")

In [9]:
# Persist the current active learner to Drive so it can be reloaded later.
active_learner.save('/content/drive/MyDrive/initial_active_leaner.pkl')

In [41]:
# Replace the in-memory learner with the one stored at the path used by the
# save cell above.
# NOTE(review): presumably pickle-based given the .pkl suffix; only load files you trust.
active_learner = PoolBasedActiveLearner.load('/content/drive/MyDrive/initial_active_leaner.pkl')

In [None]:
# Display how many pool indices are currently labeled.
len(labeled_indices)

In [None]:
# Start a fresh learner that uses the LeastConfidence query strategy but the
# same classifier factory and training pool as before.
query_strategy = LeastConfidence()
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
print(active_learner.query_strategy)
# Reset to the warm-start indices so this run begins from the same labeled set.
labeled_indices = initial_labeled_indices
print(len(labeled_indices))
# Labels are looked up from the pool rather than re-sampled.
active_learner.initialize_data(labeled_indices, train.y[labeled_indices])

LeastConfidence()
1000


In [None]:
import sys
# Raise NumPy's summarisation threshold to sys.maxsize so printed arrays are
# shown in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

def evaluate(active_learner, train, test, log_file=None):
    """Score the learner's classifier on ``train`` and ``test`` and log the result.

    Prints both accuracies and both classification reports, and writes the
    same reports followed by the accuracies to the log.

    Args:
        active_learner: Object whose ``classifier.predict(dataset)`` returns
            predicted labels.
        train: Dataset exposing the true labels as ``.y``.
        test: Dataset exposing the true labels as ``.y``.
        log_file: Optional writable text stream. When ``None`` (the default)
            the module-level handle ``f`` is used, as before.

    Returns:
        The accuracy on ``test``.
    """
    # Resolve the sink lazily so passing `log_file` works without a global `f`.
    sink = f if log_file is None else log_file

    y_pred = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)

    # scikit-learn's metric signature is (y_true, y_pred).
    train_acc = accuracy_score(train.y, y_pred)
    test_acc = accuracy_score(test.y, y_pred_test)

    # Build each report once and reuse it for both the console and the log.
    train_report = classification_report(train.y, y_pred)
    test_report = classification_report(test.y, y_pred_test)

    print("Train accuracy: {}".format(train_acc))
    print("Test accuracy: {}".format(test_acc))
    print("Train Report \n")
    print(train_report)
    print("Test Report \n")
    print(test_report)

    sink.write(
        "\n Train Report \n" + train_report
        + "\n Test Report \n" + test_report
        + "\n"
        + "Train accuracy: " + str(train_acc)
        + "\n"
        + "Test accuracy: " + str(test_acc)
    )
    return test_acc


# Accuracy history: starts with the score of the warm-started model, and the
# query loop adds one entry per iteration.
results = []
results.extend([evaluate(active_learner, train[labeled_indices], test)])


In [None]:
num_queries = 5
# Number of pool samples labelled in each active learning iteration.
query_size = 1000

# try/finally guarantees the log handle is closed even when an iteration fails.
try:
    for i in range(num_queries):
        # 1-based iteration number, used for both the console and the log.
        iteration = i + 1

        # ...where each iteration consists of labelling `query_size` samples
        q_indices = active_learner.query(num_samples=query_size)

        # Simulate user interaction here. Replace this for real-world usage.
        y = train.y[q_indices]

        # Return the labels for the current query to the active learner.
        active_learner.update(y)

        labeled_indices = np.concatenate([q_indices, labeled_indices])

        print('Iteration #{:d} ({} samples)'.format(iteration, len(labeled_indices)))
        f.write("\n############################################################\n")
        f.write("Iteration ")
        f.write(str(iteration))
        f.write("\t Samples ")
        f.write(str(len(labeled_indices)))
        results.append(evaluate(active_learner, train[labeled_indices], test))
finally:
    f.close()

In [None]:
# Persist the learner as it stands after the query loop (a different file
# from the initial snapshot).
active_learner.save('/content/drive/MyDrive/active_leaner.pkl')