<a href="https://colab.research.google.com/github/dineshgumray/ActiveLearning/blob/main/EmbeddingKMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install small-text[transformers]  # use "small-text" without "[transformers]" if you want to work on the CPU only
%pip install datasets

In [18]:
import datasets
import logging
import numpy as np


# Disables the progress bar for notebooks by overriding the verbosity getter:
# https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

raw_dataset = datasets.load_dataset('banking77')

# Number of distinct label values that occur in the training split.
num_classes = np.unique(raw_dataset['train']['label']).shape[0]

print('First 10 training samples:\n')
# Fetch the first ten rows once. Indexing a whole column inside the loop
# (raw_dataset['train']['label'][i]) re-reads the entire column on every
# iteration just to pick a single element from it.
preview = raw_dataset['train'][:10]
for label, text in zip(preview['label'], preview['text']):
    print(label, ' ', text)



First 10 training samples:

11   I am still waiting on my card?
11   What can I do if my card still hasn't arrived after 2 weeks?
11   I have been waiting over a week. Is the card still coming?
11   Can I track my card while it is in the process of delivery?
11   How do I know if I will get my card, or if it is lost?
11   When did you send me my new card?
11   Do you have info about the card on delivery?
11   What do I do if I still have not received my new card?
11   Does the package with my card have tracking?
11   I ordered my card but it still isn't here


In [19]:
import transformers
from transformers import AutoTokenizer

# Apply to transformers the same verbosity-getter override that the dataset
# cell applies to `datasets`: get_verbosity now always returns logging.NOTSET.
transformers.logging.get_verbosity = lambda: logging.NOTSET

# Checkpoint identifier; also passed to TransformerModelArguments further down.
transformer_model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [20]:
from small_text.integrations.transformers.datasets import TransformersDataset


def get_transformers_dataset(tokenizer, data, labels, max_length=60):
    """Tokenize ``data`` and wrap the encodings in a ``TransformersDataset``.

    Every document is encoded individually with ``tokenizer.encode_plus``,
    padded or truncated to ``max_length``, and paired with the label stored
    at the same position in ``labels``.

    Args:
        tokenizer: Object exposing ``encode_plus`` (in this notebook, the
            tokenizer returned by ``AutoTokenizer.from_pretrained``).
        data: Iterable of documents to encode.
        labels: Indexable collection; ``labels[i]`` is used as the label of
            the i-th document.
        max_length: Length each encoded sequence is padded/truncated to.

    Returns:
        A ``TransformersDataset`` built from a list of
        ``(input_ids, attention_mask, label)`` tuples.
    """
    # NOTE(review): recent transformers docs mark `encode_plus` as deprecated
    # in favour of calling the tokenizer directly - confirm before upgrading.
    encode_options = dict(
        add_special_tokens=True,
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first',
    )

    def encode_row(position, document):
        # One tuple per document, in the order TransformersDataset receives it.
        encoding = tokenizer.encode_plus(document, **encode_options)
        return encoding['input_ids'], encoding['attention_mask'], labels[position]

    rows = [encode_row(position, document) for position, document in enumerate(data)]
    return TransformersDataset(rows)


# Encode both splits the same way: 'train' first, then 'test'.
train, test = (
    get_transformers_dataset(tokenizer, raw_dataset[split]['text'], raw_dataset[split]['label'])
    for split in ('train', 'test')
)

In [21]:
from small_text.active_learner import PoolBasedActiveLearner

from small_text.initialization import random_initialization_balanced
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import PredictionEntropy,LeastConfidence
from small_text.query_strategies import BreakingTies
from small_text.query_strategies import EmbeddingKMeans
from small_text.integrations.transformers import TransformerModelArguments


# simulates an initial labeling to warm-start the active learning process
def initialize_active_learner(active_learner, y_train, n_samples=1000):
    """Draw an initial labeled set and hand it to the active learner.

    Args:
        active_learner: Learner exposing ``initialize_data(indices, labels)``.
        y_train: Labels of the whole pool. It is indexed with the drawn
            indices, so it is assumed to support array-style fancy indexing
            (e.g. a numpy array) - TODO confirm for other dataset types.
        n_samples: Size of the initial labeled set. Defaults to 1000, the
            value that used to be hard-coded here.

    Returns:
        The indices returned by ``random_initialization_balanced``, i.e. the
        pool positions that make up the initial labeled set.
    """
    x_indices_initial = random_initialization_balanced(y_train, n_samples=n_samples)
    y_initial = y_train[x_indices_initial]

    active_learner.initialize_data(x_indices_initial, y_initial)

    return x_indices_initial

# Options forwarded to the factory through its `kwargs` argument.
# NOTE(review): -1 for 'early_stopping_no_improvement' presumably disables
# that stopping criterion - confirm against the small-text version in use.
classifier_kwargs = {
    'mini_batch_size': 32,
    'early_stopping_no_improvement': -1,
}

transformer_model = TransformerModelArguments(transformer_model_name)
clf_factory = TransformerBasedClassificationFactory(
    transformer_model,
    num_classes,
    kwargs=classifier_kwargs,
)

query_strategy = EmbeddingKMeans()
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

# Warm-start the learner; the returned indices track which pool samples are labeled.
labeled_indices = initialize_active_learner(active_learner, train.y)

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the log file opened in a later cell
# lives under /content/drive/MyDrive, so this cell has to run first.
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
# Append-mode log handle on the mounted Drive. `evaluate` writes its reports
# to `f`, and the experiment cell closes it once the loop has finished.
log_path = "/content/drive/MyDrive/EmbeddingKMeans_log.txt"
f = open(log_path, mode="a")

In [22]:
import sys
# Raise numpy's summarization threshold to sys.maxsize so arrays are printed
# in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Number of query/update rounds the loop below runs after the initial evaluation.
num_queries = 5

def evaluate(active_learner, train, test, log_file=None):
    """Score the current classifier on ``train`` and ``test`` and log the results.

    Accuracies and per-class classification reports are printed to the
    console and written to the log file.

    Args:
        active_learner: Learner whose ``classifier.predict`` is evaluated.
        train: Dataset with a ``.y`` label attribute; callers in this
            notebook pass the currently labeled subset of the pool.
        test: Held-out dataset with a ``.y`` label attribute.
        log_file: Writable text handle for the report. Defaults to the
            notebook-level handle ``f`` so existing three-argument calls
            keep working.

    Returns:
        The accuracy on ``test``.
    """
    # Resolve the default lazily so the global `f` is only required when the
    # caller does not supply a handle of their own.
    if log_file is None:
        log_file = f

    y_pred = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)

    # scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the
    # value is unchanged, but the call now matches classification_report below.
    train_acc = accuracy_score(train.y, y_pred)
    test_acc = accuracy_score(test.y, y_pred_test)

    # Build each report once and reuse it for both the console and the file.
    train_report = classification_report(train.y, y_pred)
    test_report = classification_report(test.y, y_pred_test)

    print("Train accuracy: {}".format(train_acc))
    print("Test accuracy: {}".format(test_acc))
    print("Train Report \n")
    print(train_report)
    print("Test Report \n")
    print(test_report)

    log_file.write("\n Train Report \n")
    log_file.write(train_report)
    log_file.write("\n Test Report \n")
    log_file.write(test_report)
    log_file.write("\n")
    log_file.write("Train accuracy: ")
    log_file.write(str(train_acc))
    log_file.write("\n")
    log_file.write("Test accuracy: ")
    log_file.write(str(test_acc))
    return test_acc


# Number of pool samples requested from the query strategy in each round.
query_size = 1000

# Test accuracy after the warm start and after every query round.
results = []

try:
    # Baseline: evaluate the model trained on the initial labeled set only.
    results.append(evaluate(active_learner, train[labeled_indices], test))

    for i in range(num_queries):
        # ...where each iteration consists of labelling `query_size` samples
        q_indices = active_learner.query(num_samples=query_size)

        # Simulate user interaction here. Replace this for real-world usage.
        y = train.y[q_indices]

        # Return the labels for the current query to the active learner.
        active_learner.update(y)

        labeled_indices = np.concatenate([q_indices, labeled_indices])

        # NOTE: the console shows the 0-based round index while the log file
        # records i + 1; both are kept as they were.
        print('Iteration #{:d} ({} samples)'.format(i, len(labeled_indices)))
        f.write("\n############################################################\n")
        f.write("Iteration ")
        f.write(str(i+1))
        f.write("\t Samples ")
        f.write(str(len(labeled_indices)))
        results.append(evaluate(active_learner, train[labeled_indices], test))
finally:
    # Close the log handle even when a query, update or evaluation step
    # raises, so buffered output is flushed to Drive and the handle is not leaked.
    f.close()

Train accuracy: 0.887
Test accuracy: 0.6175324675324675
Train Report 

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       0.91      0.83      0.87        12
           2       1.00      1.00      1.00        14
           3       1.00      0.83      0.91        12
           4       1.00      1.00      1.00        12
           5       0.91      0.77      0.83        13
           6       0.91      0.83      0.87        12
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00        15
           9       0.88      0.93      0.90        15
          10       0.92      0.92      0.92        12
          11       0.92      0.92      0.92        13
          12       1.00      1.00      1.00        13
          13       1.00      0.92      0.96        13
          14       1.00      0.83      0.91        12
          15       0.76      0.93      0.84        14
          

100%|██████████| 10003/10003 [01:10<00:00, 141.70it/s]


Iteration #0 (2000 samples)
Train accuracy: 0.948
Test accuracy: 0.8224025974025974
Train Report 

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        27
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        25
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        31
           5       0.84      0.97      0.90        33
           6       0.83      0.94      0.88        31
           7       1.00      1.00      1.00        28
           8       0.97      1.00      0.99        34
           9       1.00      0.91      0.95        22
          10       1.00      0.85      0.92        20
          11       0.87      1.00      0.93        34
          12       0.96      1.00      0.98        26
          13       0.93      0.93      0.93        30
          14       0.96      0.90      0.93        29
          15       0.97      1.00   

100%|██████████| 10003/10003 [01:10<00:00, 141.49it/s]


Iteration #1 (3000 samples)
Train accuracy: 0.9833333333333333
Test accuracy: 0.8909090909090909
Train Report 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        38
           2       1.00      1.00      1.00        34
           3       1.00      1.00      1.00        28
           4       1.00      1.00      1.00        39
           5       0.96      0.94      0.95        53
           6       1.00      0.98      0.99        49
           7       1.00      0.98      0.99        43
           8       1.00      0.98      0.99        50
           9       1.00      0.97      0.98        33
          10       0.97      1.00      0.98        29
          11       0.98      0.98      0.98        50
          12       0.98      1.00      0.99        40
          13       1.00      1.00      1.00        47
          14       1.00      0.98      0.99        43
          15       1.00

100%|██████████| 10003/10003 [01:10<00:00, 141.51it/s]


Iteration #2 (4000 samples)
Train accuracy: 0.985
Test accuracy: 0.9152597402597402
Train Report 

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        41
           1       1.00      1.00      1.00        45
           2       1.00      1.00      1.00        39
           3       1.00      1.00      1.00        43
           4       1.00      0.98      0.99        49
           5       1.00      0.96      0.98        76
           6       1.00      1.00      1.00        66
           7       0.98      0.93      0.96        59
           8       1.00      1.00      1.00        64
           9       1.00      1.00      1.00        40
          10       0.98      1.00      0.99        40
          11       0.98      1.00      0.99        62
          12       1.00      0.98      0.99        53
          13       1.00      1.00      1.00        58
          14       0.98      0.97      0.97        60
          15       0.98      0.98   

100%|██████████| 10003/10003 [01:10<00:00, 141.64it/s]


Iteration #3 (5000 samples)
Train accuracy: 0.984
Test accuracy: 0.9191558441558442
Train Report 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        46
           1       1.00      1.00      1.00        52
           2       1.00      0.98      0.99        43
           3       1.00      1.00      1.00        51
           4       1.00      1.00      1.00        53
           5       0.96      0.98      0.97       101
           6       0.98      0.99      0.98        84
           7       0.97      0.96      0.97        77
           8       1.00      0.99      0.99        86
           9       1.00      1.00      1.00        56
          10       1.00      1.00      1.00        51
          11       0.99      0.96      0.97        81
          12       1.00      0.97      0.98        65
          13       1.00      1.00      1.00        72
          14       0.96      1.00      0.98        70
          15       0.97      1.00   

100%|██████████| 10003/10003 [01:10<00:00, 141.62it/s]


Iteration #4 (6000 samples)
Train accuracy: 0.9851666666666666
Test accuracy: 0.9292207792207792
Train Report 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00        61
           2       1.00      0.98      0.99        52
           3       1.00      0.98      0.99        56
           4       1.00      1.00      1.00        61
           5       0.99      0.97      0.98       117
           6       0.98      1.00      0.99       100
           7       1.00      0.99      0.99        97
           8       1.00      1.00      1.00        99
           9       1.00      0.97      0.98        65
          10       1.00      0.98      0.99        54
          11       0.96      0.98      0.97        97
          12       0.99      0.96      0.97        80
          13       0.99      0.99      0.99        86
          14       0.98      0.99      0.98        80
          15       0.97