In [7]:
import numpy as np
import pandas as pd
import os
import random
import matplotlib.pyplot as plt
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from scipy.stats import entropy

In [8]:
# One shared seed for the random number generators reachable from this
# notebook's imports, so a fresh "Restart & Run All" repeats the same draws.
SEED = 42
random.seed(SEED)     # Python RNG: used by Dataset.random_sampling
np.random.seed(SEED)  # NumPy global RNG
# NOTE(review): TensorFlow's own generator is not seeded here, so weight
# initialisation and batch shuffling may still vary between runs. Confirm
# whether tf.random.set_seed should be added for fully reproducible training.

In [14]:
# Folder that holds the datasets. The default is the original author's local
# path; set the AL_DATASETS_DIR environment variable to run the notebook on
# another machine without editing this cell.
datasets_dir = os.environ.get(
    "AL_DATASETS_DIR",
    r'C:/Users/D070651/Documents/Uni/thesis/Project_implementation/ActiveLearning/Datasets',
)

# Each split is reduced to a reproducible random subset of 5000 rows.
# Note: DataFrame.sample raises if a file has fewer than 5000 rows.
df_train = pd.read_csv(datasets_dir + '/DBpedia/DBPEDIA_train.csv').sample(5000, random_state=42)
df_val = pd.read_csv(datasets_dir + '/DBpedia/DBPEDIA_val.csv').sample(5000, random_state=42)


In [18]:
# List the distinct top-level ("l1") labels present in the sampled training set.
df_train["l1"].unique()

array(['Work', 'Place', 'Species', 'Agent', 'Event', 'UnitOfWork',
       'SportsSeason', 'TopicalConcept', 'Device'], dtype=object)

In [19]:
# Plain-text dump of the sampled training frame; pandas elides the middle rows.
print(df_train)

                                                     text       l1  \
174655  The Musical Jigsaw Play is a 1994 family music...     Work   
24221   Water Eaton House Bridge is a footbridge acros...    Place   
18945   The European Coatings Journal is an English-la...     Work   
192759  Penicillium viridicatum is a psychrophilic spe...  Species   
197276  Newark Beth Israel Medical Center, previously ...    Place   
...                                                   ...      ...   
170420  Cleo, Camping, Emmanuelle and Dick is a 1998 p...     Work   
150753  Paddy O'Keeffe (born 1864) was an Irish hurler...    Agent   
47272   Douglas Lee Beaudoin (born May 15, 1954 in Dic...    Agent   
228871  Sergeant John Pointon Beech (May 1, 1844 – Nov...    Agent   
118277  Nicholas Viscardi (October 20, 1920 – November...    Agent   

                            l2                      l3  
174655             WrittenWork                    Play  
24221    RouteOfTransportation               

In [20]:
# Build the vocabulary from the training texts only. Words that were not seen
# during fitting are later mapped to the out-of-vocabulary token, which here is
# the literal string "'oov'" (the single quotes are part of the token).
tokenizer=Tokenizer(oov_token="'oov'")
tokenizer.fit_on_texts(df_train['text'])

In [21]:
# Fixed sequence length fed to the model; also read by train_model.
maxlen = 200
# Turn each text into a list of token ids, then pad/truncate to maxlen.
# pad_sequences is used with its defaults (padding and truncation at the front).
train_X = pad_sequences(tokenizer.texts_to_sequences(df_train['text']), maxlen=maxlen)
val_X = pad_sequences(tokenizer.texts_to_sequences(df_val['text']), maxlen=maxlen)

In [22]:
# Map the string labels in column "l1" to integer ids. The encoder is fitted on
# the training labels only, so a validation label that never occurs in the
# training sample makes enc.transform raise.
enc = LabelEncoder()
enc.fit(df_train["l1"])

# Pin the one-hot width to the number of classes seen in training. Without
# num_classes, to_categorical infers the width from the largest id present in
# each array separately, so train_Y and val_Y could end up with different
# numbers of columns when a split happens to lack the highest class id.
num_classes = len(enc.classes_)
train_Y = to_categorical(enc.transform(df_train["l1"]), num_classes=num_classes)
val_Y = to_categorical(enc.transform(df_val["l1"]), num_classes=num_classes)

In [25]:
# Directory containing the pre-trained GloVe file. The default is the original
# author's local path; set the AL_DATASETS_DIR environment variable to point
# somewhere else without editing this cell.
glove_dir = os.environ.get(
    "AL_DATASETS_DIR",
    "C:/Users/D070651/Documents/Uni/thesis/Project_implementation/ActiveLearning/Datasets",
)

# word -> float32 vector, parsed from lines of the form "<word> <v1> <v2> ...".
embedding_index = {}
# The context manager closes the file even if a malformed line raises mid-loop.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
print('Found %s word vectors ' % len(embedding_index))

Found 400000 word vectors 


In [26]:
# Width of each word vector; must match the vectors stored in embedding_index.
embedding_dim = 100
# One row per tokenizer index, plus one because word_index starts counting at 1.
max_words = 1 + len(tokenizer.word_index)

# Rows start as zeros and stay zero for words without a pre-trained vector.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[idx] = embedding_index[word]

In [27]:
def train_model(X, Y, pool):
    """Train a fresh CNN text classifier and score it on the validation set.

    A new model is built from scratch on every call: a frozen embedding layer
    initialised from ``embedding_matrix``, one Conv1D + max-pooling stage, and
    a softmax output layer.

    Parameters
    ----------
    X : array
        Padded token-id sequences of the currently labeled samples.
    Y : 2-D array
        One-hot labels for ``X``; its number of columns sets the width of the
        output layer.
    pool : array
        Padded token-id sequences to predict on after training.

    Returns
    -------
    (val_acc, pool_predictions)
        Accuracy on the module-level ``val_X`` / ``val_Y`` and the softmax
        outputs for every row of ``pool``.
    """
    # Local import so the backend belongs to the same tensorflow.keras package
    # as the layers used below.
    from tensorflow.keras import backend as K

    # This function runs once per active-learning step. Clearing the session
    # drops the global Keras state left behind by the previous step's model,
    # which otherwise accumulates over the whole experiment.
    K.clear_session()

    # Size the output layer from the labels instead of hard-coding 9 classes.
    num_classes = Y.shape[1]

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="Adam", loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, Y, epochs=20, batch_size=64, verbose=0)

    # verbose=0 keeps predict() from printing its own progress output over the
    # caller's tqdm bar.
    val_true = np.argmax(val_Y, axis=1)
    val_pred = np.argmax(model.predict(val_X, verbose=0), axis=1)
    val_acc = accuracy_score(val_true, val_pred)

    pool_predictions = model.predict(pool, verbose=0)
    return val_acc, pool_predictions

In [28]:
class Dataset:
    """A sample pool plus a boolean mask recording which samples are labeled.

    ``X`` and ``Y`` are the full feature and label arrays; they must support
    boolean-mask indexing (e.g. NumPy arrays). The sampling methods only flip
    entries of the mask from False to True and never copy or reorder the data.
    """

    def __init__(self, X, Y):
        self._X = X
        self._Y = Y
        # Nothing is labeled yet. np.zeros keeps the bool dtype even for an
        # empty pool, where a list-built array would default to float.
        self._labeled = np.zeros(len(self._X), dtype=bool)

    @property
    def pool(self):
        """All samples, labeled or not."""
        return self._X

    @property
    def X(self):
        """Features of the samples labeled so far."""
        return self._X[self._labeled]

    @property
    def Y(self):
        """Labels of the samples labeled so far."""
        return self._Y[self._labeled]

    def random_sampling(self, batch_size):
        """Label up to ``batch_size`` unlabeled samples chosen uniformly at random.

        Draws use Python's global ``random`` module. If fewer than
        ``batch_size`` samples remain unlabeled, all of them are labeled.
        """
        not_labeled = np.where(~self._labeled)[0]
        # Cap the request at what is available: asking for more used to spin
        # forever, and an empty remainder made randrange(0, 0) raise.
        target = min(batch_size, len(not_labeled))
        chosen = set()  # set membership replaces the O(n) list scan
        while len(chosen) < target:
            r = random.randrange(0, len(not_labeled))
            chosen.add(int(not_labeled[r]))  # duplicates are ignored
        if chosen:
            self._labeled[list(chosen)] = True

    def lc_sampling(self, batch_size, predictions):
        """Least-confidence query: label the samples whose top probability is lowest.

        ``predictions[i]`` must be the class-probability vector of pool sample ``i``.
        """
        # Score = 1 - max probability; the highest scores come first.
        lc = sorted(((1 - np.max(p), i) for i, p in enumerate(predictions)), reverse=True)
        self._label_batch(lc, batch_size)

    def margin_sampling(self, batch_size, predictions):
        """Margin query: label the samples with the smallest gap between the
        two most probable classes.

        ``predictions[i]`` must be the class-probability vector of pool sample ``i``.
        """
        ms = []
        for i, p in enumerate(predictions):
            ordered = np.sort(p)  # ascending, so the top two are at the end
            ms.append((ordered[-1] - ordered[-2], i))
        ms.sort()  # the smallest margins come first
        self._label_batch(ms, batch_size)

    def entropy_sampling(self, batch_size, predictions):
        """Entropy query: label the samples whose predicted distribution has
        the highest entropy.

        ``predictions[i]`` must be the class-probability vector of pool sample ``i``.
        """
        es = sorted(((entropy(p), i) for i, p in enumerate(predictions)), reverse=True)
        self._label_batch(es, batch_size)

    def _label_batch(self, sorted_candidates, batch_size):
        """Label the first ``batch_size`` not-yet-labeled samples from
        ``sorted_candidates``, an iterable of ``(score, pool_index)`` pairs in
        priority order."""
        remaining = batch_size
        for _, j in sorted_candidates:
            # Check the budget before labeling so batch_size == 0 labels nothing.
            if remaining <= 0:
                break
            if not self._labeled[j]:  # skip samples that are already labeled
                self._labeled[j] = True
                remaining -= 1

In [29]:
def active_learning(query_strategy, seed_size, batch_size, num_steps):
    """Run one active-learning experiment on the training pool.

    Starts from ``seed_size`` randomly labeled samples, then performs
    ``num_steps`` rounds that each label ``batch_size`` more samples with the
    chosen strategy and retrain the model from scratch.

    query_strategy - 'lc' for Least confidence sampling
                   - 'ms' for Margin sampling
                   - 'es' for Entropy sampling
                   - 'rs' for Random sampling

    Returns the list of validation accuracies: one for the seed model followed
    by one per round (``num_steps + 1`` values).
    """
    assert query_strategy in ["lc", "ms", "es", "rs"], "Unknown query strategy"

    dataset = Dataset(train_X, train_Y)
    dataset.random_sampling(seed_size)
    score, pool_scores = train_model(dataset.X, dataset.Y, dataset.pool)
    history = [score]

    # Strategies that rank the pool using the previous model's predictions.
    uncertainty_queries = {
        "lc": dataset.lc_sampling,
        "ms": dataset.margin_sampling,
        "es": dataset.entropy_sampling,
    }

    for _ in tqdm(range(num_steps)):
        if query_strategy in uncertainty_queries:
            uncertainty_queries[query_strategy](batch_size, pool_scores)
        elif query_strategy == "rs":
            dataset.random_sampling(batch_size)
        score, pool_scores = train_model(dataset.X, dataset.Y, dataset.pool)
        history.append(score)

    return history


In [30]:
# Labeling budget shared by all experiments: start from seed_size random
# labels, then add batch_size labels in each of num_steps query rounds
# (100 + 98 * 50 = 5000, i.e. the whole training sample by the last round).
seed_size=100
batch_size=50
num_steps=98

In [None]:
# Baseline run: every batch is picked uniformly at random.
random_accuracies = active_learning("rs", seed_size, batch_size, num_steps)

 71%|██████████████████████████████████████████████████████████▌                       | 70/98 [22:52<18:24, 39.46s/it]

In [None]:
# Least-confidence sampling run, same budget as the baseline.
lc_accuracies = active_learning("lc", seed_size, batch_size, num_steps)

In [None]:
# Margin sampling run, same budget as the baseline.
ms_accuracies = active_learning("ms", seed_size, batch_size, num_steps)

In [None]:
# Entropy sampling run, same budget as the baseline.
es_accuracies = active_learning("es", seed_size, batch_size, num_steps)

In [None]:
plt.figure(figsize=(10, 8))

# x-axis shared by all curves: labeled-set size after the seed and after each round.
labeled_counts = np.arange(seed_size, seed_size + (num_steps + 1) * batch_size, batch_size)

# One curve per query strategy, drawn in the same order as before.
for curve, color, label in [
    (random_accuracies, "b", "Random Sampling"),
    (lc_accuracies, "g", "Least Confidence Sampling"),
    (ms_accuracies, "r", "Margin Sampling"),
    (es_accuracies, "y", "Entropy Sampling"),
]:
    plt.plot(labeled_counts, curve, color=color, label=label)

plt.legend(loc="lower right")
plt.title("Active Learning on DBPedia Classes Dataset")
plt.ylabel('Accuracy')
plt.xlabel('Labeled data')
plt.grid()

# Accuracy after labeling 2000 data points


In [None]:
# Labeled-set size reached after the seed and after each query round.
labeled_sizes = np.arange(seed_size, seed_size + (num_steps + 1) * batch_size, batch_size)
# Position of the 2000-label mark in the accuracy lists
# (raises IndexError if 2000 is not one of the sizes).
matches = np.where(labeled_sizes == 2000)[0]
idx = matches[0]

In [None]:
# Validation accuracy of random sampling at the 2000-label mark.
random_accuracies[idx]

In [None]:
# Validation accuracy of least-confidence sampling at the 2000-label mark.
lc_accuracies[idx]

# Number of labeled data points required to reach an accuracy of 85%


In [None]:
def find_nearest(array, value, start=None, step=None):
    """Return the labeled-set size whose recorded accuracy is closest to ``value``.

    ``array`` holds one accuracy per labeling stage, where stage ``k``
    corresponds to ``start + k * step`` labeled samples.

    Parameters
    ----------
    array : sequence of float
        Accuracies in stage order; must not be empty (``argmin`` raises
        ValueError on an empty array).
    value : float
        Target accuracy.
    start, step : int, optional
        Labeled-set size at stage 0 and the growth per stage. When omitted
        they fall back to the notebook-level ``seed_size`` and ``batch_size``,
        which is what the original two-argument calls rely on.

    Notes
    -----
    "Closest" is measured by absolute difference, so the returned stage may
    have an accuracy below ``value``. On ties the earliest stage wins.
    """
    # Read the globals only when the caller did not supply explicit values.
    start = seed_size if start is None else start
    step = batch_size if step is None else step

    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return start + idx * step

In [None]:
# Labeled-set size at which random sampling's accuracy is closest to 0.85.
find_nearest(random_accuracies, 0.85) #random sampling

In [None]:
# Labeled-set size at which least-confidence sampling's accuracy is closest to 0.85.
find_nearest(lc_accuracies, 0.85) #least confidence

In [None]:
#This Notebook has been released under the Apache 2.0 open source license.