In [1]:
# install the modAL package
!pip install modAL

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting modAL
  Downloading modAL-0.4.1-py3-none-any.whl (27 kB)
Installing collected packages: modAL
Successfully installed modAL-0.4.1


In [2]:
# import utilized packages
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from modAL.models import ActiveLearner
from sklearn.metrics import accuracy_score

In [3]:
# Feature array of the US8K dataset, stored as a NumPy file.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so this file must come from a trusted source.
features = np.load('./drive/MyDrive/dataset_US.npy', allow_pickle=True)
# Ground-truth labels: read only the 'classID' column of the metadata CSV,
# parsed directly as 32-bit integers.
df = pd.read_csv(
    './drive/MyDrive/UrbanSound8K.csv',
    usecols=['classID'],
    dtype={'classID': np.int32},
)
# Independent 1-D copy of the label column, one entry per metadata row.
data_class = df['classID'].to_numpy(copy=True)

In [13]:
# Semi-supervised annotation pipeline.
#   1. Seed the labelled set with selected LOF inliers plus a per-class random draw.
#   2. Ask modAL's ActiveLearner for additional samples to label.
#   3. Run several self-training rounds that accept confident predictions on inliers.
#   4. Label whatever is still unlabelled with a final model and report the accuracy.
# Expects `features` and `data_class` (ground-truth class ids) from the loading cell.

# --- tunable constants (same values as the original experiment) ---------------
N_CLASSES = 10              # class ids 0 .. N_CLASSES-1 are iterated below
LOF_SCORE_CUTOFF = -1.4     # keep inliers whose negative_outlier_factor_ is below this
SAMPLING_SEED = 6           # seed for the per-class random draw
CONFIDENCE_CUTOFF = 0.7     # minimum predicted probability to accept a pseudo-label
N_ROUNDS = 4                # number of self-training rounds
UNLABELLED = -1             # sentinel stored in `prediction` for "no label yet"

# labelling classifier: one XGBoost model per class
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime — confirm
labeler = OneVsRestClassifier(
    XGBClassifier(tree_method='gpu_hist', max_depth=50, max_leaves=80, random_state=13),
    n_jobs=-1,
)
# label assigned to every sample so far (UNLABELLED where none has been assigned)
prediction = np.full_like(data_class, UNLABELLED)

# --- step 1a: outlier detection ------------------------------------------------
detector = LocalOutlierFactor(n_neighbors=1, n_jobs=-1)
# status is +1 for samples LOF considers inliers, -1 for outliers
status = detector.fit_predict(features)
# inliers whose score is still below the cut-off (the original note puts this
# at roughly the top 2.5% of samples — not verifiable from this cell)
inliers = np.flatnonzero(
    (status == 1) & (detector.negative_outlier_factor_ < LOF_SCORE_CUTOFF)
)
selection = inliers

# --- step 1b: random draw of 7.5% (3/40) of every class -------------------------
known_indices = []
for class_index in range(N_CLASSES):
    # all samples of this class, minus those already selected as inliers
    class_indices = np.flatnonzero(data_class == class_index)
    new_indices = np.setdiff1d(class_indices, selection)
    # NOTE(review): the generator is re-seeded for every class; this is kept
    # as-is so that the draw matches the original experiment exactly
    np.random.seed(SAMPLING_SEED)
    # integer arithmetic kept verbatim so the per-class counts do not change
    sample_count = 3 * class_indices.shape[0] // 40
    selected_indices = np.random.choice(new_indices, size=sample_count, replace=False)
    known_indices.extend(selected_indices)
selection = np.concatenate((selection, np.array(known_indices, dtype=int)))

# --- step 2: active learning query for 5% (1/20) of the samples -----------------
learner = ActiveLearner(
    estimator=labeler,
    X_training=features[selection],
    y_training=data_class[selection],
)
query_idx, query_inst = learner.query(features, n_instances=data_class.shape[0] // 20)
# The query runs on the full pool, so it may return samples that are already
# selected. Append only the new ones (order preserved) so that no training row
# is duplicated — and thereby double-weighted — in the first fit below.
queried = np.asarray(query_idx)
selection = np.concatenate((selection, queried[~np.isin(queried, selection)]))
# register the ground-truth label of every selected sample
prediction[selection] = data_class[selection]

# --- step 3: self-training rounds ----------------------------------------------
for iteration in range(N_ROUNDS):
    # train on everything labelled so far (true labels and accepted pseudo-labels)
    labeler.fit(features[selection], prediction[selection])
    predicted_labels = labeler.predict(features)
    score = labeler.predict_proba(features)
    # probability the model assigns to its own predicted class; as in the
    # original, the predicted label is used directly as the column index
    confidence = score[np.arange(predicted_labels.shape[0]), predicted_labels.astype(int)]
    # accept a pseudo-label only for still-unlabelled inliers predicted with
    # probability above the cut-off
    accepted = (prediction == UNLABELLED) & (status == 1) & (confidence > CONFIDENCE_CUTOFF)
    prediction[accepted] = predicted_labels[accepted]
    # refresh the list of labelled samples
    selection = np.flatnonzero(prediction != UNLABELLED)

# --- step 4: final model labels everything that is still unlabelled -------------
labeler.fit(features[selection], prediction[selection])
predicted_labels = labeler.predict(features)
remaining = prediction == UNLABELLED
prediction[remaining] = predicted_labels[remaining]

# Accuracy over ALL samples, including those whose ground-truth label was
# copied into `prediction` in steps 1-2.
print("Annotation Accuracy = ", accuracy_score(data_class, prediction))

Annotation Accuracy =  0.8977324782409528
