In [None]:
import copy
import warnings
import numpy as np 
import pandas as pd
from typing import List
import matplotlib.pyplot as plt
from numpy.random import choice
from scipy.cluster.hierarchy import linkage
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from scipy.cluster.hierarchy import dendrogram, linkage,to_tree
from collections import defaultdict
from scipy.cluster.hierarchy import ClusterNode
import warnings
warnings.filterwarnings('ignore')

In [None]:
import random

In [None]:
# do not edit this cell
# load the data files (download from the LMS)
embedded_images = np.load('images.npy')
labels = np.load('labels.npy')

# split into pool & testing
# (50/50 split; the fixed random_state makes the split reproducible)
X_pool, X_test, y_pool, y_test = train_test_split(embedded_images, labels, 
                                                  test_size=0.5, random_state=1234, shuffle=True)

# sample a seed set
np.random.seed(1234)
# label2id maps each class label to the pool indices carrying that label
label2id = defaultdict(list)
for i, label in enumerate(y_pool):
    label2id[label].append(i)
# seed_set collects 10 distinct pool indices per class, drawn without replacement
seed_set = []
for label, ids in label2id.items():
    seed_set.extend(np.random.choice(ids, size=10, replace=False))

In [None]:
# Integer-encode the pool labels: `labelset` holds the sorted distinct labels and
# `y_label[i]` is the position of `y_pool[i]` inside `labelset`, so the encoded
# values run from 0 to len(labelset) - 1.
labelset, y_label = np.unique(y_pool, return_inverse=True)

In [None]:
# Average-linkage agglomerative clustering of the whole pool; the resulting
# linkage matrix is passed to Cluster_adaptive_active_learning below.
mylinkage = linkage(X_pool,method="average")

In [None]:
def Cluster_adaptive_active_learning(X,y,max_size,mylinkage):
    """Run the cluster-adaptive active-learning loop and return the pruning history.

    Each iteration selects a node of the current pruning, queries the label of
    one point underneath it, propagates that label up the tree, refreshes the
    admissibility flags and scores, and then refines the pruning.

    Args:
        X: Pool features. Not used by the loop itself (the clustering is
            supplied through ``mylinkage``); kept for interface compatibility.
        y: Labels indexed by leaf id; ``y[z.id]`` is read as the answer to a
            query on leaf ``z``.
        max_size: Number of label queries to issue.
        mylinkage: Linkage matrix from which the ``myHS`` tree is built.

    Returns:
        list: The pruning (collection of node ids) obtained after each query,
        in query order. The original version built this list but never
        returned it, so every result of the run was lost.
    """
    Tree = myHS(mylinkage,y)
    # The initial pruning consists of the root only.
    P = np.array([Tree.rootnode.id])
    P_list = []
    queries = []
    while len(queries) < max_size:
        # Pick a node of the pruning, then a point below it to query.
        v = Tree.select_LB(P)
        z = Tree.rand_pick(v,queries)
        queries.append(z.id)
        label = y[z.id]
        # Push the new label up from the leaf to v, then refresh derived state.
        Tree.update_empirical_counts_proba(z,v,label)
        Tree.update_admissible()
        Tree.update_score()
        P = Tree.update_L_P(P)
        P_list.append(P)
    return P_list


In [None]:
# Issue 3000 label queries against the pool using the integer-encoded labels.
# NOTE(review): `myHS`, which this function instantiates, is defined in a LATER
# cell of this notebook; on a fresh kernel that cell must be executed before
# this one (or moved above it), otherwise this call fails with NameError.
Cluster_adaptive_active_learning(X_pool,y_label,3000,mylinkage)

In [None]:
# NOTE(review): `data_y` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - presumably a leftover from an earlier
# session; confirm and remove.
data_y

In [None]:
class myHS(object):
    """Per-node bookkeeping for cluster-adaptive (hierarchical-sampling) active learning.

    Wraps the tree returned by ``scipy.cluster.hierarchy.to_tree`` and keeps,
    for every node, the per-class counts of queried labels, the empirical
    class fractions, lower/upper confidence bounds, admissibility flags, a
    pruning score and the label currently assigned to the node.  All per-node
    arrays are indexed by node id (``nodelist[i].id == i``).
    """

    def __init__(self, linkage:np.ndarray, y, beta:float = 2):
        """Build the tree and allocate the per-node statistics.

        Args:
            linkage: Linkage matrix as accepted by
                ``scipy.cluster.hierarchy.to_tree``.
            y: Iterable of class labels; only its distinct values are used
                here, to size the per-class arrays.  Labels are later used as
                column indices, so they must be the integers
                ``0 .. n_class - 1``.
            beta: Slack factor of the admissibility test in
                ``update_admissible``.  Defaults to 2, the value that used to
                be hard-coded.
        """
        # Number of classes
        self.n_class = len(set(y))

        # Beta value used by the admissibility test
        self.beta = beta

        # Root node and the list of all nodes (position i holds the node with id i)
        self.rootnode, self.nodelist = to_tree(linkage, rd=True)

        # Add a parent reference to each node
        self.complete_tree(self.rootnode, self.nodelist)

        # Number of nodes in the tree
        self.n_nodes = len(self.nodelist)

        # Count of each class among the points sampled from a node
        self.label_count = np.zeros((self.n_nodes, self.n_class))

        # Fraction of each class among the points sampled from a node
        self.proba = np.zeros((self.n_nodes, self.n_class))

        # Number of points sampled from each node.  Stored as a column vector
        # from the start so it always broadcasts against the
        # (n_nodes, n_class) arrays; previously it silently changed from 1-D
        # to 2-D on the first update.
        self.n = np.zeros((self.n_nodes, 1))

        # Whether (node, label) is admissible
        self.admissible = np.zeros((self.n_nodes, self.n_class), dtype=bool)

        # Estimated error of labelling a node with a class (1 when not
        # admissible).  Initialised here so update_score / update_L_P can be
        # called before the first update_admissible.
        self.E_vl = np.ones((self.n_nodes, self.n_class))

        # Score of each node
        self.score = np.zeros(self.n_nodes)

        # Whether splitting the node into its children beats keeping it whole
        self.prune = np.zeros(self.n_nodes, dtype=bool)

        # Label currently assigned to each node
        self.label = np.zeros(self.n_nodes, dtype=int)

        # Upper and lower confidence bounds on the class fractions
        self.UB = np.zeros((self.n_nodes, self.n_class))
        self.LB = np.zeros((self.n_nodes, self.n_class))

    def complete_tree(self, rootnode, nodelist):
        """Attach a ``parent`` reference and a ``sampled`` counter to every node.

        The root's parent is ``None``.
        """
        rootnode.parent = None
        for node in nodelist:
            left, right = node.get_left(), node.get_right()
            if left is not None:
                left.parent = node
            if right is not None:
                right.parent = node
            node.sampled = 0

    def _leaf_nodes(self, node):
        """Return the leaf ClusterNodes below ``node`` in left-to-right order.

        Uses an explicit stack instead of recursion so that very deep
        (chain-like) dendrograms cannot exceed Python's recursion limit.
        """
        found = []
        stack = [node]
        while stack:
            current = stack.pop()
            if current.is_leaf():
                found.append(current)
            else:
                # Push right first so the left subtree is visited first.
                stack.append(current.right)
                stack.append(current.left)
        return found

    def _weighted_pick(self, P, bounds):
        """Draw one node id from ``P`` with weight ``count * (1 - bound)``.

        ``bound`` is the entry of ``bounds`` for the node and its currently
        assigned label.  When every weight vanishes (or the total is not
        finite) the draw falls back to weighting by node size, because
        ``random.choices`` cannot work with a zero or NaN total.
        """
        ids = np.asarray(P, dtype=int)
        sizes = np.array([self.nodelist[i].count for i in ids], dtype=float)
        assigned = self.label[ids].astype(int)
        weights = sizes * (1 - bounds[ids, assigned])

        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            weights = sizes
            total = weights.sum()
        return random.choices(P, weights=weights / total)[0]

    def select(self, P):
        """Pick a node of pruning ``P`` with weight ``count * (1 - UB[node, label])``."""
        return self._weighted_pick(P, self.UB)

    def select_LB(self, P):
        """Pick a node of pruning ``P`` with weight ``count * (1 - LB[node, label])``."""
        return self._weighted_pick(P, self.LB)

    def leaves(self, node, leaf_list=None):
        """Collect the leaf nodes below the node with id ``node``.

        Args:
            node: Id of the subtree root.
            leaf_list: Optional list to append to; a new list is created when
                omitted.

        Returns:
            list: ``leaf_list`` extended with the leaf ClusterNodes (not their
            ids) in left-to-right order.
        """
        if leaf_list is None:
            leaf_list = []
        leaf_list.extend(self._leaf_nodes(self.nodelist[node]))
        return leaf_list

    def rand_pick(self, node, queries, leaf_list=None):
        """Pick a random leaf below ``node``, preferring leaves not yet queried.

        Args:
            node: Id of the subtree root.
            queries: Ids of the leaves that have already been queried.
            leaf_list: Unused; kept for interface compatibility.

        Returns:
            ClusterNode: A uniformly drawn un-queried leaf of the subtree, or,
            when every leaf of the subtree has already been queried, a
            uniformly drawn leaf of that same subtree.
        """
        leaves_list = self.leaves(node)
        leaves_id = np.array([leaf.id for leaf in leaves_list], dtype=int)
        not_query = np.setdiff1d(leaves_id, np.asarray(queries, dtype=int))

        if len(not_query) > 0:
            # Previously the draw ignored `not_query` and sampled from all
            # leaves, so queried points were drawn again.
            return self.nodelist[random.choice(not_query.tolist())]
        # Everything below `node` is already labelled: stay inside the subtree
        # instead of returning an arbitrary earlier query from elsewhere.
        return random.choice(leaves_list)

    def update_empirical_counts_proba(self,
                          current,
                          v,
                          label):
        """Record one queried label on every node from ``current`` up to ``v``.

        Args:
            current: The queried leaf (ClusterNode or node id).
            v: The pruning node the leaf was drawn from (ClusterNode or id).
            label: Integer class index of the queried point.

        Afterwards ``proba`` is recomputed as ``label_count / n``; nodes that
        have never been sampled get NaN there.
        """
        current = self.nodelist[current] if not isinstance(current, ClusterNode) else current
        v = self.nodelist[v] if not isinstance(v, ClusterNode) else v

        # A parent's id is always larger than its children's (it is created by
        # a later merge), so walking upwards while id <= v.id visits exactly
        # the path from the leaf to v and then stops.
        while current is not None and current.id <= v.id:
            # (n_nodes x n_class) matrix: how often each node saw each label
            self.label_count[current.id][label] += 1
            # how many times each node has been sampled
            self.n[current.id] += 1
            current = current.parent

        # 0/0 for never-sampled nodes is intentional (NaN is mapped to the
        # trivial bounds in update_admissible); silence only that warning
        # locally rather than relying on a global warnings filter.
        with np.errstate(divide="ignore", invalid="ignore"):
            self.proba = self.label_count / self.n

    def update_admissible(self):
        """Recompute confidence bounds, admissibility flags and error estimates.

        ``delta = 1/n + sqrt(p (1 - p) / n)`` gives ``LB = max(p - delta, 0)``
        and ``UB = min(p + delta, 1)``.  A label ``l`` is admissible for a
        node when ``1 - LB[l] < beta * (1 - UB[l'])`` holds for every other
        label ``l'``.  ``E_vl`` is ``1 - p`` for admissible pairs and 1
        otherwise.
        """
        with np.errstate(divide="ignore", invalid="ignore"):
            delta = 1 / self.n + np.sqrt(self.proba * (1 - self.proba) / self.n)
            # fmax / fmin ignore NaN, so never-sampled nodes get the trivial
            # bounds LB = 0 and UB = 1.
            lower = np.fmax(self.proba - delta, 0)
            upper = np.fmin(self.proba + delta, 1)

        for l in np.arange(self.n_class):
            upper_others = np.delete(upper, l, axis=1)
            self.admissible[:, l] = np.all(
                (1 - lower[:, l][:, None]) < self.beta * (1 - upper_others), axis=1)

        self.E_vl = 1 - self.proba
        self.E_vl[~self.admissible] = 1

        self.UB = upper
        self.LB = lower

    def update_score(self):
        """Recompute ``score`` and ``prune`` for every node, children first.

        A node's own error is the smallest entry of its ``E_vl`` row.  For an
        internal node with at least one admissible label, the size-weighted
        sum of its children's scores replaces that error (and ``prune`` is
        set) when it is strictly smaller.  Iterating in id order is bottom-up
        because a parent's id is always larger than its children's.
        """
        for i, node in enumerate(self.nodelist):
            own_error = np.nanmin(self.E_vl[i])
            self.score[i] = own_error
            self.prune[i] = False

            if node.is_leaf() or not self.admissible[i, :].any():
                continue

            split_error = (
                (node.left.count / node.count) * self.score[node.left.id]
                + (node.right.count / node.count) * self.score[node.right.id]
            )
            if split_error < own_error:
                self.score[i] = split_error
                self.prune[i] = True

    def recursively_update_label(self, node, P):
        """Give every leaf below ``node`` the label currently assigned to ``P``.

        Both arguments may be ClusterNodes or node ids.  Despite the name the
        traversal is iterative, so deep trees cannot hit the recursion limit.
        """
        if not isinstance(node, ClusterNode):
            node = self.nodelist[node]
        if not isinstance(P, ClusterNode):
            P = self.nodelist[P]

        leaf_ids = [leaf.id for leaf in self._leaf_nodes(node)]
        self.label[leaf_ids] = self.label[P.id]

    def update_L_P(self, P):
        """Refine pruning ``P`` by one level and refresh the node/leaf labels.

        A node flagged in ``prune`` is replaced by its two children, which
        inherit the parent's admissible label with the smallest estimated
        error (previously simply the first admissible label).  Any other node
        is kept and labelled with the arg-min of its ``E_vl`` row.  Finally
        the label of every node in the new pruning is copied to the leaves
        below it.

        Returns:
            list: Node ids of the refined pruning.
        """
        P_prime = []
        for i in P:
            node = self.nodelist[i]
            candidates = np.where(self.admissible[i, :])[0]
            if self.prune[i] and len(candidates) > 0:
                label = candidates[np.argmin(self.E_vl[i, candidates])]
                P_prime.append(node.left.id)
                P_prime.append(node.right.id)

                self.label[node.left.id] = label
                self.label[node.right.id] = label
            else:
                P_prime.append(i)
                self.label[node.id] = np.nanargmin(self.E_vl[i])

        for p in P_prime:
            self.recursively_update_label(p, p)
        return P_prime

