# COMP90051 Project 2
**Name:** *enter your name here*

**Student ID:** *your id here*

In [1]:
# You can add additional imports here
import copy
import numpy as np
from matplotlib import pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
from collections import defaultdict

## 0. Loading the dataset

In [2]:
# do not edit this cell
# load the data files (download from the LMS)
embedded_images = np.load('images.npy')
labels = np.load('labels.npy')  # 19280 excerpts

# split into pool & testing
# (test_size=0.5 puts half of the data in each part; random_state pins the shuffle)
X_pool, X_test, y_pool, y_test = train_test_split(embedded_images, labels, 
                                                  test_size=0.5, random_state=1234, shuffle=True)

# sample a seed set
# (the RNG is seeded first, then 10 distinct pool indices are drawn for every class label)
np.random.seed(1234)
label2id = defaultdict(list)
for i, label in enumerate(y_pool):
    label2id[label].append(i)  # ids in each class
seed_set = []
for label, ids in label2id.items():
    seed_set.extend(np.random.choice(ids, size=10, replace=False))

## 1. Applying logistic regression

In [3]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression(X, y, **args):
    """
    Train a logistic regression model on dataset (X, y) and return trained model.
    X: matrix of real values, size n x d
    y: vector of string labels, size n
    args: optional hyper-parameters, forwarded as keyword arguments to
          LogisticRegression (e.g. C=0.1). The solver defaults to
          'liblinear' and can be overridden with solver=...
    Returns the fitted classifier.
    """
    # Caller-supplied settings take precedence over the default solver.
    hyper_params = {'solver': 'liblinear', **args}
    clf = LogisticRegression(**hyper_params)
    clf.fit(X, y)
    return clf

In [4]:
def evaluate_logistic_regression_accuracy(Xt, yt, model):
    """
    Score a trained model on the labelled dataset (Xt, yt) and return the
    result as a scalar.
    Xt: matrix of real values, size m x d
    yt: vector of string labels, size m
    model: trained model exposing a score(X, y) method
    """
    # The accuracy computation is delegated entirely to the model's scorer.
    accuracy = model.score(Xt, yt)
    return accuracy

In [5]:
# your code here for training, evaluating & plotting results
from sklearn import preprocessing
# NOTE: a preprocessing.LabelEncoder pass over y_pool / y_test used to live
# here. It was retired because, after switching the solver, the label encoder
# is no longer needed. `preprocessing` is not referenced by any other live
# code in the visible part of this notebook.

# Train the baseline classifier on the entire pool.
model = train_logistic_regression(X_pool, y_pool)

In [7]:
# Score `model` (trained on the whole pool in the cell above) on the test split.
test_res = evaluate_logistic_regression_accuracy(X_test, y_test, model)
print(test_res)

0.6011410788381742


## 2. Active learning framework with Random selection

In [5]:
def random_select(X, model, **args):
    """
    Baseline selection function: assign every instance of the unlabelled
    dataset X (a matrix of n x d) a random priority. The model is accepted
    only for interface compatibility and is not used.
    Returns a 1-D array of n scores drawn uniformly from [0, 1) using
    numpy's global random state. Higher means better.
    """
    n_instances = len(X)
    return np.random.random_sample(n_instances)


In [27]:
def pool_based_active_learning(X_pool, y_pool, seed_ids,
                               train_func, select_func,
                               max_size, batch_size, **args):
    """
    Perform an active learning simulation, which starts by training on a seed set,
    then iteratively applies the selection function to rank instances in the pool,
    selects the top few instances which are included into the training set and the
    process repeats.
        X_pool: matrix of n x d
        y_pool: vector of string labels, size n
        seed_ids: initial labelled set, as a list of indices [0..n-1] into pool
                  (never modified; a copy is extended instead)
        train_func: function which given (X, y, optional args) returns a trained model
        select_func: function which given (X, model, optional args) returns a sequence
                     of n scores, higher meaning higher labelling priority
        max_size: stopping condition for active learning, when labelled data reaches given size
        batch_size: number of instances to be labelled in each iteration
        args: optional arguments passed to training and selection function
    returns a pair (model, labelled_ids): the model trained in the final round and
    the list of pool indices it was trained on. The labelled set never grows
    beyond max_size (the last batch is shortened if necessary). A model is always
    trained at least once, even when the seed set already meets max_size.
    """
    # Work on a private copy so the caller's seed list is left untouched.
    labelled_ids = list(seed_ids)

    while True:
        print(len(labelled_ids), end='  ')
        train_X = [X_pool[idx] for idx in labelled_ids]
        train_y = [y_pool[idx] for idx in labelled_ids]
        model = train_func(train_X, train_y, **args)

        # Stop right after training on the final labelled set, so the returned
        # ids are exactly the ones the returned model has seen.
        budget_left = max_size - len(labelled_ids)
        if budget_left <= 0:
            break

        # The whole pool is scored; instances that are already labelled are
        # filtered out by select_for_seed.
        scores = select_func(X_pool, model, **args)
        score_with_ids = [[score, idx] for idx, score in enumerate(scores)]

        added = select_for_seed(labelled_ids, score_with_ids,
                                min(batch_size, budget_left))
        if not added:
            # Nothing left to label (pool exhausted or non-positive batch size).
            break

    return model, labelled_ids


def select_for_seed(seed_ids, score_with_ids, batch_size):
    """
    Append to seed_ids, in place, up to batch_size of the highest-scoring
    instance ids that are not already in it.
        seed_ids: list of labelled indices, extended in place
        score_with_ids: list of [score, id] pairs; ties on score are broken in
                        favour of the larger id
        batch_size: maximum number of ids to add; values <= 0 add nothing
    returns the number of ids that were appended
    """
    if batch_size <= 0:
        return 0

    # A set gives O(1) membership tests; seed_ids itself keeps insertion order.
    already_labelled = set(seed_ids)
    added = 0
    for _, idx in sorted(score_with_ids, reverse=True):
        if idx in already_labelled:
            continue
        seed_ids.append(idx)
        already_labelled.add(idx)
        added += 1
        if added >= batch_size:
            break
    return added
    
        
        
    
        

In [28]:
# Active learning with random selection: 60 labels requested per round and
# max_size=600 as the stopping size.
# NOTE: despite the plural name, models_random receives a single model (the
# first element of the returned pair); seed1 receives the labelled indices.
batch = 60
max_size = 600
models_random, seed1  = pool_based_active_learning(X_pool, y_pool, seed_set, 
                                    train_logistic_regression, random_select, 
                                    max_size, batch)

300  360  420  480  540  600  

In [29]:
## your code here for evaluation of accuracy and plotting of results
# Test-set accuracy of the model returned by the random-selection run.
test_res = evaluate_logistic_regression_accuracy(X_test, y_test, models_random)
print(test_res)

0.3689834024896266


## 3. Uncertainty sampling

In [30]:
from scipy.stats import entropy
def logistic_regression_entropy_select(X, model, **args):
    """
    Given an unlabelled dataset X, a matrix of n x d, and a discriminative model
    P(y|x), returns a vector of n entropy values.
    X: matrix of real values, size n x d
    model: trained model exposing predict_proba(X)
    Returns a 1-D numpy array of length n holding the entropy (natural log) of
    each instance's predicted class distribution; larger values mean the model
    is less certain about that instance.
    """
    # One row per instance, one column per class.
    proba = np.asarray(model.predict_proba(X))
    # scipy's entropy reduces 2-D input along axis 0 by default, so the matrix
    # is transposed to obtain one entropy per instance in a single call.
    return entropy(proba.T)
    

In [31]:
# Active learning with entropy-based uncertainty sampling, using the same
# batch and max_size values as the random-selection run.
# NOTE: despite the plural name, models_us receives a single model (the first
# element of the returned pair); seed2 receives the labelled indices.
batch = 60
max_size = 600
models_us, seed2 = pool_based_active_learning(X_pool, y_pool, seed_set, 
                                 train_logistic_regression, 
                                 logistic_regression_entropy_select, 
                                 max_size, batch)

300  360  420  480  540  600  

In [34]:
## your code here for evaluation of accuracy and plotting of results
# Test-set accuracy of the model returned by the uncertainty-sampling run.
test_res = evaluate_logistic_regression_accuracy(X_test, y_test, models_us)
print(test_res)

0.3587136929460581


## 4. Query by committee

In [None]:
# don't forget to provide function descriptive comments, like those provided in templates above

def query_by_committee_vote_entropy(X, model, **args):
    """
    Placeholder for the vote-entropy query-by-committee selection function.
    Not implemented yet: the body is empty, so every call returns None.
    """
    pass

def query_by_committee_soft_vote_entropy(X, model, **args):
    """
    Placeholder for the soft-vote-entropy query-by-committee selection function.
    Not implemented yet: the body is empty, so every call returns None.
    """
    pass

def query_by_committee_KL(X, model, **args):
    """
    Placeholder for the KL-based query-by-committee selection function.
    Not implemented yet: the body is empty, so every call returns None.
    """
    pass

In [None]:
def train_committee(X, y, **args):
    """
    Placeholder for training a committee of models on dataset (X, y).
    Not implemented yet: the body is empty, so every call returns None.
    """
    pass

In [None]:
## your code here for training, evaluation, and plotting code

## 5. Hierarchical sampling

In [1]:
## your code