In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

In [None]:
from sklearn.ensemble import BaggingClassifier
def qbb(X, y, model, rng):
    """Score unlabelled samples by committee disagreement (query by bagging).

    A ``BaggingClassifier`` of 100 copies of ``model`` is fitted on the
    labelled rows of ``X``. Every unlabelled row is then scored with the
    Kullback-Leibler divergence between each committee member's predicted
    class distribution and the ensemble's ``predict_proba`` output, averaged
    over the members. A larger score means the members disagree more.

    Parameters
    ----------
    X : ndarray
        Sample matrix; its first axis is indexed with the boolean mask of ``y``.
    y : numpy masked array
        Labels. Masked entries mark the unlabelled samples; the unmasked
        entries are used as training targets.
    model : estimator
        Base estimator handed to ``BaggingClassifier``; each fitted copy must
        provide ``predict_proba``.
    rng : object
        Forwarded to ``BaggingClassifier`` as ``random_state``.

    Returns
    -------
    X_unknown : ndarray
        The rows of ``X`` whose label is masked.
    fitness : ndarray
        One disagreement score per row of ``X_unknown``.
    """
    labelled = ~y.mask
    X_known = X[labelled]
    X_unknown = X[y.mask]

    clfs = BaggingClassifier(model, n_estimators=100, random_state=rng)
    clfs.fit(X_known, y[labelled].data)

    # The class labels are forced onto every fitted member before the
    # ensemble is asked for probabilities.
    # NOTE(review): presumably needed because the wrapped estimator does not
    # expose ``classes_`` itself; this hard-codes a binary {0, 1} problem.
    for es in clfs.estimators_:
        setattr(es, "classes_", [0, 1])
    pc = clfs.predict_proba(X_unknown)

    # Stack the per-member probabilities along a new leading axis.
    p = np.array([clf.predict_proba(X_unknown) for clf in clfs.estimators_])

    # KL(p || pc) with the convention 0 * log(0 / q) = 0. Where a member puts
    # zero mass on a class the ratio is replaced by 1, so the term becomes
    # 0 * log(1) = 0 instead of 0 * -inf = NaN.
    has_mass = p > 0
    ratio = np.where(has_mass, p, 1.0) / np.where(has_mass, pc, 1.0)
    fitness = np.mean(np.sum(p * np.log(ratio), axis=2), axis=0)

    return X_unknown, fitness

### QBB score visualisation

In [None]:
rng = np.random.RandomState(666)

# Draw n points uniformly from the square [-1, 1] x [-1, 1].
n = 3000
X = np.vstack([rng.uniform(-1, 1, n) for _ in range(2)]).T

# Two discs of positives: radius 0.3 around x1 and radius 0.2 around x2.
x1 = np.array([-0.5, 0.5])
x2 = np.array([0.5, -0.5])
dist_x1 = np.sqrt(np.sum((X - x1)**2, axis=1))
dist_x2 = np.sqrt(np.sum((X - x2)**2, axis=1))

# The discs are labelled 1 and 2 for now so they can be sampled separately.
y = np.zeros(n)
y[dist_x1 < 0.3] = 1
y[dist_x2 < 0.2] = 2

g1 = np.where(y == 1)[0]
g2 = np.where(y == 2)[0]
# Negatives lying within 0.5 of x1, i.e. close to the first disc.
neg = np.where(np.logical_and(y == 0, dist_x1 < 0.5))[0]
# Merge both discs into a single positive class.
y[y == 2] = 1
c = y.copy()

# Warm start: 9 positives from the first disc, 1 from the second and
# 10 negatives from the neighbourhood of the first disc.
warm_start_1 = g1[rng.choice(g1.shape[0], 9, replace=False)]
warm_start_2 = g2[rng.choice(g2.shape[0], 1, replace=False)]
warm_start_3 = neg[rng.choice(neg.shape[0], 10, replace=False)]

warm_start = set(np.hstack([warm_start_1, warm_start_2, warm_start_3]))
warm_idx = list(warm_start)

# True marks an unlabelled sample; only the warm-start rows stay labelled.
mask = np.ones(X.shape[0], dtype=bool)
mask[warm_idx] = False
# Colour code for the first panel: -1 warm start, 0 negative, 1 positive.
c[warm_idx] = -1
my = np.ma.array(y, mask=mask)


def make_grid_search():
    """Build the RBF-SVC grid search used for both the committee and the reference model."""
    return GridSearchCV(estimator=SVC(kernel='rbf', probability=True, C=1e6, gamma=100),
                        param_grid={"C": [1, 1e3, 1e6], "gamma": [0.1, 1, 10, 100]})


# `model` is the unfitted base estimator handed to qbb; `m` is a reference
# model fitted directly on the warm-start samples.
model = make_grid_search()
m = make_grid_search()

m.fit(X[~mask], y[~mask])
# NOTE(review): classes_ is set by hand on both grid searches, presumably
# because this GridSearchCV does not expose it - confirm.
setattr(m, "classes_", [0, 1])
setattr(model, "classes_", [0, 1])

uX, fitness = qbb(X, my, model, rng)

f = plt.figure(figsize=(15, 8))

ax1 = f.add_subplot(221)
ax1.scatter(X[:, 0], X[:, 1], c=c, cmap=plt.cm.bwr)
red_patch = mpatches.Patch(color='red', label='positive class')
white_patch = mpatches.Patch(color='white', label='negative class', ec='k')
blue_patch = mpatches.Patch(color='blue', label='warm start')
ax1.legend(handles=[red_patch, white_patch, blue_patch], bbox_to_anchor=(0.82, 1), loc=2)
ax1.set_title("warm start conditions")

ax2 = f.add_subplot(222)
cb = ax2.scatter(uX[:, 0], uX[:, 1], c=fitness, cmap=plt.cm.coolwarm)
ax2.set_title("QBB score")
f.colorbar(cb)

ax3 = f.add_subplot(223)
v = m.predict_proba(uX)[:, 1]
ax3.scatter(uX[:, 0], uX[:, 1], c=v/v.max(), cmap=plt.cm.coolwarm)
ax3.set_title("Model proba on unknown only")

ax4 = f.add_subplot(224)
# NOTE(review): this panel plots column 0 while the previous panel plots
# column 1 - confirm the difference is intentional.
v = m.predict_proba(X)[:, 0]
ax4.scatter(X[:, 0], X[:, 1], c=v/v.max(), cmap=plt.cm.coolwarm)
ax4.set_title("Model proba on all X")

In [None]:
# Walk through the QBB computation for one hand-picked query point, using a
# small committee so the intermediate values are easy to inspect.
X_known = X[~my.mask]
X_unknown = np.array([[1.0, 1.0]])

clfs = BaggingClassifier(model, n_estimators=10, random_state=rng)

clfs.fit(X_known, my[~my.mask].data)
# Same class-label override that qbb applies to its committee members.
for es in clfs.estimators_:
    setattr(es, "classes_", [0, 1])
pc = clfs.predict_proba(X_unknown)

# Single-argument print calls emit the same text under Python 2 and Python 3.
print("ensamble probs \n %s" % (pc,))

p = np.array([clf.predict_proba(X_unknown) for clf in clfs.estimators_])

print("probs per clf: \n %s" % (p,))

# KL(p || pc) averaged over the members, with 0 * log(0 / q) taken as 0 so a
# zero probability from one member cannot turn the score into NaN.
has_mass = p > 0
ratio = np.where(has_mass, p, 1.0) / np.where(has_mass, pc, 1.0)
fitness = np.mean(np.sum(p * np.log(ratio), axis=2), axis=0)

print("score per clf: \n")
# Left as the last expression so the notebook displays the list of scores.
[clf.score(X, y) for clf in clfs.estimators_]
