In [None]:
#import jax.numpy as np
import numpy as np
import numpy as onp
import pandas as pd
#import jaxopt

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.random_projection import GaussianRandomProjection

In [None]:
from sklearn.datasets import load_svmlight_file

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
import matplotlib.pyplot as plt

In [None]:
from scipy.optimize import minimize as sp_minimize

In [None]:
import sys

In [None]:
import time

---
## Loading dataset

In [None]:
# Dataset prefix: the loading cells below read dataset_name+'_train.svm' and
# dataset_name+'_test.svm' from the working directory.
dataset_name='yeast'

In [None]:
# Read the multilabel training split. The features come back as a SciPy sparse
# matrix and are converted to a dense ndarray for the rest of the notebook.
X_train, y_train_ = load_svmlight_file(dataset_name + '_train.svm', multilabel=True)
X_train = X_train.toarray()
X_train.shape

In [None]:
# Read the multilabel test split. n_features is pinned to the training width:
# otherwise load_svmlight_file infers it from the highest feature index present
# in *this* file, and the two splits could end up with different column counts.
X_test, y_test_ = load_svmlight_file(
    dataset_name+'_test.svm', n_features=X_train.shape[1], multilabel=True
)
X_test = X_test.toarray()
assert X_test.shape[1] == X_train.shape[1], (X_test.shape, X_train.shape)
X_test.shape

In [None]:
# Turn the per-sample label collections into integer indicator matrices. The
# binarizer is fitted on the training labels only and reused for the test labels.
onehot_labeller = MultiLabelBinarizer()
y_train = onehot_labeller.fit_transform(y_train_).astype(int)
y_test = onehot_labeller.transform(y_test_).astype(int)

In [None]:
# Label ids cast to integers. Later cells use these values directly as column
# indices (e.g. y[i, a], p[a], r[a]) -- assumes they are exactly 0..K-1; TODO confirm.
labels = onehot_labeller.classes_.astype(int)
labels

In [None]:
# Histogram of the number of positive labels per training instance.
plt.hist(y_train.sum(axis=1))

FYI: loss of the null policy (always predict 0 for every label), i.e. the average number of positive labels per test instance.

In [None]:
# Average number of positive labels per test instance.
y_test.sum(axis=1).mean()

---

In [None]:
def micro_hammingloss(p,y):
    """Label-wise prediction error summed over labels and averaged over rows.

    Parameters
    ----------
    p : ndarray
        Predictions (hard 0/1 decisions or scores), same shape as ``y``.
    y : ndarray
        Ground-truth indicators; entries > 0 are positives, entries == 0 are
        negatives.

    Returns
    -------
    float
        ``(sum of p over mismatched negatives + sum of (1 - p) over mismatched
        positives) / p.shape[0]``. The total is divided by the number of rows
        only, not by the number of labels.
    """
    assert p.shape == y.shape
    mismatch = p != y
    # Entries where y is positive but p disagrees: each costs 1 - p.
    missed = (1 - p[mismatch & (y > 0)]).sum()
    # Entries where y is zero but p disagrees: each costs p.
    spurious = p[mismatch & (y == 0)].sum()
    return (spurious + missed) / p.shape[0]

In [None]:
def macro_hammingloss(test_probas, y_test):
    """Average over labels of ``micro_hammingloss`` computed one label at a time.

    Each label column of ``test_probas`` and ``y_test`` is reshaped to a
    ``(len(y_test), 1)`` column, scored with ``micro_hammingloss``, and the
    per-label scores are averaged.
    """
    n_rows = len(y_test)
    per_label = []
    for k in range(y_test.shape[1]):
        predicted_column = test_probas[:, k].reshape((n_rows, 1))
        true_column = y_test[:, k].reshape((n_rows, 1))
        per_label.append(micro_hammingloss(predicted_column, true_column))
    return onp.mean(per_label)

---
## Our Model

### CRM routines

In [None]:
def generate_crm_dataset(X, y, probas, n_samples=4, labels=labels):
    """Simulate logged bandit feedback by sampling actions from a policy.

    For every instance ``i``, ``n_samples`` actions are drawn from the
    distribution obtained by normalising ``probas[i, :]`` (cast to float32) so
    that it sums to one.

    Parameters
    ----------
    X : array-like
        Features, indexed per instance as ``X[i]``.
    y : ndarray
        Ground-truth indicators, indexed as ``y[i, a]``.
    probas : ndarray
        Unnormalised per-instance action scores, one column per entry of
        ``labels``.
    n_samples : int
        Number of actions drawn per instance.
    labels : array-like
        Candidate actions. NOTE: the default is bound to the global ``labels``
        at the time this cell is executed. The sampled value is used directly
        as a column index into ``probas`` and ``y`` -- assumes the labels are
        0..K-1; TODO confirm.

    Returns
    -------
    (P, A, R, F) : tuple of lists, each of length ``len(X) * n_samples``
        Propensity of the sampled action, sampled action, 0/1 reward
        (``y[i, a] > 0``) and the instance features.

    Raises
    ------
    ValueError
        If a row of ``probas`` sums to zero and so cannot be normalised.
    """
    assert len(X) == len(y) == len(probas), (len(X) , len(y) , len(probas))

    P = []
    A = []
    F = []
    R = []

    for i in range(len(probas)):
        # The sampling distribution depends only on the instance, so it is
        # normalised once here instead of once per drawn sample.
        # astype() copies, so the caller's `probas` is left untouched.
        p = probas[i,:].astype('float32')
        total = p.sum()
        if total == 0:
            raise ValueError(
                'cannot sample actions for instance %d: probabilities sum to 0' % i
            )
        p /= total
        x = X[i]

        for _ in range(n_samples):
            a = onp.random.choice(labels, p=p)
            A.append(a)
            P.append(p[a])
            F.append(x)
            R.append(int(y[i,a] > 0))

    assert len(P) == len(X) * n_samples

    return P, A, R, F

In [None]:
def arrayize_crm_dataset(P, A, R, F):
    """Convert the list-based CRM dataset into NumPy arrays.

    ``P``, ``A`` and ``R`` each become a column array of shape ``(len(P), 1)``;
    the feature rows in ``F`` are stacked vertically into one array.
    """
    n_rows = len(P)

    def as_column(values):
        return onp.array(values).reshape((n_rows, 1))

    return as_column(P), as_column(A), as_column(R), onp.vstack(F)

### Modeling

In [None]:
def build_action_embeddings(features, labels):
    """Build joint (instance, action) feature vectors.

    For every instance and every action the embedding is the instance's
    feature vector concatenated with a one-hot encoding of the action.

    Parameters
    ----------
    features : array-like
        One feature vector per instance (a 2-D array or a list of equal-length
        1-D arrays). A 1-D input is treated as one scalar feature per instance.
    labels : array-like of int
        Actions. Each value is used as the position of the 1 in the one-hot
        block, so values must be valid indices in ``range(len(labels))``.

    Returns
    -------
    ndarray of shape (n_instances, len(labels), n_features + len(labels))
    """
    features = onp.asarray(features)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    n_instances, n_features = features.shape
    n_actions = len(labels)

    # Row j is the one-hot encoding of labels[j].
    onehots = onp.zeros((n_actions, n_actions))
    onehots[onp.arange(n_actions), labels] = 1

    # Fill one preallocated array by broadcasting instead of calling hstack
    # once per (instance, action) pair in Python.
    result = onp.empty(
        (n_instances, n_actions, n_features + n_actions),
        dtype=onp.result_type(features.dtype, onehots.dtype),
    )
    result[:, :, :n_features] = features[:, onp.newaxis, :]
    result[:, :, n_features:] = onehots
    return result

In [None]:
def model_predict(parameter, queries, embeddings):
    """Softmax probability assigned by the model to one queried action per row.

    Parameters
    ----------
    parameter : ndarray, shape (D,)
        Linear weights applied to each action embedding.
    queries : array of int
        For each row of ``embeddings``, the index of the action whose
        probability is returned.
    embeddings : ndarray, shape (n, K, D)
        Action embeddings for each instance.

    Returns
    -------
    ndarray, shape (n,)
        ``softmax(embeddings @ parameter)[i, queries[i]]`` for every row ``i``.
    """
    scores = np.dot(embeddings, parameter)
    # Subtracting the row maximum leaves the softmax unchanged mathematically
    # but keeps exp() from overflowing to inf (which would give inf/inf = nan).
    scores = scores - np.max(scores, axis=1, keepdims=True)
    exponents = np.exp(scores)
    num = exponents[onp.arange(embeddings.shape[0]), queries]
    res =  num / np.sum(exponents, axis=1)
    return res

In [None]:
def model_predict_on_all_actions(parameter, embeddings):
    """Softmax distribution over all actions for every instance.

    Parameters
    ----------
    parameter : ndarray, shape (D,)
        Linear weights applied to each action embedding.
    embeddings : ndarray, shape (n, K, D)
        Action embeddings for each instance.

    Returns
    -------
    ndarray, shape (n, K)
        Row-wise ``softmax(embeddings @ parameter)``; every row sums to one.
    """
    scores = np.dot(embeddings, parameter)
    # Subtracting the row maximum leaves the softmax unchanged mathematically
    # but keeps exp() from overflowing to inf (which would give inf/inf = nan).
    scores = scores - np.max(scores, axis=1, keepdims=True)
    exponents = np.exp(scores)
    den = np.sum(exponents, axis=1).reshape(len(embeddings),1)
    res =  exponents / den
    return res

In [None]:
def iterate_model(beta, X, y, sampling_probas, prior_crm_dataset, samples_per_instance=4):
    """Run one CRM round: log new bandit feedback, then refit the policy.

    New samples are generated for ``(X, y)`` under ``sampling_probas``,
    appended to the previously logged data, and the parameter vector is
    refitted on the combined dataset by minimising a self-normalised
    importance-weighted estimate of the loss ``1 - R``.

    Parameters
    ----------
    beta : ndarray
        Starting point handed to the optimiser.
    X, y : ndarray
        Instances and ground-truth indicators to sample feedback for.
    sampling_probas : ndarray
        Per-instance action scores of the logging policy.
    prior_crm_dataset : tuple of four lists
        Previously logged ``(P, A, R, F)``. It is not modified: the combined
        dataset is returned as new lists, so a failure part-way through (for
        example in the optimiser) cannot leave the caller's data half-updated.
    samples_per_instance : int
        Number of actions sampled per instance.

    Returns
    -------
    (newbeta, (P, A, R, F))
        The optimiser's solution and the combined logged dataset.
    """
    prior_P, prior_A, prior_R, prior_F = prior_crm_dataset
    n_prior = len(prior_P)

    newP, newA, newR, newF = generate_crm_dataset(
        X, y, sampling_probas, n_samples=samples_per_instance
    )
    assert len(newP) == len(X)*samples_per_instance, (len(newP), len(X), samples_per_instance)

    # Build fresh lists instead of extending the caller's lists in place.
    P = list(prior_P) + list(newP)
    A = list(prior_A) + list(newA)
    R = list(prior_R) + list(newR)
    F = list(prior_F) + list(newF)

    assert len(P) == len(newP) + n_prior, (len(P), len(newP), n_prior)

    P_, A_, R_, F_ = arrayize_crm_dataset(P, A, R, F)
    phi = build_action_embeddings(F_, labels)

    # These do not depend on the candidate parameters, so compute them once
    # rather than on every objective evaluation.
    actions = A_.squeeze()
    propensities = P_.squeeze()
    losses = (1-R_).squeeze()

    def fn(candidate):
        pred = model_predict(candidate, actions, phi)
        W = pred / propensities  # importance weights: new policy / logging policy
        return np.sum(losses*W) / np.sum(W)

    # No gradient is supplied, so the optimiser is left to approximate it numerically.
    solution = sp_minimize(fn, beta, method='L-BFGS-B')
    newbeta = solution.x

    return newbeta, (P, A, R, F)

In [None]:
def evaluate_model(beta, phi_test, y_test, normalize=False, binarize=True):
    """Score parameter vector ``beta`` on a test set with ``micro_hammingloss``.

    The model's per-action probabilities for ``phi_test`` are optionally
    re-normalised row-wise (``normalize``) and optionally thresholded at 0.5
    into 0/1 decisions (``binarize``) before being compared with ``y_test``.
    """
    predictions = model_predict_on_all_actions(beta, phi_test)
    if normalize:
        row_totals = predictions.sum(axis=1).reshape((len(y_test), 1))
        predictions /= row_totals
    if binarize:
        predictions = (predictions > .5).astype(int)
    return micro_hammingloss(predictions, y_test)

----
## Baselines & Skylines

 ![Perf from CRM article](./basesky.png)

In [None]:
# Gaussian random projection of the features to 1000 components. random_state
# is pinned (as for train_test_split below) so the projected matrices are the
# same on every run.
fh = GaussianRandomProjection(n_components=1000, random_state=0)

X_train_h = fh.fit_transform(X_train)
X_test_h = fh.transform(X_test)
print(X_train_h.shape)

In [None]:
# Test loss of the null policy: an all-zeros prediction for every label.
print("pi_null micro test loss:", micro_hammingloss(np.zeros(y_test.shape), y_test))

In [None]:
# Feature-independent baseline: one DummyClassifier per label, fitted on the
# full training set. The prediction strategy is DummyClassifier's default,
# which depends on the installed scikit-learn version -- TODO confirm.
pi_dummy = MultiOutputClassifier(DummyClassifier())
pi_dummy.fit(X_train, y_train)

print("pi_dummy train loss:", micro_hammingloss(pi_dummy.predict(X_train), y_train))
print("pi_dummy test loss:", micro_hammingloss(pi_dummy.predict(X_test), y_test))

In [None]:
# Logging policy pi0: one logistic regression per label.
pi0 = MultiOutputClassifier(LogisticRegression(), n_jobs=6)

# test_size=.95 keeps only 5% of the training set (X_0, y_0) for fitting pi0;
# the remaining 95% (X_, y_) is not used in this cell.
X_0, X_, y_0, y_ = train_test_split(X_train, y_train, test_size=.95, random_state=0)
print('learning pi0 on', len(X_0), 'data points')
pi0.fit(X_0, y_0)

# Note: the "train" loss is computed on the whole X_train, not only on X_0.
print("pi0 train loss:", micro_hammingloss(pi0.predict(X_train), y_train))
# l0 is reused by the sequential CRM loop as the reference loss for pi0.
l0 = micro_hammingloss(pi0.predict(X_test), y_test)
print("pi0 test loss:", l0)

In [None]:
# Skyline pi*: one cross-validated logistic regression per label, fitted with
# full supervision on the whole training set.
pistar = MultiOutputClassifier(LogisticRegressionCV(max_iter=1000, n_jobs=6))
pistar.fit(X_train, y_train)

In [None]:
# lstar is reused by the sequential CRM loop as the reference loss for pi*.
print("pi* train loss:", micro_hammingloss(pistar.predict(X_train), y_train))
lstar = micro_hammingloss(pistar.predict(X_test), y_test)
print("pi* test loss:", lstar)

---
## Sequential CRM

In [None]:
# Joint (instance, action) embeddings: features concatenated with a one-hot
# action code, for every instance/label pair.
# NOTE(review): phi_train is not referenced by any later cell visible here --
# confirm whether it is still needed.
phi_test = build_action_embeddings(X_test, labels)
phi_train = build_action_embeddings(X_train, labels)

In [None]:
# Seed NumPy's global RNG so that the initial parameters and the action
# sampling performed by the cells below (onp.random.choice inside
# generate_crm_dataset) are reproducible when the notebook is run top to bottom.
onp.random.seed(0)

# One weight per input feature plus one per label, matching the
# [features, one-hot(action)] layout produced by build_action_embeddings.
beta_init = onp.random.normal(size=len(labels)+X_train.shape[1])
print('beta0 test loss:', evaluate_model(beta_init, phi_test, y_test))

In [None]:
# Sequential CRM experiment comparing two ways of collecting bandit feedback:
#   * static  -- actions are always sampled from the fixed logging policy pi0;
#   * dynamic -- after the first episode, actions are sampled from the model
#                learned so far (beta_dynamic).
# Both start from the same initial parameters and each keeps its own logged dataset.
beta_static = np.array(beta_init.copy())
beta_dynamic = np.array(beta_init.copy())

# Logged data as (P, A, R, F) tuples of lists, grown by iterate_model.
static_crm_dataset = ([],[],[],[])
dynamic_crm_dataset = ([],[],[],[])

batch = 1000
# NOTE(review): when len(X_train) is an exact multiple of `batch`, this adds a
# final episode whose slice contains no rows beyond the previous episode's.
n_episods = int(len(X_train) / batch)+1

t_end = t_start = time.time()
for episod in range(n_episods):
    t_end = time.time()
    
    start = episod*batch
    end = (episod+1)*batch
    # The reported time is t_end minus the previous iteration's t_start, i.e.
    # the duration of the previous episode.
    print('*'*10, 
          'episod: %d/%d' % (episod, n_episods), 
          'time: %ds' % (t_end - t_start), 
          '*'*10,
          file=sys.stderr)
    
    t_start = time.time()
    # NOTE(review): `start` is computed above but never used. The slices begin
    # at row 0, so every episode re-samples all rows seen so far and
    # iterate_model appends them to the already accumulated dataset. Confirm
    # whether X_train[start:end] was intended.
    X = X_train[0:end,:]
    y = y_train[0:end,:]
    
    # Static sampling scores: column 1 of each per-label predict_proba output,
    # stacked to (n_rows, n_labels). Presumably P(label = 1); this requires
    # every label to have two classes in pi0's training split -- TODO confirm.
    sampling_probas_static = pi0.predict_proba(X)
    sampling_probas_static = np.array([_[:,1] for _ in sampling_probas_static]).T
    if episod == 0:
        # No learned model yet: the dynamic arm also samples from pi0.
        sampling_probas_dynamic = sampling_probas_static
    else:
        phi_current = build_action_embeddings(X, labels)
        sampling_probas_dynamic = model_predict_on_all_actions(beta_dynamic, phi_current)

    # Static arm: log under pi0, refit, and report the test loss relative to
    # pi0 (l0) and pi* (lstar).
    beta_static, static_crm_dataset = iterate_model(
        beta_static, X, y, sampling_probas_static, static_crm_dataset
    )
    l_stat = evaluate_model(beta_static, phi_test, y_test)
    print('static   >', 
          'test loss: %.5f (vs pi0: %d%% vs pi*: %d%%)' % (l_stat, 100*l_stat/l0, 100*l_stat/lstar), 
          '|beta|=%.4f' % onp.sqrt((beta_static**2).sum()), 
          '|D_crm|:', len(static_crm_dataset[-1]),
          file=sys.stderr)

    # Dynamic arm: same procedure, but logging under the current learned policy.
    beta_dynamic, dynamic_crm_dataset = iterate_model(
        beta_dynamic, X, y, sampling_probas_dynamic, dynamic_crm_dataset
    )
    l_dyn = evaluate_model(beta_dynamic, phi_test, y_test)
    print('dynamic  >', 
          'test loss: %.5f (vs pi0: %d%% vs pi*: %d%%)' % (l_dyn, 100*l_dyn/l0, 100*l_dyn/lstar), 
          '|beta|=%.4f' % onp.sqrt((beta_dynamic**2).sum()), 
          '|D_crm|:', len(dynamic_crm_dataset[-1]),
          file=sys.stderr)
