# Combining Active Suggestions

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from time import time
from mclearn.experiment import ActiveExperiment, load_results, save_results
from mclearn.tools import log
from sklearn.externals import joblib
from matplotlib.ticker import FuncFormatter
%matplotlib inline
sns.set_style('white')
warnings.filterwarnings('ignore')  # Ignore annoying numpy warnings

## Experiment

Experiment Setup:

* 20-fold stratified shuffle-split cross-validation
* training pool size: 70% of data up to a maximum of 10,000 examples
* test pool size: the remaining examples up to a maximum of 20,000
* use logistic regression with Gaussian kernel approximation and L2 loss

In [None]:
# When False, every cell guarded by `if RUN_EXPERIMENTS:` is skipped, so the
# notebook only plots results that were saved by an earlier run.
RUN_EXPERIMENTS = False

# UCI datasets; the experiment cells read each one from data/<name>.csv.
uci_sets = ['glass', 'ionosphere', 'magic', 'miniboone',
            'pageblocks', 'pima', 'sonar', 'vehicle', 'wpbc',
            'yeast', 'semeion']
datasets = sorted(uci_sets + ['sdss'])

# Single policies. 'passive' is listed exactly once: a repeated entry would
# make run_expt execute the passive experiment twice and would feed a
# duplicated name into every loop over `methods`.
methods_al = ['passive', 'margin', 'w-margin', 'confidence',
              'w-confidence', 'entropy', 'w-entropy',
              'qbb-margin', 'qbb-kl']
# Policies grouped as "bandit" and "rank" in the plots below.
methods_bandits = ['thompson', 'ocucb', 'klucb', 'exp++']
methods_rank = ['borda', 'geometric', 'schulze']
methods = methods_al + methods_bandits + methods_rank

In [None]:
def run_expt(X, y, dataset, scale=True):
    """Run every policy in ``methods`` on one dataset, then the asymptote run.

    Parameters
    ----------
    X, y
        Features and targets, handed to ``ActiveExperiment`` unchanged.
    dataset : str
        Dataset name; logged first and passed to ``ActiveExperiment``.
    scale : bool
        Forwarded to ``ActiveExperiment`` as its ``scale`` argument.
    """
    log(dataset, end='')
    for policy_name in methods:
        # One dot is logged per policy before it starts.
        log('.', end='')
        ActiveExperiment(X, y, dataset, policy_name, scale,
                         n_splits=20).run_policies()

    # The asymptote run takes no policy name and leaves n_splits at the
    # ActiveExperiment default.
    ActiveExperiment(X, y, dataset, None, scale).run_asymptote()
    log('')

In [None]:
if RUN_EXPERIMENTS:
    # UCI datasets: one CSV per dataset under data/. Every column after the
    # first is used as a feature and the 'target' column as the label.
    for dataset in uci_sets:
        data_path = os.path.join('data', dataset + '.csv')
        data = pd.read_csv(data_path)
        X = data.iloc[:, 1:]
        y = data['target']
        run_expt(X, y, dataset)

    # SDSS is stored as HDF5 under the key 'sdss'. The features are the
    # columns to the right of 'class', and run_expt is called with scale=False.
    data_path = os.path.join('data', 'sdss.h5')
    data = pd.read_hdf(data_path, 'sdss')
    class_idx = data.columns.get_loc('class')
    X = data.iloc[:, (class_idx + 1):]
    y = data['class']
    run_expt(X, y, 'sdss', scale=False)

In [None]:
if RUN_EXPERIMENTS:
    # For each dataset, store under the 'max' result name the best value of
    # each measure: the larger of the asymptote and the highest point reached
    # by any policy's curve.
    measures = ['f1', 'accuracy', 'mpba']
    for dataset in datasets:
        maximum = {}
        for measure in measures:
            # Start from the asymptote saved for this measure.
            best = load_results(dataset, 'asymptote', 'asymptote_' + measure, True)
            for method in methods:
                # Load the curve of the measure being maximised. The previous
                # code always loaded 'mpba' here, so 'max_f1' and
                # 'max_accuracy' were derived from MPBA curves.
                # NOTE(review): assumes each policy's saved results contain
                # 'f1' and 'accuracy' entries as well as 'mpba' - confirm.
                curve = load_results(dataset, method, measure, True)
                best = max(best, max(curve))
            maximum['max_' + measure] = best
        save_results(dataset, 'max', maximum)

### No passive arm

In [None]:
def run_expt(X, y, dataset, scale=True):
    """Run every policy in ``methods`` on one dataset with ``passive=False``.

    This redefines the earlier ``run_expt``. It constructs each
    ``ActiveExperiment`` with ``passive=False`` and without ``n_splits``,
    and it performs no asymptote run.

    Parameters
    ----------
    X, y
        Features and targets, handed to ``ActiveExperiment`` unchanged.
    dataset : str
        Dataset name; logged first and passed to ``ActiveExperiment``.
    scale : bool
        Forwarded to ``ActiveExperiment`` as ``scale``.
    """
    log(dataset, end='')
    for policy_name in methods:
        # One dot is logged per policy before it starts.
        log('.', end='')
        experiment = ActiveExperiment(
            X, y, dataset, policy_name, scale=scale, passive=False)
        experiment.run_policies()

In [None]:
# From here on `methods` holds only the combining policies; the single
# policies (including 'passive') are no longer part of it.
methods = [
    # same names as methods_bandits
    'thompson', 'ocucb', 'klucb', 'exp++',
    # same names as methods_rank
    'borda', 'geometric', 'schulze',
]

In [None]:
if RUN_EXPERIMENTS:
    # Same data loading as the first experiment cell; run_expt is whichever
    # definition was executed most recently in the notebook.
    for dataset in uci_sets:
        data_path = os.path.join('data', dataset + '.csv')
        data = pd.read_csv(data_path)
        X = data.iloc[:, 1:]
        y = data['target']
        run_expt(X, y, dataset)

    # SDSS: HDF5 file read with the key 'sdss'. The features are the columns
    # to the right of 'class', and run_expt is called with scale=False.
    data_path = os.path.join('data', 'sdss.h5')
    data = pd.read_hdf(data_path, 'sdss')
    class_idx = data.columns.get_loc('class')
    X = data.iloc[:, (class_idx + 1):]
    y = data['class']
    run_expt(X, y, 'sdss', scale=False)

## Results

In [None]:
def calculate_strength(asymptote, passive, policy):
    """Return the strength ``1 - deficiency`` of a policy for each row.

    The deficiency is ``sum(asymptote - policy) / sum(asymptote - passive)``
    with both sums taken along axis 1, so ``passive`` and ``policy`` must be
    two-dimensional. ``asymptote`` only needs to broadcast against them.
    A row where ``policy`` equals ``passive`` gets strength 0.
    """
    policy_gap = np.sum(asymptote - policy, axis=1)
    passive_gap = np.sum(asymptote - passive, axis=1)
    return 1 - policy_gap / passive_gap

In [None]:
def plot_mpba_strength():
    """Draw one box plot of per-trial MPBA strengths for each dataset.

    For every dataset in ``datasets`` the unaveraged 'mpba' results of each
    method in ``methods`` are loaded, converted to strengths with
    ``calculate_strength`` (using the saved 'max_mpba' value and the passive
    results as references), and drawn as box plots ordered by decreasing
    median strength. Boxes are coloured by group: ``methods_al``,
    ``methods_bandits`` or ``methods_rank``. The figure is shown, not saved.
    """
    # Local import: only needed to recognise rectangle patches below.
    from matplotlib.patches import Rectangle

    # Drop repeated names while keeping their order, so no method is loaded
    # or melted twice.
    plotted = list(dict.fromkeys(methods))

    # The colour of each method does not depend on the dataset, so the
    # palette is built once.
    colours = sns.color_palette()
    palette_map = {
        **{m: colours[0] for m in methods_al},
        **{m: colours[2] for m in methods_bandits},
        **{m: colours[1] for m in methods_rank},
    }

    fig = plt.figure(figsize=(15, 20))
    fig.subplots_adjust(hspace=.6)
    for (i, dataset) in enumerate(datasets):
        results = {m: load_results(dataset, m, 'mpba', mean=False)
                   for m in plotted}
        # The passive results are the baseline of every strength. Load them
        # explicitly when `methods` does not contain 'passive' (it is
        # redefined without it earlier in the notebook); otherwise the
        # lookup below raises KeyError.
        if 'passive' not in results:
            results['passive'] = load_results(dataset, 'passive', 'mpba', mean=False)
        results['max'] = load_results(dataset, 'max', 'max_mpba')

        strength_dict = {
            m: calculate_strength(results['max'], results['passive'], results[m])
            for m in plotted
        }
        strength_df = pd.DataFrame(strength_dict)
        # Negating the medians makes the ascending sort put the strongest
        # method first.
        sorted_cols = (-strength_df.median()).sort_values().index
        strength_df = strength_df[sorted_cols]

        ax = fig.add_subplot(6, 2, i + 1)
        # Long format: one row per (trial, method) pair.
        strength_df.index.name = 'trial'
        strength_df = strength_df.reset_index()
        strength_df = strength_df.melt(id_vars=['trial'], value_vars=plotted)
        strength_df.loc[strength_df['variable'].isin(methods_al), 'type'] = 'single'
        strength_df.loc[strength_df['variable'].isin(methods_bandits), 'type'] = 'bandit'
        strength_df.loc[strength_df['variable'].isin(methods_rank), 'type'] = 'rank'
        # We could use hue='type' here, but I think there is a bug in seaborn
        # that squishes the boxplot, so the colours come from palette_map.
        sns.boxplot(data=strength_df, x='variable', y='value', order=sorted_cols,
                    width=0.5, palette=palette_map, ax=ax)
        ax.set_title(dataset)
#         ax.set_ylim(-0.4, 0.9)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, rotation_mode='anchor', ha='right')
        ax.xaxis.set_visible(True)

        # Set bar width, keeping each bar centred where it was. Only
        # rectangles have get_x/get_width/set_x/set_width; depending on the
        # matplotlib/seaborn versions the box artists may be other patch
        # types, which are skipped here.
        new_width = 0.5
        for bar in ax.patches:
            if not isinstance(bar, Rectangle):
                continue
            # The centre must be computed from the current width, not from
            # new_width; otherwise the left edge never moves.
            centre = bar.get_x() + bar.get_width() / 2.
            bar.set_x(centre - new_width / 2.)
            bar.set_width(new_width)

    #fig.savefig('strengths.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# Datasets used by the plots below. Compared with the experiment
# configuration at the top of the notebook, 'yeast' and 'semeion' are left out.
uci_sets = [
    'glass', 'ionosphere', 'magic', 'miniboone', 'pageblocks',
    'pima', 'sonar', 'vehicle', 'wpbc',
]
datasets = sorted([*uci_sets, 'sdss'])

In [None]:
# Draw the strength box plots for the current `datasets` and `methods`.
plot_mpba_strength()

In [None]:
def plot_learning_curves():
    """Plot averaged MPBA learning curves of four policies for each dataset.

    For every dataset in ``datasets`` the 'mpba' curves of 'passive',
    'confidence', 'borda' and 'exp++' are loaded with ``load_results`` and
    drawn on one subplot of a 4x3 grid. A dashed horizontal line marks the
    saved 'max_mpba' value. The figure is written to 'learning_curves.pdf'
    and then shown.
    """
    selected_methods = ['passive', 'confidence', 'borda', 'exp++']

    def format_as_percent_plot(x, pos):
        # Tick formatter: show a fraction as a whole-number percentage.
        return "{:.0f}%".format(x * 100)

    fig = plt.figure(figsize=(15, 20))
    for (i, dataset) in enumerate(datasets):
        learning_curves = {
            method: load_results(dataset, method, 'mpba', True)
            for method in selected_methods
        }
        maximum = load_results(dataset, 'max', 'max_mpba')
        # Curves are plotted against x = 50, 51, ..., so the last x value is
        # the curve length plus 49.
        sample_size = learning_curves['passive'].shape[0] + 49

        ax = fig.add_subplot(4, 3, i + 1)
        for method in selected_methods:
            xticks = np.arange(50, 50 + len(learning_curves[method]))
            ax.plot(xticks, learning_curves[method], label=method, linewidth=1)

        ax.legend(loc='lower right', frameon=True)
        ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
        ax.set_title(dataset)
        # tick_params expects a boolean here; the string 'off' is truthy.
        ax.tick_params(top=False)

        ax.plot([50, sample_size], [maximum, maximum], ls='--', color='#377eb8')
        ax.set_xlim(50, sample_size)
    fig.savefig('learning_curves.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# Draw the learning-curve grid for the current `datasets`.
plot_learning_curves()