# QBoost

これはAmplifyを用いたQBoostについてのデモです。

## QBoostとは

QBoostは機械学習の手法の一つで、アンサンブル学習（Ensemble Learning）といわれる分野に分類されます。より具体的には、ブースティングの思想を取り入れたバギングを行なっています。

アンサンブル学習は別々に学習させた分類器（Classifier）を組み合わせて推論させる学習方法のことで、独立に学習した分類器・回帰モデルの出力の多数決もしくは平均を取る手法（バギング）、過去に学習した分類器の出力を元に、学習済みの分類器の弱点を補うように新しい分類器を学習する手法（ブースティング）などがあります。

理論的には、バギングを使うことで予測のブレ（分散、バリアンス）を軽減することが期待でき、ブースティングを使用することで予測のズレ（偏り、バイアス）を減少させることが期待できます。バイアスとバリアンスがともに小さいモデルが理想ですが、この二つは互いにトレードオフの関係にあることが知られています。

アンサンブル手法の最も単純な実装は各分類器の出力の多数決をとるバギングですが、単純に多数決を取るだけではいわゆる"良くない"分類器が紛れ込んでいた場合に、思ったように精度が上がらないなどの問題があります。QBoostはそのような問題を解決するために生まれました。以下では、扱う問題の説明から始め、実際にAmplifyを使用して精度が向上することを確認します。

### デモで使用する手法について
- 多数決： 
 弱学習器の推論結果の多数決を全体の推論結果とする 
 `Run(Majority of vote)`ボタンで実行 
- QBoost(step 1)： 
 弱学習器の中から、ノイズとなるような精度が悪い分類器を取り除く 
 `Run(QBoost:step 1)`ボタンで実行 
- QBoost(step 2)： 
 QBoost(step 1)のアイデアに加えて、精度が悪い分類器を、データに重み付けをした上で再学習して更新する 
 `Run(QBoost:step 2)`ボタンで実行

## データセットについて


今回使用するデータセットは、`scikit-learn`に含まれている`iris_dataset`です。`iris_dataset`とは、計3種類のアヤメとそれに対応する4つの特徴量（花弁の長さ・幅、ガクの長さ・幅）が含まれたデータセットであり、この特徴量を用いてアヤメの分類を行うことが今回の目標です。つまり、教師あり学習による3クラス分類に取り組みます。このデータセットには150個分のアヤメのデータが含まれています。

以下が各種類のアヤメの写真です（Wikipediaから引用）。パッと見ただけではほとんど違いがわからないと思います。
![](./iris_pictures.png)

In [None]:
import itertools
import numpy as np
import pandas as pd
import scipy.stats as stats

from collections import defaultdict

from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC

from matplotlib import pyplot as plt

from amplify.client import FixstarsClient
from amplify import BinarySymbolGenerator, Solver, decode_solution

from IPython.display import display, clear_output
from ipywidgets import (
    Button,
    IntSlider,
    interactive_output,
    VBox,
    HBox,
    Output,
    Label,
    Accordion,
    IntProgress,
    GridBox,
    GridspecLayout,
)

# Module-level Fixstars client with its timeout parameter set to 1000
# (the time slider further down labels this quantity in milliseconds).
# NOTE(review): no API token is configured here; presumably it is supplied by
# the execution environment — confirm before running outside the demo.
# NOTE(review): solve() and qboost() build their own FixstarsClient, so this
# instance does not appear to be used by the functions in this file.
client = FixstarsClient()
client.parameters.timeout = 1000

In [None]:
def init(_num_classifiers=100, _timeout=1000):
    """Load the iris data, split it, and train the pool of weak classifiers.

    Every product of this function is published as a module-level global
    (data splits, +/-1 encoded labels, the classifier pool, cached predictions
    and the ``results`` table) because the other functions in this file read
    them directly.

    Args:
        _num_classifiers: Number of weak classifiers to train per class.
        _timeout: Solver time limit stored in the ``timeout`` global.
    """
    global iris_dataset
    global timeout
    global classifiers
    global num_classifiers
    global num_samples
    global num_classes
    global depth
    global X_train
    global X_test
    global X_valid
    global y_train
    global y_test
    global y_valid
    global y_train_tmp
    global y_test_tmp
    global y_valid_tmp
    global test_predictions_of_classifiers
    global valid_predictions_of_classifiers
    global results

    results = {
        "多数決": defaultdict(),
        "QBoost(step 1)": defaultdict(),
        "QBoost(step 2)": defaultdict(),
    }  # per-method result table, rendered later as a DataFrame

    timeout = _timeout
    iris_dataset = load_iris()
    data, label = np.array(iris_dataset["data"]), np.array(iris_dataset["target"])
    num_classes = len(set(label))
    if num_classes == 2:
        # With only two label values a single +/-1 classifier column suffices.
        num_classes = 1

    # 80/20 split into (train + validation) / test ...
    X_train, X_test, y_train_tmp, y_test_tmp = train_test_split(
        data, label, test_size=0.2, random_state=0
    )

    # ... then 75/25 of the remainder, i.e. 60/20/20 overall.
    X_train, X_valid, y_train_tmp, y_valid_tmp = train_test_split(
        X_train, y_train_tmp, test_size=0.25, random_state=0
    )

    # One row per class, one column per sample; -1 everywhere except the
    # true class, which is set to +1 below.
    y_train = np.full((num_classes, len(X_train)), -1)
    y_test = np.full((num_classes, len(X_test)), -1)
    y_valid = np.full((num_classes, len(X_valid)), -1)

    def updateLabel(y, tmp):
        """Write +1 into ``y`` at the true class of every sample in ``tmp``."""
        tmp = np.asarray(tmp)
        if y.shape[0] == 1:
            # Binary case: the single row encodes "label == 1" as +1.
            # Indexing y[label] here would go out of bounds for label 1.
            y[0, tmp == 1] = 1
        else:
            y[tmp, np.arange(len(tmp))] = 1

    updateLabel(y_train, y_train_tmp)
    updateLabel(y_test, y_test_tmp)
    updateLabel(y_valid, y_valid_tmp)

    # Fix the random seed so sampling below is reproducible.
    seed = 0
    np.random.seed(seed)

    # Create weak classifiers: classifiers[c][k] is the depth-limited tree
    # number c for class k.
    num_classifiers = _num_classifiers
    num_samples = 5
    depth = 1
    classifiers = [
        [DTC(splitter="random", max_depth=depth) for _ in range(num_classes)]
        for _ in range(num_classifiers)
    ]

    # Train each weak classifier on its own small random sample (drawn with
    # replacement) of the training set.
    for classifier in classifiers:
        for i in range(num_classes):
            sample_indices = np.random.choice(np.arange(X_train.shape[0]), num_samples)
            classifier[i].fit(X=X_train[sample_indices], y=y_train[i, sample_indices])

    # Cache predictions, indexed as [classifier, class, sample].
    test_predictions_of_classifiers = np.array(
        [
            [classifier[i].predict(X_test) for i in range(num_classes)]
            for classifier in classifiers
        ]
    )
    valid_predictions_of_classifiers = np.array(
        [
            [classifier[i].predict(X_valid) for i in range(num_classes)]
            for classifier in classifiers
        ]
    )
    print(f"number of classifiers {num_classifiers * num_classes}")

In [None]:
def plot_features(X, Y, fig=None, title=None):
    """Scatter-plot every pair of the four features, coloured by class.

    Six subplots (a 2x3 grid) are drawn, one per feature pair, with a
    distinct marker and colour for each of the three classes.

    Args:
        X: Feature matrix; columns 0-3 are indexed.
        Y: Class label per row of ``X`` (values 0, 1, 2 are plotted).
        fig: Figure to draw on. A new 12x8 figure is created when omitted.
        title: Optional title. The title is put on the last subplot and the
            figure is shown only when a title is given, as before.
    """
    feature_names = iris_dataset.feature_names
    if fig is None:
        fig = plt.figure(figsize=(12, 8))
    ax = None
    # Build the 2-D plots from every combination of two features.
    for i, (x, y) in enumerate(itertools.combinations(range(4), 2)):
        # Draw on the requested figure explicitly rather than on whatever
        # pyplot considers the current figure, so a passed-in `fig` is honoured.
        ax = fig.add_subplot(2, 3, i + 1)
        # Each species gets its own marker shape and colour.
        for t, marker, c in zip(range(3), ">ox", "rgb"):
            ax.scatter(
                X[Y == t, x],
                X[Y == t, y],
                marker=marker,
                c=c,
                label=iris_dataset.target_names[t],
            )
        # Axis labels depend only on the feature pair, so set them once.
        ax.set_xlabel(feature_names[x])
        ax.set_ylabel(feature_names[y])
    # Autoscale, legend and title apply to the last subplot only, as before.
    ax.autoscale()
    ax.legend()
    if title:
        ax.set_title(title)
        plt.show()

In [None]:
def predict_majority(Progress):
    """Evaluate a plain majority vote of all weak classifiers on the test set.

    Per-class and (for more than two classes) overall accuracies are printed
    and stored in ``results["多数決"]``; ``Progress.value`` is advanced along
    the way and the combined prediction is plotted.

    Args:
        Progress: Object whose ``value`` attribute is incremented as a
            progress indicator.
    """
    print(f"Total classifiers: {num_classifiers * num_classes}")
    Progress.value += 1

    majority = results["多数決"]
    for i in range(num_classes):
        # Sum the +/-1 votes of every classifier for this class; the sign of
        # the sum is the majority decision.
        votes = test_predictions_of_classifiers[:, i, :].sum(axis=0)
        predictions_vote = np.sign(votes)
        accuracy = metrics.accuracy_score(y_true=y_test[i], y_pred=predictions_vote)
        percent = accuracy * 100
        print(f"Majority vote of weak classifiers class {i}: {percent} %")
        majority[f"class {i}"] = percent
        Progress.value += 2

    if num_classes <= 2:
        return

    # Multi-class decision: the class with the largest vote total wins.
    vote_totals = np.sum(test_predictions_of_classifiers, axis=0)
    m_vote = np.argmax(vote_totals, axis=0)
    accuracy = metrics.accuracy_score(y_true=y_test_tmp, y_pred=m_vote)
    percent = accuracy * 100
    print(f"Majority vote of weak classifiers: {percent} %")
    majority["Total"] = percent
    majority["num classifiers"] = num_classes * num_classifiers
    Progress.value += 2
    plot_features(X_test, np.array(m_vote), title="Majority vote")

In [None]:
def preprocess(classifiers):
    """Create the optimisation variables and collect training predictions.

    Args:
        classifiers: Pool of weak classifiers, indexed ``[classifier][class]``.

    Returns:
        A pair ``(spins, predictions)``: a symbol array of shape
        ``(num_classifiers, num_classes)`` produced by
        ``BinarySymbolGenerator``, and an array holding every classifier's
        predictions on ``X_train``, indexed ``[classifier, class, sample]``.
    """
    # One decision variable per (classifier, class) pair.
    spins = BinarySymbolGenerator().array(shape=(num_classifiers, num_classes))

    # Predictions of every weak classifier on the training data.
    per_classifier = []
    for classifier in classifiers:
        per_classifier.append(
            [classifier[k].predict(X_train) for k in range(num_classes)]
        )
    return spins, np.array(per_classifier)

In [None]:
def createQUBO(spins, penalty, train_predictions_of_classifiers):
    """Build the objective to be minimised as a QUBO.

    The objective is the squared deviation of ``y_train * (sum of selected
    classifier outputs)`` from 1, summed over all classes and samples, plus
    ``penalty`` times the number of selected classifiers.

    Args:
        spins: Selection variables, indexed ``[classifier, class]``.
        penalty: Coefficient of the term that counts selected classifiers.
        train_predictions_of_classifiers: Training predictions, indexed
            ``[classifier, class, sample]``.

    Returns:
        The objective expression (or number, for numeric ``spins``).
    """
    # For every class and sample, add up the outputs of the selected
    # classifiers.
    selected_votes = np.expand_dims(spins, -1) * train_predictions_of_classifiers
    margin = y_train * selected_votes.sum(0)
    objective = ((margin - 1) ** 2).sum()

    # Penalise the number of classifiers in use.
    objective += penalty * spins.sum()
    return objective

In [None]:
def solve(f, spins, solver=None):
    """Minimise ``f`` with the Amplify solver and return the chosen classifiers.

    Args:
        f: Objective built by ``createQUBO``.
        spins: Selection variables used in ``f``, indexed
            ``[classifier, class]``.
        solver: Solver to use. When omitted, a new ``FixstarsClient`` based
            solver is created whose time limit is the module-level ``timeout``
            stored by ``init`` (falling back to 1000 if ``init`` has not run),
            so the time slider in the UI actually takes effect.

    Returns:
        A list with one entry per class: the indices of the classifiers whose
        variable is 1 in the best solution.

    Raises:
        RuntimeError: If the solver returned no solution.
    """
    if solver is None:
        client = FixstarsClient()
        client.parameters.timeout = globals().get("timeout", 1000)
        solver = Solver(client)
    # Solve the QUBO formulation.
    result = solver.solve(f)
    if len(result) == 0:
        # Fail with a clear message instead of an IndexError on result[0].
        raise RuntimeError("The solver returned no solution.")
    solution = decode_solution(spins, result[0].values)
    # Extract, per class, the indices of the selected classifiers.
    use_indices = [
        np.where(solution[:, class_idx] == 1)[0].tolist()
        for class_idx in range(num_classes)
    ]
    return use_indices

In [None]:
def compute_accuracy(use_indices, predictions_of_classifiers, y_true):
    """Combine the selected classifiers per class and score each class.

    Args:
        use_indices: Per class, the indices of the classifiers to use.
        predictions_of_classifiers: Predictions indexed
            ``[classifier, class, sample]``.
        y_true: True +/-1 labels indexed ``[class, sample]``.

    Returns:
        A triple ``(model_predictions, accuracy_each_class, accepted)``:
        the summed votes per class, the accuracy of ``sign(votes)`` per class,
        and a flag that is False when some class has no classifier selected.
        In that case processing stops at that class and the two arrays only
        cover the classes handled before it.
    """
    summed_votes = []
    per_class_accuracy = []
    accepted = True
    for class_idx in range(num_classes):
        chosen = use_indices[class_idx]
        if len(chosen) == 0:
            # A class without any classifier makes the whole result unusable.
            accepted = False
            break
        votes = np.sum(predictions_of_classifiers[chosen][:, class_idx, :], axis=0)
        summed_votes.append(votes)
        per_class_accuracy.append(
            metrics.accuracy_score(
                y_true=y_true[class_idx], y_pred=[np.sign(v) for v in votes]
            )
        )
    return np.array(summed_votes), np.array(per_class_accuracy), accepted

In [None]:
def model_output(model_predictions, valid_acc, y_true):
    """Pick, per sample, the class with the largest summed vote and score it.

    Args:
        model_predictions: Summed votes indexed ``[class, sample]``.
        valid_acc: Not used by this function.
        y_true: True class index per sample.

    Returns:
        A pair ``(predicted_classes, accuracy)``.
    """
    winners = np.argmax(model_predictions, axis=0)
    score = metrics.accuracy_score(y_true=y_true, y_pred=winners)
    return np.array(winners), score

In [None]:
def update_parameters(
    d_inner,
    best_idxs,
    use_idxs,
    classifiers,
    train_predictions_of_classifiers,
    valid_predictions_of_classifiers,
    test_predictions_of_classifiers,
):
    """Update the sample weights and retrain the classifiers marked in use_idxs.

    Args:
        d_inner: Sample weights indexed ``[sample, class]``; updated in place.
        best_idxs: Per class, indices of the currently selected classifiers.
        use_idxs: Array indexed ``[class, classifier]``; every non-zero entry
            marks a classifier to retrain.
        classifiers: Classifier pool indexed ``[classifier][class]``; the
            marked classifiers are refitted in place.
        train_predictions_of_classifiers: Training predictions indexed
            ``[classifier, class, sample]``.
        valid_predictions_of_classifiers: Validation predictions, same layout;
            rows of retrained classifiers are overwritten.
        test_predictions_of_classifiers: Test predictions, same layout; rows
            of retrained classifiers are overwritten.

    Returns:
        ``(d_inner, classifiers, valid_predictions_of_classifiers,
        test_predictions_of_classifiers)`` after the update.
    """
    # Update the weighting parameter d, one class at a time.
    for class_idx in range(num_classes):
        chosen = best_idxs[class_idx]
        class_weights = d_inner[:, class_idx]
        tiled_weights = np.broadcast_to(
            class_weights, (len(chosen), len(class_weights))
        ).copy()
        chosen_predictions = train_predictions_of_classifiers[chosen][:, class_idx, :]
        weighted_vote = np.sum(tiled_weights * chosen_predictions, axis=0)
        d_inner[:, class_idx] *= (y_train[class_idx] * weighted_vote - 1) ** 2
        total = np.sum(d_inner[:, class_idx])
        if total == 0:
            # Nothing to normalise; leave this column as it is.
            continue
        d_inner[:, class_idx] /= total

    # Refit every classifier flagged in use_idxs (the ones that were not
    # selected) on a fresh random sample, weighting the samples by d.
    flagged_classes, flagged_classifiers = np.nonzero(use_idxs)
    for class_idx, clf_idx in zip(flagged_classes, flagged_classifiers):
        classifier = classifiers[clf_idx][class_idx]
        sample_indices = np.random.choice(np.arange(X_train.shape[0]), num_samples)
        classifier.fit(
            X=X_train[sample_indices],
            y=y_train[class_idx, sample_indices],
            sample_weight=d_inner[sample_indices, class_idx],
        )
        # Refresh the cached predictions of the retrained classifier.
        test_predictions_of_classifiers[clf_idx][class_idx] = classifier.predict(
            X_test
        )
        valid_predictions_of_classifiers[clf_idx][class_idx] = classifier.predict(
            X_valid
        )

    return (
        d_inner,
        classifiers,
        valid_predictions_of_classifiers,
        test_predictions_of_classifiers,
    )

In [None]:
def simple_QUBO(Progress):
    """Run a single classifier-selection optimisation (QBoost step 1).

    Args:
        Progress: Object whose ``value`` attribute is incremented once the
            optimisation has finished.

    Returns:
        Per class, the indices of the selected classifiers.
    """
    # Coefficient of the penalty term; fixed heuristically for this demo.
    penalty = len(X_train) / num_classes / num_classifiers
    # Prepare the variables and the training predictions.
    spins, train_predictions = preprocess(classifiers)
    # Build the QUBO and solve it.
    objective = createQUBO(spins, penalty, train_predictions)
    selected = solve(objective, spins)
    Progress.value += 1
    return selected

In [None]:
def predict_simpleQUBO(use_indices):
    """Report test accuracy of the classifiers selected by QBoost step 1.

    Per-class and (for more than two classes) overall accuracies are printed
    and stored in ``results["QBoost(step 1)"]``, and the combined prediction
    is plotted.

    Args:
        use_indices: Per class, the indices of the selected classifiers.
    """
    # Accuracy on the validation data.
    _valid_pred, valid_acc, _valid_ok = compute_accuracy(
        use_indices, valid_predictions_of_classifiers, y_valid
    )
    # Accuracy on the test data.
    test_pred, test_acc, _test_ok = compute_accuracy(
        use_indices, test_predictions_of_classifiers, y_test
    )

    step1 = results["QBoost(step 1)"]
    for i, class_accuracy in enumerate(test_acc):
        percent = class_accuracy * 100
        print(f"QBoost(step 1) class {i}: {percent} %")
        step1[f"class {i}"] = percent

    if num_classes <= 2:
        return

    m_vote, accuracy = model_output(test_pred, valid_acc, y_test_tmp)
    percent = accuracy * 100
    print(f"QBoost(step 1): {percent} %")
    step1["Total"] = percent
    step1["num classifiers"] = sum(
        len(use_indices[k]) for k in range(num_classes)
    )
    plot_features(X_test, np.array(m_vote), title="QBoost(step 1)")

In [None]:
def qboost(Progress):
    """Run the iterative QBoost procedure (step 2).

    Each outer iteration sweeps a range of penalty coefficients, solves the
    classifier-selection problem for each, and keeps the selection with the
    best validation accuracy. The classifiers outside the best selection are
    then retrained with updated sample weights. The loop stops when the best
    accuracy no longer improves (or after ``MAX_ITER`` iterations).

    Args:
        Progress: Object whose ``value`` attribute is incremented as a
            progress indicator.

    Returns:
        ``(best_idxs, best_T)``: per class, the indices of the selected
        classifiers, and the total number of selected classifiers.

    Raises:
        RuntimeError: If no penalty value produced a selection that uses at
            least one classifier for every class.
    """
    global classifiers
    global test_predictions_of_classifiers
    global valid_predictions_of_classifiers
    base_penalty = len(X_train) / num_classes / num_classifiers

    MAX_ITER = 1000
    EPS = 1e-8

    # One solver is shared by every solve() call below; previously it was
    # created but never passed on, so its timeout had no effect.
    client = FixstarsClient()
    client.parameters.timeout = 500
    solver = Solver(client)

    prev_acc = -1
    best_acc = -1
    best_idxs = None
    best_T = -1
    best_lam = None

    # Initialisation: uniform sample weights and the penalty sweep.
    d_inner = np.full((len(X_train), num_classes), 1 / len(X_train))
    lam = np.arange(0.001, base_penalty / 10, 0.01)
    # use_idxs[class, classifier] == 1 marks a classifier to retrain.
    use_idxs = np.ones((num_classes, num_classifiers))
    Progress.value += 1

    # Fix the seed so retraining is reproducible.
    seed = 0
    np.random.seed(seed)

    for _iter in range(MAX_ITER):
        # Check that the sample weights are still normalised per class.
        assert (abs(np.sum(d_inner, axis=0) - 1) < EPS).all()
        for penalty in lam:
            print("iteration :", _iter, "lambda =", penalty)
            print(f"Best : {best_acc*100}%")
            # Prepare variables and the training-set predictions.
            spins, train_predictions_of_classifiers = preprocess(classifiers)
            # Build the QUBO model.
            f = createQUBO(spins, penalty, train_predictions_of_classifiers)
            # Solve and extract the selection.
            temporal_idxs = solve(f, spins, solver)
            # Accuracy per class label on the validation data.
            predictions_qboost, valid_acc, accept = compute_accuracy(
                temporal_idxs, valid_predictions_of_classifiers, y_valid
            )
            if not accept:
                # Some class ended up with no classifier: skip this penalty.
                continue
            if num_classes > 2:
                # Accuracy of the combined multi-class decision.
                m_vote, accuracy = model_output(
                    predictions_qboost, valid_acc, y_valid_tmp
                )
            else:
                # Single classifier column: its accuracy is the model
                # accuracy (previously `accuracy` was undefined here).
                accuracy = float(np.mean(valid_acc))

            # np.all also works when the comparison yields a plain bool,
            # where calling .all() on the result would raise.
            if np.all(accuracy >= best_acc):
                # Accuracy did not get worse: adopt this selection as best.
                best_acc = accuracy
                best_idxs = temporal_idxs
                best_T = sum(len(temporal_idxs[i]) for i in range(num_classes))
                best_lam = penalty
                # Mark exactly the classifiers outside the new best selection
                # for retraining; without the reset, members of earlier,
                # superseded selections would never be retrained.
                use_idxs = np.ones((num_classes, num_classifiers))
                for i in range(num_classes):
                    use_idxs[i, best_idxs[i]] = 0

        if best_idxs is None:
            raise RuntimeError(
                "No penalty value selected at least one classifier per class."
            )

        if np.all(prev_acc >= best_acc):
            # The best solution stopped improving: finish.
            print(
                "Finish!",
                f"Best acculacy = {best_acc*100}%, number of classifiers = {best_T}, best lambda = {best_lam}",
            )
            break

        prev_acc = best_acc

        # Update the sample weights and retrain the unused classifiers.
        (
            d_inner,
            classifiers,
            valid_predictions_of_classifiers,
            test_predictions_of_classifiers,
        ) = update_parameters(
            d_inner,
            best_idxs,
            use_idxs,
            classifiers,
            train_predictions_of_classifiers,
            valid_predictions_of_classifiers,
            test_predictions_of_classifiers,
        )
        Progress.value += 1
        Progress.value += 1
        clear_output()
    clear_output()
    return best_idxs, best_T

In [None]:
def predict_qboost(best_idxs, best_T):
    """Report test accuracy of the classifiers selected by QBoost step 2.

    Per-class and (for more than two classes) overall accuracies are printed
    and stored in ``results["QBoost(step 2)"]``, and the combined prediction
    is plotted.

    Args:
        best_idxs: Per class, the indices of the selected classifiers.
        best_T: Total number of selected classifiers.
    """
    _valid_pred, valid_acc, _valid_ok = compute_accuracy(
        best_idxs, valid_predictions_of_classifiers, y_valid
    )
    test_pred, test_acc, _test_ok = compute_accuracy(
        best_idxs, test_predictions_of_classifiers, y_test
    )

    step2 = results["QBoost(step 2)"]
    for i in range(num_classes):
        percent = test_acc[i] * 100
        print(f"QBoost(step 2) class {i}: {percent} %")
        step2[f"class {i}"] = percent
    print(f"Total classifiers: {best_T}")

    if num_classes <= 2:
        return

    m_vote, accuracy = model_output(test_pred, valid_acc, y_test_tmp)
    percent = accuracy * 100
    print(f"QBoost(step 2): {percent} %")
    step2["Total"] = percent
    step2["num classifiers"] = best_T
    plot_features(X_test, np.array(m_vote), title="QBoost(step 2)")

In [None]:
# Slider for the number of weak classifiers per class.
# The minimum is 1: with 0 classifiers the penalty coefficient
# (len(X_train) / num_classes / num_classifiers) divides by zero and the
# prediction arrays are empty.
classifier_slider = IntSlider(
    value=80,
    min=1,
    max=300,
    step=1,
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)
# Slider for the solver time limit (labelled in milliseconds below).
time_slider = IntSlider(
    value=1500,
    min=100,
    max=5000,
    step=50,
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)


# Label + slider pairs, grouped into a collapsed "Options" accordion.
options1 = [Label(value="クラスごとの弱分類器の数:"), classifier_slider]

options2 = [Label(value="制限時間 [ ms ] :"), time_slider]

options = [GridBox(options1), GridBox(options2)]
options = Accordion(children=[HBox(options)])
options.set_title(0, "Options")
options.selected_index = None  # start with the accordion closed

# Progress bar advanced by the run callbacks.
Progress = IntProgress(
    value=0,
    min=0,
    max=7,
    step=1,
    description="Solving...",
    bar_style="info",
    orientation="horizontal",
)
# One button per method offered by the demo.
qboost_run_btn = Button(
    description="Run(QBoost:step 2)",
    button_style="",
    tooltip="Run(QBoost:step 2)",
    icon="check",
)

simpleQUBO_run_btn = Button(
    description="Run(QBoost:step 1)",
    button_style="",
    tooltip="Run(QBoost:step 1)",
    icon="check",
)

majority_run_btn = Button(
    description="Run(Majority of vote)",
    button_style="",
    tooltip="Run(Majority of vote)",
    icon="check",
)


# Output areas for the computation log and the problem/result view.
Solve_out = Output()
Problem_out = Output()


def show_Iris_problem(num_classifiers, timeout, seed=0):
    """Rebuild the problem for the given settings and plot the true labels.

    Args:
        num_classifiers: Number of weak classifiers per class, passed to
            ``init``.
        timeout: Solver time limit, passed to ``init``.
        seed: Not used by this function.
    """
    Progress.value = 0
    init(_num_classifiers=num_classifiers, _timeout=timeout)
    plot_features(X_test, y_test_tmp, title="Ground Truth")


def display_table():
    """Show the results table and the ground-truth plot in ``Problem_out``."""
    with Problem_out:
        # Replace whatever was shown before.
        Problem_out.clear_output()
        # One row per method, one column per recorded metric.
        table = pd.DataFrame(results).T
        display("Results")
        display(table)
        plot_features(X_test, y_test_tmp, title="Ground Truth")


def show_Majority_result(btn):
    """Button callback: run the majority vote and refresh the results view.

    Args:
        btn: The clicked button (not used).
    """
    Progress.value = 0
    with Solve_out:
        Progress.value += 1
        Solve_out.clear_output()
        Progress.value += 1
        display("Computation log")
        predict_majority(Progress)
        # Large final step so the bar ends up full.
        Progress.value += 10
        display_table()


def show_Simple_result(btn):
    """Button callback: run QBoost step 1 and refresh the results view.

    Args:
        btn: The clicked button (not used).
    """
    Progress.value = 0
    with Solve_out:
        Progress.value += 1
        Solve_out.clear_output()
        Progress.value += 1
        display("Computation log")
        selected = simple_QUBO(Progress)
        Progress.value += 1
        predict_simpleQUBO(selected)
        # Large final step so the bar ends up full.
        Progress.value += 10
        display_table()


def show_QBoost_result(btn):
    """Button callback: run QBoost step 2 and refresh the results view.

    Args:
        btn: The clicked button (not used).
    """
    Progress.value = 0
    with Solve_out:
        Progress.value += 1
        Solve_out.clear_output()
        Progress.value += 1
        display("Computation log")
        selection, selection_size = qboost(Progress)
        Progress.value += 1
        # qboost() clears the output, so the heading is shown again.
        display("Computation log")
        predict_qboost(selection, selection_size)
        # Large final step so the bar ends up full.
        Progress.value += 10
        display_table()


# Bind show_Iris_problem to the two sliders; the widget returned by
# interactive_output replaces the Problem_out created earlier.
Problem_out = interactive_output(
    show_Iris_problem,
    {
        "num_classifiers": classifier_slider,
        "timeout": time_slider,
    },
)

# NOTE(review): Solve_out is bound to the same callback and sliders as
# Problem_out, so show_Iris_problem (and therefore init) presumably runs once
# for each of them on every slider change — confirm this is intended.
Solve_out = interactive_output(
    show_Iris_problem,
    {
        "num_classifiers": classifier_slider,
        "timeout": time_slider,
    },
)

# Connect each run button to its callback.
majority_run_btn.on_click(show_Majority_result)
simpleQUBO_run_btn.on_click(show_Simple_result)
qboost_run_btn.on_click(show_QBoost_result)

# 10 x 12 layout grid: row 0 holds the buttons and the progress bar, the
# remaining rows hold the two output areas side by side.
grid = GridspecLayout(10, 12)
right = 3  # first column of the right-hand (Solve_out) area

grid[1:, :right] = Problem_out
grid[0, 0] = majority_run_btn
grid[0, 1] = simpleQUBO_run_btn
grid[0, 2] = qboost_run_btn
grid[0, 3] = Progress
grid[1:, right:] = Solve_out


# Render the options accordion above the grid.
display(VBox([options, grid]))