In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
    
from collections import Counter, defaultdict

from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import helpers_preorder as hp
import helpers_datasets as hd
import pandas as pd
from scipy.stats import sem
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Model-name prefixes. get_experiment_results joins each prefix with the
# ensemble size to build result keys such as "baseline_10" or "kan_10".
BASELINE = "baseline"  # paired with get_fit_rf_classifier (random forest)
KAN = "kan"  # paired with get_fit_kan_classifier (bagged hp.PreorderClassifier)

In [None]:
def get_fit_kan_classifier(n_estimators: int, Xtr: np.ndarray, ytr: np.ndarray):
    """Fit a bagged ensemble of ``hp.PreorderClassifier`` models.

    Each ensemble member is trained on a random subset of roughly 25% of the
    feature columns (never fewer than one column).

    Args:
        n_estimators: Number of ``hp.PreorderClassifier`` members in the bag.
        Xtr: Training features; ``Xtr.shape[1]`` is read as the feature count.
        ytr: Training labels, passed straight to ``BaggingClassifier.fit``.

    Returns:
        The fitted ``BaggingClassifier``.
    """
    # Constructor arguments for every PreorderClassifier in the ensemble.
    # Alternative f-learners tried here previously: ClassifierProbabilityLearner
    # with RandomForestClassifier, hp.LinearOrderingLossLearner and
    # hp.NetworkOrderingLossLearner.
    params = {
        "f": None,
        "verbose": False,
        "get_f_learner": lambda X: hp.ClassifierProbabilityLearner(clf_class=DecisionTreeClassifier),
        # Each call picks one of the two preorder kinds at random.
        "get_kind": lambda: np.random.choice([hp.PreorderClassifier.RAN, hp.PreorderClassifier.LAN]),
    }

    # int() truncates to 0 when there are fewer than four columns, which
    # BaggingClassifier rejects; always sample at least one feature.
    max_features = max(1, int(Xtr.shape[1] * 0.25))

    clf = BaggingClassifier(
        hp.PreorderClassifier(**params),
        n_estimators=n_estimators,
        max_features=max_features,
        n_jobs=10,
    )

    # NOTE(review): this relies on a scikit-learn implementation detail. The
    # ensemble base class reads `estimator_params` and copies the same-named
    # attributes from the ensemble onto each freshly built sub-estimator, so
    # the parameters are mirrored onto `clf` to make every member receive them.
    clf.estimator_params = list(params.keys())
    for name, value in params.items():
        setattr(clf, name, value)

    clf.fit(Xtr, ytr)
    return clf


def get_fit_rf_classifier(n_estimators: int, Xtr: np.ndarray, ytr: np.ndarray):
    """Build the random-forest baseline and fit it on the training split.

    Args:
        n_estimators: Number of trees in the forest.
        Xtr: Training features.
        ytr: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return RandomForestClassifier(n_estimators=n_estimators).fit(Xtr, ytr)


def _positive_class_scores(clf, X, hard_predictions):
    """Return continuous positive-class scores for ROC AUC, or the hard labels as a fallback."""
    if hasattr(clf, "predict_proba"):
        proba = np.asarray(clf.predict_proba(X))
        # Binary problem: the second column is the probability of the class
        # with the greater label, which is what roc_auc_score expects.
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
    return hard_predictions


def get_experiment_results(Xtr: np.ndarray, ytr: np.ndarray, Xte: np.ndarray, yte: np.ndarray):
    """Train the baseline and KAN ensembles at several sizes and score them on the test split.

    For each ensemble size in ``[10, 50, 100]`` and each model family
    (``BASELINE`` random forest, ``KAN`` bagged preorder classifier) a model is
    fitted on ``(Xtr, ytr)`` and evaluated on ``(Xte, yte)``.

    Args:
        Xtr: Training features.
        ytr: Training labels (binary).
        Xte: Test features.
        yte: Test labels (binary).

    Returns:
        A nested ``defaultdict`` keyed by ``"<model>_<n_estimators>"``; each
        entry holds ``"tpr"`` and ``"tnr"`` (from ``hp.true_positive_rate`` /
        ``hp.true_negative_rate`` on the hard predictions) and ``"roc_auc"``.
    """
    results = defaultdict(lambda: defaultdict(float))
    model_builders = [(BASELINE, get_fit_rf_classifier), (KAN, get_fit_kan_classifier)]
    for n_estimators in [10, 50, 100]:
        for model_name, get_fit_model in model_builders:
            model_key = "{}_{}".format(model_name, n_estimators)
            print(model_key)
            clf = get_fit_model(n_estimators=n_estimators, Xtr=Xtr, ytr=ytr)
            predictions = clf.predict(Xte)
            results[model_key]["tpr"] = hp.true_positive_rate(yte, predictions)
            results[model_key]["tnr"] = hp.true_negative_rate(yte, predictions)
            # ROC AUC needs a ranking score. Thresholded labels collapse the
            # curve to a single operating point, so use class probabilities
            # whenever the model exposes them.
            scores = _positive_class_scores(clf, Xte, predictions)
            results[model_key]["roc_auc"] = roc_auc_score(yte, scores)
    return results




In [None]:

# Binary task over two of the dataset's classes: each raw class id maps to the
# boolean target used for training and evaluation.
included_classes = {0: False, 6: True}  # Tshirt = 0, Shirt = 6

# NOTE(review): -1 presumably means "use every available image" — confirm in helpers_datasets.
X_mnist_train, y_mnist_train_raw, X_mnist_test, y_mnist_test_raw = hd.get_mnist_dataset(
    included_classes=included_classes.keys(),
    num_train_images=-1,
    num_test_images=-1,
)

# Translate the raw class ids into boolean labels.
y_mnist_train = np.array([included_classes[raw_label] for raw_label in y_mnist_train_raw])
y_mnist_test = np.array([included_classes[raw_label] for raw_label in y_mnist_test_raw])

mnist_results = get_experiment_results(
    Xtr=X_mnist_train,
    ytr=y_mnist_train,
    Xte=X_mnist_test,
    yte=y_mnist_test,
)
# One column per "<model>_<n_estimators>" key, one row per metric.
pd.DataFrame(mnist_results)