# Testing k-means initialization

In [18]:
import pinot
import numpy as np
import matplotlib.pyplot
import pandas as pd
import seaborn
import torch

In [57]:
# Experiment configuration, declared as a command-line parser so the notebook
# uses the same option names as a script would. Later cells call
# ``parser.parse_args([])`` to obtain the defaults declared here.
import argparse

parser = argparse.ArgumentParser("HTS supervised learning")

# --- model and optimizer -----------------------------------------------------
parser.add_argument(
    '--regressor_type',
    type=str,
    default='gp',
    choices=["gp", "nn", "vgp"],
    help="Type of output regressor, Gaussian Process, Variational GP or Neural Networks"
)
parser.add_argument(
    '--lr',
    type=float,
    default=1e-4,
    help="learning rate of optimizer"
)
parser.add_argument(
    '--optimizer',
    type=str,
    default='Adam',
    help="Optimization algorithm"
)
parser.add_argument(
    '--data',
    type=str,
    default="mpro_hts",
    help="Labeled data set name"
)

parser.add_argument(
    '--n_epochs',
    type=int,
    default=500,
    help="number of training epochs"
)
# NOTE(review): values given on the command line are parsed as strings, while
# the default mixes ints and strings -- confirm the consumer accepts both.
parser.add_argument(
    '--architecture',
    nargs="+",
    type=str,
    default=[32, "tanh", 32, "tanh", 32, "tanh"],
    help="Graph neural network architecture"
)
parser.add_argument(
    '--cuda',
    action="store_true",
    default=False,
    help="Using GPU"
)

# --- output and logging ------------------------------------------------------
parser.add_argument(
    '--output',
    type=str,
    default="out",
    help="Name of folder to store results"
)
parser.add_argument(
    '--time_limit',
    type=str,
    default="200:00",
    help="Limit on training time. Format is hour:minute."
)
parser.add_argument(
    '--log',
    type=str,
    default="logs",
    help="Log file"
)

parser.add_argument(
    '--weight_decay',
    default=0.01,
    type=float,
    help="Weight decay for optimizer",
)

parser.add_argument(
    '--batch_size',
    default=32,
    type=int,
    help="Batch size"
)

# --- data selection ----------------------------------------------------------
# NOTE(review): nargs="+" yields a list when the flag is given, but the default
# is a scalar -- confirm which form the data loader expects.
parser.add_argument(
    '--sample_frac',
    nargs="+",
    type=float,
    default=0.005,
    help="Proportion of dataset to use"
)

# Each value must be converted on its own: ``type=list`` would turn the token
# "4" into ['4'], giving [['4'], ['1']] instead of [4, 1].
parser.add_argument(
    '--label_split',
    nargs="+",
    type=int,
    default=[4, 1],
    help="Training-testing split for labeled data"
)
parser.add_argument(
    '--index',
    type=int,
    default=1,
    help="Arbitrary index to append to logs"
)

# --- variational inference ---------------------------------------------------
parser.add_argument(
    '--annealing',
    type=float,
    default=1.0,
    help="Scaling factor on the KL term in the variational inference loss"
)
parser.add_argument(
    '--n_inducing_points',
    type=int,
    default=100,
    help="Number of inducing points to use for variational inference"
)
parser.add_argument(
    '--record_interval',
    type=int,
    default=50,
    help="Number of intervals before recording metrics"
)
# Used as a truthy flag by the data-processing cell (``if args.normalize:``).
parser.add_argument(
    '--normalize',
    type=int,
    default=0,
    help="Set to a nonzero value to standardize labels with the training-set mean and std"
)
parser.add_argument(
    '--fix_seed',
    action="store_true",
    default=False,
    help="Whether to fix random seed"
)

# --- filtering ---------------------------------------------------------------
parser.add_argument(
    '--filter_outliers',
    action="store_true",
    default=False,
    help="Whether to filter huge outliers."
)
parser.add_argument(
    '--filter_neg_train',
    action="store_true",
    default=False,
    help="Whether to filter negatives in the training set."
)
parser.add_argument(
    '--filter_neg_test',
    action="store_true",
    default=False,
    help="Whether to filter negatives in the testing set."
)
parser.add_argument(
    '--seed',
    type=int,
    default=0,
    help="Setting the seed for random sampling"
)
# The data-processing cell keeps only entries whose label is strictly greater
# than this value when --filter_outliers is set.
parser.add_argument(
    '--filter_threshold',
    type=float,
    default=-2.0,
    help="Labels at or below this value are dropped when --filter_outliers is set"
)

# --- regressor initialization ------------------------------------------------
# NOTE(review): --mu_mean has no help text, and the help text of --mu_std and
# --std_value looks copy-pasted from an unrelated option; their meaning is not
# visible in this notebook, so the strings are left as they are -- please fix.
parser.add_argument(
    '--mu_mean',
    type=float,
    default=0.0,
    help=""
)
parser.add_argument(
    '--mu_std',
    type=float,
    default=0.1,
    help="Epoch of training curve for pretrained representation; -1 means no pretraining"
)
parser.add_argument(
    '--std_value',
    type=float,
    default=-2,
    help="Epoch of training curve for pretrained representation; -1 means no pretraining"
)
parser.add_argument(
    '--initialize_k_means',
    type=int,
    default=1,
    help="Sets inducing points with k-means if equal to 1"
)

_StoreAction(option_strings=['--initialize_k_means'], dest='initialize_k_means', nargs=None, const=None, default=1, type=<class 'int'>, choices=None, help='Sets inducing points with k-means if equal to 1', metavar=None)

In [59]:
# Parse an empty argument list so every option takes its parser default, and
# expose the result as a plain dict. The next cell repeats this call before
# applying its overrides.
args = vars(parser.parse_args([]))

In [61]:
# Start from the parser defaults, then override the settings for this experiment
args = vars(parser.parse_args([]))
args['regressor_type'] = 'vgp'
args['sample_frac'] = 0.1
args['n_epochs'] = 350
args['filter_neg_train'] = True
args['filter_neg_test'] = True


class dotdict(dict):
    """Dictionary whose keys can also be read, set and deleted as attributes.

    Reading a key that does not exist raises ``AttributeError`` instead of
    returning ``None``, so a misspelled setting name fails loudly rather than
    silently behaving like a disabled option.
    """

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, so dict methods
        # such as ``keys`` are unaffected.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Wrap the settings so later cells can write ``args.lr`` instead of ``args['lr']``
args = dotdict(args)

Set up the compute device and the name of the results file.

In [62]:
# Pick the torch device: the first GPU when --cuda is set, otherwise the CPU
if args.cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu:0")

# Pieces of the architecture spec that end up in the results file name.
# NOTE(review): these positions (0 = layer type, 1 = units, 3 = activation,
# four entries per layer) do not line up with the parser's default
# architecture of [units, activation, ...] pairs -- confirm the intended layout.
n_layers = len(args.architecture) // 4
layer_type = args.architecture[0]
n_units = args.architecture[1]
activation = args.architecture[3]

# The seed is only used when --fix_seed is set
if args.fix_seed:
    seed = args.seed
else:
    seed = None

# File name that encodes the settings of this run
savefile = "".join([
    f'reg={args.regressor_type}_a={n_layers}x_{n_units}x',
    f'_{layer_type}_{activation}_n={args.n_epochs}_b={args.batch_size}',
    f'_wd={args.weight_decay}_lsp={args.label_split[0]}_frac={args.sample_frac}',
    f'_anneal={args.annealing}_induce={args.n_inducing_points}_normalize={args.normalize}',
    f'_{args.index}_seed={seed}_filterthreshold={args.filter_threshold}',
    f'_mumean={args.mu_mean}_mustd={args.mu_std}_stdvalue={args.std_value}',
    f'_filter_neg_train={args.filter_neg_train}_filter_neg_test={args.filter_neg_test}',
])

Get data.

In [31]:
# Look up the loader named by args.data in pinot.data and call it with the
# requested sample fraction.
data = getattr(pinot.data, args.data)(sample_frac=args.sample_frac)



Process data.

In [34]:
# Put the dataset on the selected device (GPU or CPU)
data = data.to(device)

# Optionally drop extreme outliers: keep entries whose label exceeds the threshold
if args.filter_outliers:
    data.ds = [entry for entry in data if entry[1] > args.filter_threshold]

# Split the labeled data into a training set and a test set
train_data, test_data = data.split(args.label_split, seed=seed)

# Optionally keep only strictly positive labels in either split
if args.filter_neg_train:
    train_data.ds = [entry for entry in train_data if entry[1] > 0.0]

if args.filter_neg_test:
    test_data.ds = [entry for entry in test_data if entry[1] > 0.0]


# Standardize the labels; both splits use the statistics of the training split
if args.normalize:
    # training split
    gs, ys_tr = zip(*train_data.ds)
    ys_tr = torch.cat(ys_tr).reshape(-1, 1)
    mean_tr = ys_tr.mean()
    std_tr = ys_tr.std()
    ys_norm_tr = (ys_tr - mean_tr) / std_tr
    train_data.ds = [(g, y) for g, y in zip(gs, ys_norm_tr)]

    # test split, still with the training mean and std
    gs, ys_te = zip(*test_data.ds)
    ys_te = torch.cat(ys_te).reshape(-1, 1)
    ys_norm_te = (ys_te - mean_tr) / std_tr
    test_data.ds = [(g, y) for g, y in zip(gs, ys_norm_te)]


# The exact GP uses the whole training set as one batch; other regressors
# use the configured batch size
if args.regressor_type == 'gp':
    batch_size = len(train_data)
else:
    batch_size = args.batch_size

Initialize network.

In [40]:
def get_net_and_optimizer(args):
    """ Build the network and its optimizer from the experiment settings.

    Reads ``architecture``, ``regressor_type``, ``n_inducing_points``,
    ``optimizer``, ``lr`` and ``weight_decay`` from ``args``, and moves the
    network to the notebook-level ``device``.

    Returns
    -------
    tuple
        ``(net, optimizer)``, where the optimizer is obtained by calling the
        result of ``pinot.app.utils.optimizer_translation`` on the network.
    """
    def _regressor_class(regressor_type):
        # Anything other than "gp" or "nn" falls back to the variational GP
        if regressor_type == "gp":
            return pinot.regressors.ExactGaussianProcessRegressor
        if regressor_type == "nn":
            return pinot.regressors.NeuralNetworkRegressor
        return pinot.regressors.VariationalGaussianProcessRegressor

    representation = pinot.representation.sequential.SequentialMix(
        args.architecture,
    )
    regressor_cls = _regressor_class(args.regressor_type)

    # Fully supervised network, used as the baseline
    net = pinot.Net(
        representation=representation,
        output_regressor_class=regressor_cls,
        n_inducing_points=args.n_inducing_points
    )

    # optimizer_translation returns a callable that takes the network
    make_optimizer = pinot.app.utils.optimizer_translation(
        opt_string=args.optimizer,
        lr=args.lr,
        weight_decay=args.weight_decay,
    )

    net.to(device)
    return net, make_optimizer(net)

# Build the supervised network and its optimizer from the current settings
sup_net, optimizer = get_net_and_optimizer(args)

In [63]:
# Inspect the regressor's inducing points (the `x_tr` parameter).
# NOTE(review): presumably these are still the values the regressor was
# constructed with, i.e. before any k-means initialization -- confirm.
sup_net.output_regressor.x_tr

Parameter containing:
tensor([[-0.7116, -0.8450, -0.9564,  ...,  0.7036,  0.7349, -0.9047],
        [ 0.6489, -0.9565,  0.3722,  ..., -0.9015,  0.2913,  0.7152],
        [ 0.3899, -0.9063,  0.5909,  ...,  0.6339,  0.6003,  0.1481],
        ...,
        [-0.5034,  0.6223, -0.7590,  ..., -0.3028,  0.7848, -0.8143],
        [ 0.2797, -0.1403, -0.8969,  ..., -0.1860, -0.8857, -0.3679],
        [-0.0528, -0.6206, -0.4435,  ...,  0.4205, -0.3902,  0.0913]],
       requires_grad=True)

In [None]:
def _initial_values_for_GP(train_dataset, feature_extractor, n_inducing_points,
                           max_samples=1000, steps=10):
    """ Extract features for a random subset of the training set.

    Assumes that both dataset and feature extractor
    are either cuda or not cuda.
    Also assumes the train_dataset is unbatched.

    Parameters
    ----------
    train_dataset : indexable with ``len``
        ``train_dataset[i][0]`` is passed to ``feature_extractor``.
    feature_extractor : callable
        Maps one input to a tensor that can be concatenated along dim 0.
    n_inducing_points : int
        Not used here; kept so the signature matches its callers.
    max_samples : int
        Upper bound on the number of randomly chosen entries.
    steps : int
        Number of chunks the chosen indices are split into.

    Returns
    -------
    torch.Tensor
        The extracted features of all chosen entries, concatenated along dim 0.

    Raises
    ------
    ValueError
        If ``train_dataset`` is empty.
    """
    n_available = len(train_dataset)
    if n_available == 0:
        raise ValueError("train_dataset is empty; cannot sample features from it")

    # Tensor.chunk may return fewer than `steps` chunks (e.g. 11 indices in
    # 10 steps give 6 chunks), so iterate over the chunks themselves instead
    # of indexing them with range(steps).
    index_chunks = torch.randperm(n_available)[:max_samples].chunk(steps)
    f_X_samples = []

    with torch.no_grad():
        for index_chunk in index_chunks:
            f_X_sample = torch.cat([
                feature_extractor(train_dataset[j.item()][0])
                for j in index_chunk
            ])
            f_X_samples.append(f_X_sample)

    return torch.cat(f_X_samples)

def _get_kmeans(f_X_sample, n_inducing_points):
    """ Cluster the sampled features and return the cluster centers.

    Fits a mini-batch k-means model with ``n_inducing_points`` clusters
    (batch size ten times that number) on a CPU/NumPy copy of ``f_X_sample``
    and returns the centers as a tensor created from the resulting array.

    NOTE(review): ``cluster`` is not imported in the visible notebook;
    presumably it is ``sklearn.cluster`` -- confirm and add the import.
    """
    estimator = cluster.MiniBatchKMeans(
        n_clusters=n_inducing_points,
        batch_size=n_inducing_points * 10,
    )
    estimator.fit(f_X_sample.cpu().numpy())
    return torch.from_numpy(estimator.cluster_centers_)

def initialize_inducing_points(train_dataset, feature_extractor, n_inducing_points):
    """ Compute initial inducing points for the variational GP model.

    Extracts features for a random sample of ``train_dataset`` with
    ``feature_extractor`` and returns the k-means centers of those features,
    one center per inducing point.
    """
    sampled_features = _initial_values_for_GP(
        train_dataset, feature_extractor, n_inducing_points
    )
    return _get_kmeans(sampled_features, n_inducing_points)

Set the inducing points using k-means on the output of the network's representation (the feature extractor).

In [65]:
# k-means centers of the representation's features, one per inducing point
init_induce_points = pinot.app.utils.initialize_inducing_points(
    train_dataset=train_data,
    feature_extractor=sup_net.representation,
    n_inducing_points=sup_net.output_regressor.n_inducing_points,
)



In [68]:
# Show the inducing points proposed by k-means
init_induce_points

tensor([[  7.1233, -19.1180, -11.8646,  ...,   8.5274,   1.2081,  -6.4313],
        [  9.8710, -24.8199, -16.1313,  ...,  10.8500,   4.6070,  -7.0693],
        [  5.0331, -13.6795,  -9.0649,  ...,   6.7591,   1.2434,  -4.4143],
        ...,
        [  7.8042, -19.8476, -12.8215,  ...,   8.7871,   3.2169,  -5.8018],
        [  9.4224, -24.2821, -16.0787,  ...,  10.1795,   5.5139,  -7.4783],
        [  7.1576, -18.9278, -13.0275,  ...,   9.6793,   2.7891,  -5.5595]])

In [76]:
# Show the inducing points again after a later run; the values differ from the
# earlier display.
# NOTE(review): presumably because neither the sampling nor the k-means fit is
# seeded -- confirm.
init_induce_points

tensor([[  8.6639, -23.6054, -15.4143,  ...,  11.5641,   2.2843,  -7.3783],
        [  5.6081, -15.6754, -10.3079,  ...,   8.1157,   0.4025,  -5.3210],
        [  7.8686, -20.1883, -12.5105,  ...,   8.4455,   2.7434,  -5.9046],
        ...,
        [  6.9526, -18.0050, -11.6376,  ...,   7.3911,   3.5004,  -5.6421],
        [ 11.4527, -27.4108, -18.0672,  ...,  10.5639,   6.9197,  -8.0093],
        [  7.0987, -19.2762, -12.9772,  ...,  10.0077,   1.7761,  -5.9514]])

In [69]:
# For a variational GP regressor, replace the inducing points with k-means
# centers. This has to happen before the training data is mini-batched.
if isinstance(sup_net.output_regressor,
              pinot.regressors.VariationalGaussianProcessRegressor):
    if args.initialize_k_means:
        init_induce_points = pinot.app.utils.initialize_inducing_points(
            train_dataset=train_data,
            feature_extractor=sup_net.representation,
            n_inducing_points=sup_net.output_regressor.n_inducing_points,
        )

        # Register the centers as the regressor's trainable inducing points
        sup_net.output_regressor.x_tr = torch.nn.Parameter(init_induce_points)
        sup_net.to(device)


# Mini-batch the training data (done for every regressor type; batch_size was
# chosen earlier)
train_data = train_data.batch(batch_size)

# Metrics to compute on the results
metrics = [
    pinot.pearsonr, pinot.absolute_error, pinot.y_hat,
    pinot.rmse, pinot.r2, pinot.avg_nll,
]



NameError: name 'time' is not defined

In [None]:
# Scratch copy of the feature-sampling loop from `_initial_values_for_GP`.
# It expects `train_dataset` and `feature_extractor` to be defined already.
steps = 10
indices = torch.randperm(len(train_dataset))[:1000].chunk(steps)
f_X_samples = []

with torch.no_grad():
    # chunk() may return fewer than `steps` chunks when there are few samples,
    # so loop over the chunks themselves rather than over range(steps).
    for index_chunk in indices:
        f_X_sample = torch.cat([
            feature_extractor(train_dataset[j.item()][0])
            for j in index_chunk
        ])
        f_X_samples.append(f_X_sample)