In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from tensorflow.keras.datasets import mnist
from tqdm import tqdm

import compare
from cfl.experiment import Experiment
from cfl.util.data_processing import one_hot_encode
from visual_bars.generate_visual_bars_data import VisualBarsData

%matplotlib inline

In [2]:
# paths
# base_path is the root under which the data, results and figure directories
# for this comparison live; TAG is appended to each directory name so that
# separate runs can be kept apart.
base_path = '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods'
TAG = ''
DATA_PATH = os.path.join(base_path, 'data_' + TAG)
RESULTS_PATH = os.path.join(base_path, 'results_' + TAG)
FIG_PATH = os.path.join(base_path, 'figures_' + TAG)

# Create whichever output directories are missing. exist_ok=True replaces the
# separate exists()/makedirs() pair, so there is no check-then-create race and
# the cell can be re-run safely.
for _out_dir in (DATA_PATH, RESULTS_PATH, FIG_PATH):
    os.makedirs(_out_dir, exist_ok=True)

In [3]:
# constants
n_samples = 5000   # sample count passed to the dataset generators below
random_state = 42  # seed passed to the dataset generators below
# dataset names, referenced by position (data_names[0] ... data_names[4])
data_names = [
    'blobs_const',
    'blobs_vard',
    'mnist',
    'vb_noise0.0',
    'vb_noise0.1',
]


In [4]:

# # CDE params for vis_bars datasets
# CNN_params = { # parameters for model creation
#                     'filters'         : [32, 16],
#                     'input_shape'     : (10, 10, 1),
#                     'kernel_size'     : [(3, 3)] *2,
#                     'pool_size'       : [(2, 2)] *2,
#                     'padding'         : ['same'] *2,
#                     'conv_activation' : ['softmax', 'softmax'],
#                     'dense_units'     : 16,
#                     'dense_activation' : 'softmax',
#                     'output_activation': 'softmax',

#                     # parameters for training
#                     'batch_size'  : 32,
#                     'n_epochs'    : 40,
#                     'optimizer'   : 'adam',
#                     'opt_config'  : {},
#                     'verbose'     : 2,
#                     'weights_path': None,
#                     'loss'        : 'mean_squared_error',
#                     'show_plot'   : True,
#                     'standardize' : False,
#                     'best'        : True,
#                 }

In [17]:
# helper functions

def save_data(dataset, data_name):
    """Write a dataset to the directory DATA_PATH/<data_name>/.

    Arguments:
        dataset : indexable pair; dataset[0] is saved as 'data_to_cluster.npy'
                  and dataset[1] as 'true_labels.npy'
        data_name : name of the sub-directory of DATA_PATH to write into
    Returns: None
    """
    out_dir = os.path.join(DATA_PATH, data_name)
    # makedirs(exist_ok=True) instead of mkdir: re-running a generation cell
    # overwrites the saved arrays rather than raising FileExistsError, and any
    # missing parent directory is created as well.
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, 'data_to_cluster.npy'), dataset[0])
    np.save(os.path.join(out_dir, 'true_labels.npy'), dataset[1])

def visualize_data(dataset, data_name):
    """Plot a dataset with either a histogram or a scatter helper from `compare`.

    Arguments:
        dataset : indexable pair; dataset[0] is a 2-D array of points and
                  dataset[1] is passed to the plotting helper alongside it
        data_name : dataset name, forwarded to compare.get_embedding and to
                    the plotting helper
    Returns: None (the figure is shown with plt.show())

    NOTE(review): for data with more than two columns the embedding comes from
    compare.get_embedding(DATA_PATH, data_name). A recorded run of this
    notebook failed there with FileNotFoundError on
    DATA_PATH/<data_name>/embedding.npy, so that helper presumably expects a
    precomputed embedding file -- confirm against the compare module.
    """
    points, labels = dataset[0], dataset[1]

    # get embedding
    if points.shape[1] > 2:
        embedding = compare.get_embedding(DATA_PATH, data_name)
    else:
        embedding = points

    # make subplot
    fig, ax = plt.subplots()

    # Histogram when the embedding has a single column, or when its entries
    # total one per row. np.isclose is used because an exact == on a
    # floating-point sum almost never holds for real-valued outputs.
    use_hist = (embedding.shape[1] == 1) or bool(
        np.isclose(np.sum(embedding), embedding.shape[0]))
    if use_hist:
        compare._hist_helper(ax, embedding, labels, data_name, subscript=None)
    else:
        compare._scatter_helper(ax, embedding, labels, data_name, subscript=None)

    plt.show()

def get_vb_data(n_samples, noise_lvl, random_state):
    """Build a VisualBarsData generator and return its three outputs.

    Arguments:
        n_samples : forwarded to VisualBarsData as n_samples
        noise_lvl : forwarded to VisualBarsData as noise_lvl
        random_state : forwarded to VisualBarsData as set_random_seed
    Returns:
        tuple (images, ground_truth, target) taken from getImages(),
        getGroundTruth() and getTarget() respectively
    """
    generator = VisualBarsData(
        n_samples=n_samples,
        noise_lvl=noise_lvl,
        set_random_seed=random_state,
    )
    # query the generator in the same order as before: images, ground truth, target
    images = generator.getImages()
    ground_truth = generator.getGroundTruth()
    target = generator.getTarget()
    return images, ground_truth, target

In [6]:
# The following blocks are written as stand-alone processes to generate each dataset. 
# Only run the ones you want.

In [7]:
# # uniform variance blobs

# data = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
# save_data(data, data_names[0])
# visualize_data(data, data_names[0])

In [8]:
# # varied variance blobs

# data = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5],  random_state=random_state)
# save_data(data, data_names[1])
# visualize_data(data, data_names[1])

In [9]:
# # mnist digits

# (train_X, train_y), (test_X, test_y) = mnist.load_data()
# train_X = np.reshape(train_X, (train_X.shape[0], train_X.shape[1]*train_X.shape[2]))

# # we only want n_samples points from mnist for now
# np.random.seed(random_state)
# idx = np.random.choice(range(train_X.shape[0]), n_samples, replace=False)
# data = [train_X[idx,:], train_y[idx,]]

# save_data(data, data_names[2])
# visualize_data(data, data_names[2])

In [10]:
# # causal mnist setup
# from cfl import util
# import random

# # Generate target (Y) data. Here 'a' and 'b' represent alpha and beta
# targets = ['a', 'b']

# distributions = [[0.95, 0.05], [0.05, 0.95], [0.5, 0.5]]

# def get_distribution(val):
#     if val <= 3:
#         return distributions[0]
#     elif val <= 6:
#         return distributions[1]
#     else:
#         return distributions[2]

# def generate_target(data):
#     target = []
#     for val in data:
#         target += (random.choices(targets, get_distribution(val)))
#     return np.array(target)

# # generate true Xmacro labels
# def get_Xmacro(Xmicro_label):
#     Xmacro = np.zeros(Xmicro_label.shape)
#     for xmli,xml in enumerate(Xmicro_label):
#         if xml <= 3:
#             Xmacro[xmli] = 0
#         elif xml <= 6:
#             Xmacro[xmli] = 1
#         else:
#             Xmacro[xmli] = 2
#     return Xmacro

#     # CDE parameters
# CNN_params = { # parameters for model creation
#                 'filters'         : [32, 16],
#                 'input_shape'     : (28, 28, 1),
#                 'kernel_size'     : [(3, 3)] *2,
#                 'pool_size'       : [(2, 2)] *2,
#                 'padding'         : ['same'] *2,
#                 'conv_activation' : ['softmax', 'softmax'],
#                 'dense_units'     : 500,
#                 'dense_activation' : 'softmax',
#                 'output_activation': 'softmax',
    
#                 # parameters for training
#                     'batch_size'  : 128,
#                     'n_epochs'    : 30,
#                     'optimizer'   : 'adam',
#                     'opt_config'  : {},
#                     'verbose'     : 2,
#                     'weights_path': None,
#                     'loss'        : 'categorical_crossentropy',
#                     'show_plot'   : True,
#                     'standardize' : False,
#                     'best'        : True,
#               }

In [11]:
# # causal mnist

# # load data
# (train_X, train_y), (test_X, test_y) = mnist.load_data()

# # Keep only data 1-9
# train_X = train_X[train_y != 0]
# train_y = train_y[train_y != 0]

# del test_X, test_y

# # We must convert the image values from [0, 255] to [0, 1] (by dividing by MAX_RGB) to speed up training
# MAX_RGB = 255
# train_X = np.true_divide(train_X, MAX_RGB)
# train_X = np.expand_dims(train_X, -1)
# macro_X = get_Xmacro(train_y)
# target_Y = util.data_processing.one_hot_encode(generate_target(train_y), targets)
# print(train_X.shape)
# print(train_y.shape)
# print(macro_X.shape)
# print(target_Y.shape)

# data_info = { 'X_dims' : train_X.shape, 
#               'Y_dims' : target_Y.shape, 
#               'Y_type' : 'categorical' } 
                         
# block_names = ['CondExpCNN']
# block_params = [CNN_params]

# save_path = '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods/tmp_cfl_results'
# my_exp = Experiment(X_train=train_X, Y_train=target_Y, data_info=data_info, 
#                     block_names=block_names, block_params=block_params, 
#                     blocks=None, results_path=save_path)

# results = my_exp.train()
# pyx = results['CondExpCNN']['pyx']

# # package data
# # we only want n_samples points from mnist for now
# np.random.seed(random_state)
# idx = np.random.choice(range(train_X.shape[0]), n_samples, replace=False)
# data = [pyx[idx,], macro_X[idx,]]
# save_data(data, 'causal_mnist')
# visualize_data(data, 'causal_mnist')

In [12]:
# # visual bars CDE output
# X, pyx_gt, Y = get_vb_data(n_samples, 0.0, random_state)

# # format data for CDE training
# X = np.expand_dims(X, -1)
# Y = one_hot_encode(Y, unique_labels=[0,1])

# data_info = {'X_dims': X.shape,
#              'Y_dims': Y.shape,
#              'Y_type': 'categorical'}

# block_names = ['CondExpCNN']
# block_params = [CNN_params]

# save_path = '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods/tmp_cfl_results'
# my_exp = Experiment(X_train=X, Y_train=Y, data_info=data_info, 
#                     block_names=block_names, block_params=block_params, 
#                     blocks=None, results_path=save_path)

# results = my_exp.train()
# pyx = results['CondExpCNN']['pyx']

# # package data
# data = [pyx, pyx_gt]
# save_data(data, data_names[3])
# visualize_data(data, data_names[3])

In [13]:
# # visual bars CDE output
# X, pyx_gt, Y = get_vb_data(n_samples, 0.1, random_state)

# # format data for CDE training
# X = np.expand_dims(X, -1)
# Y = one_hot_encode(Y, unique_labels=[0,1])

# data_info = {'X_dims': X.shape,
#              'Y_dims': Y.shape,
#              'Y_type': 'categorical'}

# block_names = ['CondExpCNN']
# block_params = [CNN_params]

# save_path = '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods/tmp_cfl_results'
# my_exp = Experiment(X_train=X, Y_train=Y, data_info=data_info, 
#                     block_names=block_names, block_params=block_params, 
#                     blocks=None, results_path=save_path)

# results = my_exp.train()
# pyx = results['CondExpCNN']['pyx']

# # package data
# data = [pyx, pyx_gt]
# save_data(data, data_names[4])
# visualize_data(data, data_names[4])

In [14]:
# # uniform variance blobs
# for n_features in np.arange(1000,7000,1000):
#     data = datasets.make_blobs(n_samples=n_samples, n_features=n_features, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)
#     save_data(data, f'blobs_const_{n_features}')
#     visualize_data(data, f'blobs_const_{n_features}')

In [15]:
# # mnist with varied resolution
# from skimage.transform import resize

# res = np.arange(10,120,20)
# for r,ri in tqdm(enumerate(res)):

#     (train_X, train_y), (test_X, test_y) = mnist.load_data()
#     del train_X, train_y

#     # we only want n_samples points from mnist for now
#     np.random.seed(random_state)
#     idx = np.random.choice(range(test_X.shape[0]), 10, replace=False)
#     test_X = test_X[idx,:]
#     test_y = test_y[idx,]

#     # vary image resolution and vectorize
#     test_X = np.array([resize(test_X[i], (r,r)) for i in range(test_X.shape[0])])
#     test_X = np.reshape(test_X, (test_X.shape[0], test_X.shape[1]*test_X.shape[2]))

#     # package it all up
#     data = [test_X, test_y]

#     save_data(data, f'mnist_{r}')
#     visualize_data(data, f'mnist_{r}')

In [25]:
# el nino cde output
# Trains a 'CondExpMod' block on the El Nino X/Y data through cfl's Experiment
# and saves the block's 'pyx' result as the dataset 'el_nino_pyx'.
import joblib
from sklearn.preprocessing import StandardScaler

# load data
cfl_path = '/Users/imanwahle/Desktop/cfl' # set this to your own cfl location
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load elnino_data.pkl from a trusted source.
X, Y, coords = joblib.load(os.path.join(cfl_path, 'data/el_nino/elnino_data.pkl'))
# NOTE(review): imshape and coords are not referenced again in this cell.
imshape = (55, 9)

# standardize data
# X and Y are each scaled with their own, independently fitted StandardScaler
X = StandardScaler().fit_transform(X)
Y = StandardScaler().fit_transform(Y)

# set all CFL parameters

# generic data parameters
data_info = { 'X_dims' : X.shape, 
              'Y_dims' : Y.shape,
              'Y_type' : 'continuous' } 

# CDE parameters
# NOTE(review): lr is assigned here but never placed into CDE_params below --
# confirm whether it was meant to be passed to the optimizer.
lr = 1e-4
CDE_params = { 'batch_size'  : 64,
               'optimizer'   : 'adam',
               'n_epochs'    : 40,
               'verbose'     : True,
               # width of the last layer is taken from the second dimension of Y
               'dense_units' : [1024, 1024, data_info['Y_dims'][1]],
               'activations' : ['linear', 'linear', 'linear'],
               'dropouts'    : [0.1, 0.1, 0.0],
}


# one block name with one matching parameter dict
block_names = ['CondExpMod']
block_params = [CDE_params]

# directory handed to Experiment as results_path
save_path = '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods/tmp_cfl_results'
my_exp = Experiment(X_train=X, Y_train=Y, data_info=data_info, 
                    block_names=block_names, block_params=block_params, 
                    blocks=None, results_path=save_path)

# train, then pull the 'pyx' entry out of the 'CondExpMod' results
results = my_exp.train()
pyx = results['CondExpMod']['pyx']

# package data
# placeholder labels: one -1 per row of pyx (presumably because no ground-truth
# labels exist for this dataset -- confirm)
pyx_gt = -1 * np.ones((pyx.shape[0],))
data = [pyx, pyx_gt]
save_data(data, 'el_nino_pyx')
# NOTE(review): in the recorded run this call raised FileNotFoundError for
# DATA_PATH/el_nino_pyx/embedding.npy; it looks like an embedding file has to
# exist before visualize_data can plot data with more than two columns -- confirm.
visualize_data(data, 'el_nino_pyx')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/imanwahle/Desktop/cfl/cfl/cluster_methods/compare_methods/data_/el_nino_pyx/embedding.npy'