In [1]:
import math
import torch
from matplotlib import pyplot as plt
%matplotlib inline

import os
import pandas as pd
from pandas import factorize
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

In [2]:
import re
# NOTE: machine-specific absolute path; every relative path used below is
# resolved against this working directory.
os.chdir('/Users/chenya68/Documents/GitHub/BFO')

# Design factors: first sheet of the workbook, columns at positions 1-4.
df_x = pd.read_excel(
    'data/harpoon-doe.xlsx',
    sheet_name=0,
    usecols=[1, 2, 3, 4],
)
# Collapse every run of characters outside [A-Za-z0-9Δ] in a header into '_'.
df_x.columns = [re.sub('[^A-Za-z0-9Δ]+', '_', header) for header in df_x.columns]
#print(len(df_x))
#df_x.head()

In [3]:
# The responses sit on the second sheet. Each replicate is read from its own
# triple of column positions, and sheet row 1 is skipped on every read.
df_y_1, df_y_2, df_y_3 = (
    pd.read_excel(
        'data/harpoon-doe.xlsx',
        sheet_name=1,
        usecols=replicate_cols,
        skiprows=lambda row: row in [1],
    )
    for replicate_cols in ([1, 4, 7], [2, 5, 8], [3, 6, 9])
)

# Sanitise the headers of replicate 1, then reuse them for replicates 2 and 3
# so that the three frames share identical column names.
df_y_1.columns = [re.sub('[^A-Za-z0-9Δ]+', '_', header) for header in df_y_1.columns]
df_y_2.columns = df_y_3.columns = df_y_1.columns

In [4]:
# Every (sanitised) factor name as read from the design sheet.
cols_feature01 = df_x.columns.tolist()
# Only the first two response columns are modelled; the third is left out.
cols_target = df_y_1.columns[:2].tolist()
# Factors that are handled as categorical variables.
cols_cate = ['Buffer_Type', 'Sugar_Salt', 'Additive']

In [5]:
# Long format: the three replicates stacked row-wise under a fresh 0..n-1 index.
total_df_y_long = pd.concat(
    [replicate[cols_target] for replicate in (df_y_1, df_y_2, df_y_3)],
    axis=0,
    ignore_index=True,
)
total_df_y_long

Unnamed: 0,_Dimer_HMW_,_Monomer
0,2.4,95.1
1,0.8,97.3
2,3.2,94.3
3,1.4,96.7
4,4.2,93.3
...,...,...
67,3.3,92.3
68,3.7,92.3
69,3.6,92.2
70,4.2,90.8


In [6]:
# Tag each replicate's columns with its 1-based replicate number.
# NOTE: not idempotent - re-running this cell appends the suffix again.
for replicate_no, replicate_df in enumerate((df_y_1, df_y_2, df_y_3), start=1):
    replicate_df.columns = [f'{name}_{replicate_no}' for name in replicate_df.columns]

In [7]:
# Wide format: one row per formulation, the first two responses of each
# replicate placed side by side.
cols_target_new = [
    '_Dimer_HMW_1', '_Monomer_1',
    '_Dimer_HMW_2', '_Monomer_2',
    '_Dimer_HMW_3', '_Monomer_3',
]

arr_Y = np.hstack([
    replicate.to_numpy()[:, :2]
    for replicate in (df_y_1, df_y_2, df_y_3)
])
total_df_y_wide = pd.DataFrame(arr_Y, columns=cols_target_new)

In [8]:
# Integer-encode each categorical factor into a "<name>_label" column, then
# discard the original categorical columns.
# NOTE: mutates df_x in place, so this cell cannot be re-run on its own.
for x_name in cols_cate:
    labels, categories = factorize(df_x[x_name])
    df_x[f"{x_name}_label"] = labels
df_x.drop(columns=cols_cate, inplace=True)
display(df_x.head())

Unnamed: 0,pH,Buffer_Type_label,Sugar_Salt_label,Additive_label
0,4.5,0,0,0
1,4.5,0,0,1
2,5.0,0,0,0
3,5.0,0,0,1
4,5.0,0,1,1


In [9]:
# ---- Catalogue of the values each experiment switch may take ----
ls_model = [
    'simpleGP',
    'multi-task-single-output',
    'multi-task-multi-output',
    'multi-task-multi-input-multi-output',
]
ls_x_scale = ['no-x-scale', 'x-minmax', 'x-stand', 'x-robust']
ls_y_scale = ['no-y-scale', 'y-minmax', 'y-stand', 'y-robust']
ls_cate_transform = ['label', 'ohe', 'LVGP', 'full-LMGP', 'partial-LMGP']
ls_remove_pred_outlier = [0, 1]
ls_output_rank_option = [1, 2]
ls_task_rank_option = list(range(1, 8))
ls_lik_rank_option = [0, 1, 2]
# 'mix':      combine all tasks first, then train/test split (could stratify by task)
# 'separate': train/test split first, then combine tasks
ls_split_option = ['mix', 'separate']
ls_stratify_task = ['not-stratify', 'stratify-x', 'stratify-y', 'stratify-xy']

# ---- Settings selected for this run ----
model_option = 'multi-task-multi-output'
x_scale_option = 'x-minmax'
y_scale_option = 'y-minmax'
cate_transform_option = 'partial-LMGP'

output_rank_option = 2  # if 0, no correlation between outputs
task_rank_option = 3    # if 0, no correlation between tasks
lik_rank_option = 1
split_option = 'mix'
stratify_option = 'stratify-x'

noise_option = 0.2  # noise percentage

# ---- Labels that make up the output folder name ----
model_label = model_option
x_scale_label = x_scale_option
y_scale_label = y_scale_option
cate_transform_label = f'cate_transform_{cate_transform_option}'
output_rank_label = f'output_rank_{output_rank_option}'
task_rank_label = f'task_rank_{task_rank_option}'
lik_rank_label = f'lik_rank_{lik_rank_option}'
split_label = split_option
stratify_label = stratify_option
# With noise disabled the label is empty, which leaves a trailing '-' in the
# joined folder name below.
noise_label = f'noise_{noise_option}' if noise_option > 0 else ''

folder_name = '-'.join([
    model_label,
    task_rank_label,
    output_rank_label,
    lik_rank_label,
    x_scale_label,
    y_scale_label,
    cate_transform_label,
    split_label,
    stratify_label,
    noise_label,
])

# Figures for this configuration are written below this folder; it is created
# (and the creation announced) only when it does not exist yet.
figPath = f'output/recreate-gp/{folder_name}'
if not os.path.exists(figPath):
    print(f'Creating folder {figPath}')
    os.makedirs(figPath, exist_ok=True)

Creating folder output/recreate-gp/multi-task-multi-output-task_rank_3-output_rank_2-lik_rank_1-x-minmax-y-minmax-cate_transform_partial-LMGP-mix-stratify-x-noise_0.2


In [10]:
split_label = 'mix'
if split_label == 'mix':
    # One private copy of the design matrix per replicate ("task"); each copy
    # is tagged with its task number before the copies are stacked.
    ls_X = [df_x.copy() for _ in range(3)]

    for i, tmp_df_x in enumerate(ls_X):
        tmp_df_x['task_ind'] = i

    df_X = pd.concat(ls_X).reset_index(drop=True)

In [11]:
# Build a noisy ("synthetic") copy of the inputs and targets.
# The global NumPy RNG is seeded so the draws below are reproducible; the
# result therefore depends on the exact order of the np.random.normal calls.
np.random.seed(42)
N = len(df_X)  # number of stacked rows; one noise sample is drawn per row
#percentage = 0.05
if noise_option>0:
    # df_X_syn / df_Y_syn are only defined when noise is enabled.
    # Every column except 'task_ind' (the integer-coded *_label columns
    # included) gets zero-mean Gaussian noise whose standard deviation is the
    # column's own std scaled by noise_option.
    df_X_syn = df_X.copy()
    # Columns are visited in the order returned by Index.difference
    # (presumably sorted - confirm against the pandas version in use).
    for col in df_X.columns.difference(['task_ind']):
        df_X_syn[col] = df_X_syn[col] + np.random.normal(0, df_X_syn[col].std(), N) * noise_option

    # Same perturbation for the targets. N comes from df_X, so this assumes
    # total_df_y_long has exactly as many rows as df_X - TODO confirm.
    df_Y_syn= total_df_y_long[cols_target].copy()
    for col in cols_target:
        df_Y_syn[col] = df_Y_syn[col] + np.random.normal(0, df_Y_syn[col].std(), N) * noise_option

In [12]:
multi_task_label = 'hier'

# Modelling set: the original rows plus, when noise is enabled, their noisy copies.
# (pd.concat keeps the original row labels, so the index holds duplicates.)
if noise_option > 0:
    df_total_X = pd.concat((df_X, df_X_syn))
    df_total_Y = pd.concat((total_df_y_long[cols_target], df_Y_syn))
else:
    # No copy is made here: the clustering below then writes into df_X itself.
    df_total_X = df_X
    df_total_Y = total_df_y_long

from sklearn.cluster import KMeans

# Map each (possibly noise-perturbed) label column back onto a fixed number
# of discrete levels by 1-D k-means; the cluster ids replace the column.
ls_n_clusters = [4, 2, 2]
for x_name, best_n_clusters in zip(
    ['Buffer_Type_label', 'Sugar_Salt_label', 'Additive_label'], ls_n_clusters
):
    km = KMeans(n_clusters=best_n_clusters, random_state=10)
    kmeans = km.fit(df_total_X[[x_name]])
    df_total_X[x_name] = kmeans.labels_

# 80/20 split; in the hierarchical setting the task proportions are preserved.
stratify_by = df_total_X['task_ind'] if multi_task_label == 'hier' else None
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_total_X,
    df_total_Y,
    test_size=0.2,
    random_state=0,
    stratify=stratify_by,
)

In [13]:
from sklearn.compose import ColumnTransformer
# Min-max scale every column that is neither a categorical label nor the task
# index; the excluded columns are passed through unchanged.
unscaled_cols = ['Buffer_Type_label', 'Sugar_Salt_label', 'Additive_label', 'task_ind']
scaled_cols = df_X_train.columns.difference(unscaled_cols)
xct = ColumnTransformer(
    [('x_mm_scaler', MinMaxScaler(), scaled_cols)],
    remainder='passthrough',
)

# The scaler is fitted on the training rows only and reused for the test rows.
scaled_X_train = xct.fit_transform(df_X_train)
scaled_X_test = xct.transform(df_X_test)

t_train_x, t_test_x = (torch.Tensor(arr) for arr in (scaled_X_train, scaled_X_test))

In [14]:
# Column bookkeeping for the scaled design matrix.
# presumably: categorical column position -> number of levels (matches the
# 4/2/2 cluster counts used earlier) - verify against the consumer.
qual_ind_lev = {
    1: 4,
    2: 2,
    3: 2,
}
# presumably the position(s) of the numeric column(s) - TODO confirm
quant_index = [0]
# presumably the position of the task-index column - TODO confirm
task_index = 4

In [17]:
y_scale_label = 'y-minmax'  # alternative: 'y-stand'

# Zero-filled placeholders: the per-task branch fills them row by row, every
# other branch simply rebinds the names.
scaled_y_train = np.zeros_like(df_y_train.to_numpy())
scaled_y_test = np.zeros_like(df_y_test.to_numpy())
ls_y_task_scaler, ls_row_idx_train, ls_row_idx_test = [], [], []

if y_scale_label in ('y-robust', 'y-minmax'):
    # One scaler fitted on all training targets, regardless of task.
    y_scaler = RobustScaler() if y_scale_label == 'y-robust' else MinMaxScaler()
    scaled_y_train = y_scaler.fit_transform(df_y_train)
    scaled_y_test = y_scaler.transform(df_y_test)
elif y_scale_label == 'y-stand':
    # Standardise each task separately; the scalers and the row positions they
    # cover are kept in the ls_* lists.
    for task_ind in range(3):
        train_mask = df_X_train['task_ind'] == task_ind
        test_mask = df_X_test['task_ind'] == task_ind
        row_idx_train = np.where(train_mask)[0]
        row_idx_test = np.where(test_mask)[0]
        ls_row_idx_train.append(row_idx_train)
        ls_row_idx_test.append(row_idx_test)

        y_task_scaler = StandardScaler()
        scaled_y_train[row_idx_train] = y_task_scaler.fit_transform(df_y_train[train_mask])
        scaled_y_test[row_idx_test] = y_task_scaler.transform(df_y_test[test_mask])
        ls_y_task_scaler.append(y_task_scaler)
else:
    # No scaling requested: use the raw values.
    scaled_y_train = df_y_train.to_numpy()
    scaled_y_test = df_y_test.to_numpy()

t_train_y = torch.Tensor(scaled_y_train)

In [18]:
import gpytorch
#from gpplus import kernels
#from gpplus.priors import LogHalfHorseshoePrior,MollifiedUniformPrior
from gpytorch.priors import NormalPrior,LogNormalPrior
from gpytorch.constraints import GreaterThan,Positive

class MultiOutputMultiTaskGP(gpytorch.models.ExactGP):
    """Exact GP over several outputs whose last input column is a task index.

    ``forward`` builds the covariance as the Kronecker product of
    ``data_kernel(x[..., :-1]) * task_kernel(x[..., -1])`` with the output
    kernel's covariance matrix, and a multitask mean with one constant mean
    per output.

    NOTE(review): ``LogHalfHorseshoePrior``, ``MollifiedUniformPrior`` and
    ``kernels`` are used below but are not imported by the visible code (the
    corresponding ``gpplus`` imports are commented out); they must be in scope
    before this class is instantiated.

    Args:
        train_X: training inputs; the last column is read as the task index.
        train_Y: training targets; the size of the last dimension is the
            number of outputs.
        data_kernel: either a ``gpytorch.kernels.Kernel`` instance or the name
            of a kernel class looked up on ``kernels``.
        noise_indices: accepted but not used by this class.
        fix_noise: if True, the likelihood's ``raw_noise`` is frozen and the
            noise is set to a fixed small value.
        lb_noise: second argument passed to the ``LogHalfHorseshoePrior``
            noise priors.
        task_rank: rank of the task ``IndexKernel``; defaults to the number of
            distinct task indices.
        output_rank: rank of the output ``IndexKernel``; defaults to the
            number of outputs.
        lik_rank: rank of the likelihood's task-noise covariance; defaults
            to 0.

    Raises:
        RuntimeError: if ``data_kernel`` is a string that cannot be turned
            into a kernel, or is neither a string nor a ``Kernel`` instance.
    """

    def __init__(
        self,
        train_X,
        train_Y,
        data_kernel,
        noise_indices,
        fix_noise:bool=False,
        lb_noise:float=1e-4,
        task_rank = None,
        output_rank = None,
        lik_rank = None
    ) -> None:

        num_outputs = train_Y.shape[-1]
        num_tasks = len(torch.unique(train_X[..., -1]))
        self._num_tasks = num_tasks
        self._num_outputs = num_outputs

        self.task_rank = task_rank if task_rank is not None else num_tasks
        self.output_rank = output_rank if output_rank is not None else num_outputs
        self.lik_rank = lik_rank if lik_rank is not None else 0
        # The likelihood's "tasks" are the model outputs, not the task index.
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=num_outputs,rank = self.lik_rank)
        super(MultiOutputMultiTaskGP, self).__init__(train_X, train_Y,likelihood)

        self.likelihood.register_prior('raw_noise_prior',LogHalfHorseshoePrior(0.01,lb_noise),'raw_noise')
        # Which task-noise parameter gets a prior depends on the likelihood rank.
        if self.lik_rank == 0:
            self.likelihood.register_prior('raw_task_noises_prior',LogHalfHorseshoePrior(0.01,lb_noise),'raw_task_noises')
        else:
            self.likelihood.register_prior('task_noise_covar_factor_prior',NormalPrior(0.,1),'task_noise_covar_factor')

        if fix_noise:
            self.likelihood.raw_noise.requires_grad_(False)
            self.likelihood.noise_covar.noise =torch.tensor(4.9901e-05)

        # One constant mean (with a standard-normal prior) per output.
        mean_list = [gpytorch.means.ConstantMean(NormalPrior(0,1)) for t in range(num_outputs)]
        self.mean_module = gpytorch.means.MultitaskMean(
            mean_list, num_tasks=num_outputs
        )

        self.data_kernel = data_kernel
        if isinstance(data_kernel,str):
            try:
                data_kernel_class = getattr(kernels,data_kernel)
                self.data_kernel = data_kernel_class(
                    # forward() strips the trailing task-index column before
                    # calling the data kernel, so ARD needs one lengthscale
                    # fewer than the raw input width.
                    ard_num_dims = self.train_inputs[0].size(-1) - 1,
                    lengthscale_constraint=Positive(transform=torch.exp,inv_transform=torch.log),
                )

                self.data_kernel.register_prior(
                    'lengthscale_prior',MollifiedUniformPrior(math.log(0.1),math.log(10)),'raw_lengthscale'
                )

            except Exception as exc:
                # Same RuntimeError as before, but KeyboardInterrupt/SystemExit
                # are no longer swallowed and the real cause stays attached.
                raise RuntimeError(
                    "%s not an allowed kernel" % data_kernel
                ) from exc

        elif not isinstance(data_kernel,gpytorch.kernels.Kernel):
            raise RuntimeError(
                "specified data kernel is not a `gpytorch.kernels.Kernel` instance"
            )

        # Kernel for GPLVM on mixed variables. NOTE: registered on the model
        # but not referenced by forward() below.
        self.data_kernel2 = gpytorch.kernels.RBFKernel()
        self.data_kernel2.register_prior(
                    'lengthscale_prior',MollifiedUniformPrior(math.log(0.1),math.log(10)),'raw_lengthscale'
                )

        # Index kernels modelling correlation across tasks and across outputs.
        self.task_kernel = gpytorch.kernels.IndexKernel(num_tasks=num_tasks, rank = self.task_rank) #default rank is 1
        self.output_kernel = gpytorch.kernels.IndexKernel(num_tasks=num_outputs, rank = self.output_rank) #default rank is 1

        self.task_kernel.register_prior("covar_factor_prior",NormalPrior(0.,1),lambda m: m._parameters['covar_factor'])
        self.task_kernel.register_prior("raw_var_prior",NormalPrior(0.,1),lambda m: m._parameters['raw_var'])

        self.output_kernel.register_prior("covar_factor_prior",NormalPrior(0.,1),lambda m: m._parameters['covar_factor'])
        self.output_kernel.register_prior("raw_var_prior",NormalPrior(0.,1),lambda m: m._parameters['raw_var'])


    def forward(self, x):
        """Return the prior ``MultitaskMultivariateNormal`` at ``x``.

        The last column of ``x`` is cast to long and fed to the task kernel;
        the remaining columns go to the data kernel. Their elementwise product
        is combined with the output kernel's covariance matrix through a
        Kronecker product.
        """
        mean_x = self.mean_module(x)
        task_term = self.task_kernel(x[..., -1].long())
        data_and_task_x = self.data_kernel(x[..., :-1]).mul(task_term)
        output_x = self.output_kernel.covar_matrix
        covar_x = gpytorch.lazy.KroneckerProductLazyTensor(data_and_task_x, output_x)
        return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)

    def predict(
        self,x:torch.Tensor,return_std:bool=False,include_noise:bool=False
    ):
        """Predict at ``x`` with the model conditioned on its training data.

        Switches the model and likelihood to eval mode and evaluates the
        predictive distribution without tracking gradients.

        Args:
            x: test inputs laid out like the training inputs (task index in
                the last column).
            return_std: accepted for interface compatibility; currently ignored.
            include_noise: accepted for interface compatibility; currently
                ignored - the likelihood is always applied.

        Returns:
            ``(mean, lower, upper)`` where ``lower``/``upper`` come from the
            predictive distribution's ``confidence_region()``.
        """

        self.eval()
        self.likelihood.eval()
        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            # Call the model itself, not forward(): ExactGP.__call__ conditions
            # on the training data in eval mode, whereas forward() only returns
            # the prior at x.
            pred_res = self.likelihood(self(x))
            mean = pred_res.mean
            lower, upper = pred_res.confidence_region()
        return mean, lower, upper