In [1]:
import os
import pandas as pd
import GPy
import numpy as np
from IPython.display import display
from scipy.stats import norm

In [2]:
# Read the space-separated training table; the matrix used below is its
# transpose, so each table column becomes one matrix row.

source_table = pd.read_csv("source_data.csv", sep=" ")
source_matrix = np.asfortranarray(np.transpose(source_table.values))

# Standardise each matrix column: subtract its mean and divide by its
# standard deviation (plus a small epsilon so the divisor is never zero).

epsilon = 1e-6
means = source_matrix.mean(axis=0)
stds = source_matrix.std(axis=0) + epsilon

normed_source_matrix = (source_matrix - means) / stds
normed = normed_source_matrix

# Set parameters and make output dir

# Column index of the feature that is split out and predicted further below.
gene_number = 2

# Each feature gets its own zero-padded sub-directory, e.g. model_00002.
feature_model_format = 'model_{0:05d}'
output_dir = os.path.join("gpy_feature_models", feature_model_format.format(gene_number))

# makedirs (unlike mkdir) also creates the missing "gpy_feature_models" parent,
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(output_dir, exist_ok=True)

# Split out feature to predict using all other features

# Boolean mask over the matrix columns that is True only at gene_number.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
is_i = np.isin(np.arange(normed_source_matrix.shape[1]), gene_number)
assert is_i.any(), "gene_number does not select any column of the training matrix"

data_x_train = normed_source_matrix[:, ~is_i]
data_y_train = normed_source_matrix[:, is_i]

# Kernel input dimension = number of predictor columns actually kept. With one
# feature held out this equals source_table.shape[0] - 1, but deriving it from
# data_x_train guarantees it always matches the matrix handed to the model.
input_dim = data_x_train.shape[1]

In [3]:
# GP regression of the held-out feature on the remaining ones, using a linear
# kernel. The model is handed its own copy of the kernel object.
kernel = GPy.kern.Linear(input_dim=input_dim)
model = GPy.models.GPRegression(
    data_x_train,
    data_y_train,
    kernel=kernel.copy(),
)

In [None]:
# Fit model
#
# Runs the optimizer of the GPy regression model with all default settings
# (no arguments are passed). The call is the cell's final expression, so
# whatever it returns is what the notebook shows as this cell's output.

model.optimize()

In [None]:
# Write the model's parameter array as text into this feature's output directory.
param_array_path = os.path.join(output_dir, "param_array.txt")
np.savetxt(param_array_path, model.param_array)

In [None]:
# Render the model object in the notebook via IPython's display().
display(model)

In [None]:
# Load target data
target_table = pd.read_csv("target_data.csv", sep = " ")
target_matrix = np.asfortranarray(target_table.values.T)

# Normalize data
#
# NB: It's important to use means/stds from the training data, since that's how
# the training data was normalized. The statistics are therefore deliberately
# NOT recomputed from target_matrix; `stds` already carries the epsilon offset
# that was added when it was computed.

# Fail early with a clear message if the target data does not have the same
# number of features as the training statistics, instead of relying on (or
# silently passing through) NumPy broadcasting.
assert target_matrix.shape[1] == means.shape[0], (
    "target data has {0} features but the training statistics cover {1}".format(
        target_matrix.shape[1], means.shape[0]))

normed = (target_matrix - means) / stds
normed_target_matrix = normed

# Split feature out of target data

# Boolean mask over the matrix columns that is True only at gene_number.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
is_feature = np.isin(np.arange(normed_target_matrix.shape[1]), gene_number)
data_x_test = normed_target_matrix[:, ~is_feature]
data_y_test = normed_target_matrix[:, is_feature]

In [None]:
# Export the arrays given to GPy so that gpytorch_wenda_gpy_data.ipynb can
# load them. Saved order: train x, train y, test x, test y.
import torch

gpy_export_tensors = []
for features, targets in ((data_x_train, data_y_train), (data_x_test, data_y_test)):
    gpy_export_tensors.append(torch.from_numpy(features))
    # Targets are stored with their last axis squeezed away.
    gpy_export_tensors.append(torch.from_numpy(targets).squeeze(-1))

torch.save(gpy_export_tensors, 'gpy_data.pth')

In [None]:
# Calculate confidence score using target data

# Model prediction for the target inputs; the second return value is treated
# as a variance (its square root is used as the scale below).
mu, sigma_sq = model.predict(data_x_test)

# Residual of the observed value, scaled by the predicted standard deviation.
res_normed = (data_y_test - mu) / np.sqrt(sigma_sq)

# Two-sided tail probability of the residual under a standard normal:
#   1 - |cdf(r) - cdf(-r)|  ==  2 * sf(|r|)
# The survival-function form is used because the cdf-difference form cancels
# to exactly 0 for large |r|, losing all information in the tails.
confidences = 2 * norm.sf(np.abs(res_normed))

In [None]:
# Write the confidence scores and the predicted means/variances as text files
# in this feature's output directory. '%.18e' is np.savetxt's default format,
# spelled out here so all three outputs can share one loop.
prediction_outputs = (
    ("confidences.txt", confidences, '%.10f'),
    ("predicted_means.txt", mu, '%.5f'),
    ("predicted_variances.txt", sigma_sq, '%.18e'),
)
for out_name, out_values, out_fmt in prediction_outputs:
    np.savetxt(os.path.join(output_dir, out_name), out_values, fmt=out_fmt)