In [260]:
import encoding_tools
from pathlib import Path
import numpy as np
import pickle
import pandas as pd

# ML imports
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance
from scipy import optimize, linalg
import scipy
from sklearn.model_selection import KFold # import KFold

# custom imports
import encoding_tools as encoding
import chimera_tools as chimera
import GP_tools as GP

In [249]:
# All processed inputs live in "Processed Data" under the current working
# directory (i.e. the directory the notebook kernel is started from).
Processed_Folder = Path.cwd() / "Processed Data"

# Names of the pickled objects; each one is stored on disk as "<name>.p".
dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']


def _load_pickle(name):
    """Return the object unpickled from ``Processed_Folder / '<name>.p'``."""
    # NOTE: pickle.load can execute arbitrary code -- only load files that
    # were produced by this project's own preprocessing step.
    with open(Processed_Folder / f"{name}.p", 'rb') as handle:
        return pickle.load(handle)


# Load the three pickles in the order listed in `dicts`.
EFI_ID_List, metabolite_dict, Protein_seq_dict = (_load_pickle(name) for name in dicts)

# Activation table; the first CSV column is used as the row index.
activations = pd.read_csv(Processed_Folder / 'activations.csv', index_col=0)

In [224]:
# Every sequence is right-padded to the length of the longest sequence in
# Protein_seq_dict so that all one-hot encodings end up the same size.
max_len = max(len(seq) for seq in Protein_seq_dict.values())
fillchar = '-' # This is what's used in the GP-UCB paper

# Upper-case each listed sequence, then pad it on the right with the fill character.
Padded_dict = {
    ID: Protein_seq_dict[ID].upper().ljust(max_len, fillchar)
    for ID in EFI_ID_List
}

# One-hot encode every padded sequence.
OH_dict = {
    ID: encoding_tools.one_hot_seq(seq_input=padded_seq)
    for ID, padded_seq in Padded_dict.items()
}

In [251]:
# Preparing input training data X to feed into ML Model.
# Each one-hot encoded sequence is flattened into one row of X. The row width
# is read from the encoded array itself rather than assuming a fixed number of
# symbols per position, and the row count follows EFI_ID_List (the list the
# loop actually iterates over) so the two can never disagree.
input_len = np.asarray(OH_dict[EFI_ID_List[0]]).size
num_inputs = len(EFI_ID_List)

X = np.zeros((num_inputs, input_len))
for i, ID in enumerate(EFI_ID_List):
    # reshape(-1) flattens in row-major order, same as the explicit (1, n) reshape
    X[i, :] = np.asarray(OH_dict[ID]).reshape(-1)

# Prepare output training data y to feed into ML Model: first row of the
# activations table.
# NOTE(review): presumably one column per entry of EFI_ID_List, in the same
# order as the rows of X -- confirm against how activations.csv was written.
y = activations.values[0,:]

In [205]:
# Sanity check on the padding: a randomly chosen sequence must have the same
# padded length as the first one. The index range follows EFI_ID_List instead
# of a hard-coded count, and the comparison is asserted (a bare comparison in
# the middle of a cell is evaluated and silently discarded).
ID = np.random.randint(low=0, high=len(EFI_ID_List))
len_comp = len(Padded_dict[EFI_ID_List[ID]])
assert len(Padded_dict[EFI_ID_List[0]]) == len_comp, "padded sequences differ in length"
print(len_comp)
print(ID)

604
114


In [252]:
def ML_train(X, y, initial_guess=(0.9, 0.9)):
    """Fit the two GP hyperparameters by minimising GP.neg_log_marg_likelihood.

    The optimiser works on the natural log of the hyperparameters; the
    optimum is exponentiated before it is printed and returned, so the
    returned values are always positive.

    Parameters
    ----------
    X, y
        Training inputs and targets; passed through unchanged as the extra
        arguments of ``GP.neg_log_marg_likelihood``.
    initial_guess : sequence of two positive floats, optional
        Starting point of the optimisation in the original (non-log) scale.
        Defaults to ``(0.9, 0.9)``.

    Returns
    -------
    list
        ``[hyperparameter_0, hyperparameter_1]`` at the optimum found.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not converge.
    """
    import warnings

    # take the log of the initial guess for optimization
    initial_guess_log = np.log(initial_guess)

    # optimize to fit model
    result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X,y), method='L-BFGS-B')

    # Surface a failed optimisation instead of silently returning whatever
    # point the optimiser stopped at.
    if not result.success:
        warnings.warn(
            f"GP hyperparameter optimisation did not converge: {result.message}",
            RuntimeWarning,
        )

    # map the optimum back from log space once and reuse it below
    final_prams = [np.exp(result.x[0]), np.exp(result.x[1])]

    print('Full GP regression model')
    print('Hyperparameters: ' + str(final_prams[0]) + ' ' + str(final_prams[1]))

    return final_prams
    

In [261]:
# Fit the GP hyperparameters on the flattened one-hot matrix X and targets y.
final_prams = ML_train(X=X, y=y)

Full GP regression model
Hyperparameters: 2.24125819484e-11 203.879134241
