In [1]:
from covariance_analysis import *
from IPython.display import Markdown

from virtual_screening.function import *
from virtual_screening.evaluation import *
from virtual_screening.models.deep_classification import *

Using Theano backend.
Using gpu device 1: Tesla K40m (CNMeM is disabled, CuDNN 4004)


In [2]:
# running_index selects the cross-validation run (it drives the fold pairing
# and the weight-file name used later); k is the number of fold files.
running_index, k = 1, 5

# Get Input Feature

In [3]:
# Paths of the k fold files: <directory>file_0.csv ... <directory>file_{k-1}.csv.
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = np.array(['{}file_{}.csv'.format(directory, i) for i in range(k)])

# Decode running_index into a (test fold, validation fold) pair.
# Floor division (//) keeps test_index an integer under both Python 2 and
# Python 3; with true division it would become a float and the slices
# below would fail.
# NOTE(review): the constant 4 looks like k - 1 -- confirm before changing k.
test_index = running_index // 4
# The comparison adds 1 when the remainder is >= test_index, so val_index
# never coincides with test_index.
val_index = running_index % 4 + (running_index % 4 >= test_index)
complete_index = np.arange(k)
train_index = np.where((complete_index != test_index) & (complete_index != val_index))[0]

# Every fold that is neither the test nor the validation fold goes to training;
# the one-element slices keep val/test as arrays of file names.
train_file_list = file_list[train_index]
val_file_list = file_list[val_index:val_index+1]
test_file_list = file_list[test_index:test_index+1]

In [4]:
label_name_list = ['Keck_Pria_AS_Retest']

# Read each split and pass it through filter_out_missing_values for the
# target label (train, then validation, then test).
train_pd = filter_out_missing_values(
    read_merged_data(train_file_list), label_list=label_name_list)
val_pd = filter_out_missing_values(
    read_merged_data(val_file_list), label_list=label_name_list)
test_pd = filter_out_missing_values(
    read_merged_data(test_file_list), label_list=label_name_list)

# Pull the fingerprint features and the label column(s) out of every split,
# using the same extraction options each time.
extract_options = dict(feature_name='Fingerprints', label_name_list=label_name_list)
X_train, y_train = extract_feature_and_label(train_pd, **extract_options)
X_val, y_val = extract_feature_and_label(val_pd, **extract_options)
X_test, y_test = extract_feature_and_label(test_pd, **extract_options)

(43453, 1)
(14484, 1)
(14486, 1)


In [5]:
# Requested sample sizes per class.
requested_pos, requested_neg = 50, 200
pos_index_list = sample_index(y_train, 1, requested_pos)
neg_index_list = sample_index(y_train, 0, requested_neg)

# Record how many indices were actually returned; this can differ from the
# request (the saved output shows 47 positives for a request of 50).
pos_num, neg_num = pos_index_list.shape[0], neg_index_list.shape[0]
N = pos_num + neg_num
print('Extract {} pos\t{} neg'.format(pos_num, neg_num))

Extract 47 pos	200 neg


In [6]:
# Gather the sampled rows; positives are placed first, negatives after them.
sample_pos_X = X_train[pos_index_list]
sample_neg_X = X_train[neg_index_list]
feature_X = np.concatenate((sample_pos_X, sample_neg_X), axis=0)
print(feature_X.shape)

(247, 1024)


# Test Single-task Covariance Matrix

In [7]:
# Read the JSON configuration and build the single-task classifier wrapper from it.
single_conf_path = '../../output/cross_validation/single_classification_22.json'
with open(single_conf_path) as conf_file:
    conf = json.load(conf_file)
single_task = SingleClassification(conf=conf)

In [8]:
# Weight file of the single-task model for the selected run.
PMTNN_weight_file = (
    '../../output/cross_validation/single_classification_22/45540945/{}.weight'
    .format(running_index)
)
print(PMTNN_weight_file)
# Disabled: scoring the data splits with the stored weights.
# single_task.predict_with_existing(X_train, y_train, X_val, y_val, X_test, y_test,
#                                   PMTNN_weight_file=PMTNN_weight_file)

../../output/cross_validation/single_classification_22/45540945/1.weight


In [9]:
# Build the network through the single-task wrapper, then load the weights
# stored in PMTNN_weight_file into it.
single_model = single_task.setup_model()
single_model.load_weights(PMTNN_weight_file)

In [10]:
# Truncate the network so that a later predict() call returns an intermediate
# representation instead of the final prediction.
# Pop the last three layers off the model's layer list; the variable names
# assume the order output / dropout / dense -- TODO confirm against setup_model.
output_layer = single_model.layers.pop()
dropout_2nd = single_model.layers.pop()
dense_2nd = single_model.layers.pop()
# Point the model output at the new last layer and clear that layer's
# outbound nodes (presumably so the removed layers are no longer referenced).
single_model.outputs = [single_model.layers[-1].output]
single_model.layers[-1].outbound_nodes = []
# Show the layers that remain after truncation.
print single_model.layers

[<keras.layers.core.Dense object at 0x7fe5d8132990>, <keras.layers.core.Dropout object at 0x7fe5d799c910>, <keras.layers.core.Dense object at 0x7fe5d7715790>]


## Save Single-task Matrix

In [11]:
# Run the sampled inputs through the truncated single-task network and stack
# the results with positives first, matching the row order of feature_X.
hidden_pos_X = single_model.predict(sample_pos_X)
hidden_neg_X = single_model.predict(sample_neg_X)
hidden_X = np.concatenate((hidden_pos_X, hidden_neg_X), axis=0)
print(hidden_X.shape)

(247, 2000)


In [12]:
# np.cov treats every row as one variable (rowvar=True is its default), so
# both covariance matrices are sample-by-sample.
hidden_covariance_matrix = np.cov(hidden_X, rowvar=True)
hidden_cosine_matrix = get_distance_metrics(hidden_X)

# Same two matrices for the raw input features, for comparison.
feature_covariance_matrix = np.cov(feature_X, rowvar=True)
feature_cosine_matrix = get_distance_metrics(feature_X)

In [13]:
book = Workbook(style_compression=2)
book_set_custom_colour(book)

# One sheet per matrix, written in this order; pos_num is forwarded to
# sheet_write_matrix for every sheet.
single_task_sheets = [
    ('Covariance_Matrix_Hidden_Layer', hidden_covariance_matrix),
    ('Covariance_Matrix_Input_feature', feature_covariance_matrix),
    ('Cosine_Matrix_Hidden_Layer', hidden_cosine_matrix),
    ('Cosine_Matrix_Input_feature', feature_cosine_matrix),
]
for sheet_name, matrix in single_task_sheets:
    sheet = book.add_sheet(sheet_name)
    sheet_write_matrix(sheet, matrix, pos_num)

book.save('plottings/single_task_running_{}.xls'.format(running_index))

## Color Mapping for Covariance/Cosine Similarity Matrix

<table>
    <caption>Output Format</caption>
    <tr>
        <td></td><td>pos</td><td>neg</td>
    </tr>
    <tr>
        <td>pos</td><td bgcolor="#66ff66"></td><td bgcolor="#ffff99"></td>
    </tr>
    <tr>
        <td>neg</td><td bgcolor="#ffff99"></td><td bgcolor="#66ffff"></td>
    </tr>
</table>

In [14]:
# Markdown(get_highlighted_markdown_table(hidden_covariance_matrix, pos_num))

# Test Multi-task Covariance Matrix

In [15]:
# directory = '../../dataset/keck_pcba/fold_{}/'.format(k)
# file_list = []
# for i in range(k):
#     file_list.append('{}file_{}.csv'.format(directory, i))
# file_list = np.array(file_list)

# # read data
# test_index = running_index / 4
# val_index = running_index % 4 + (running_index % 4 >= test_index)
# complete_index = np.arange(k)
# train_index = np.where((complete_index != test_index) & (complete_index != val_index))[0]

# train_file_list = file_list[train_index]
# val_file_list = file_list[val_index:val_index+1]
# test_file_list = file_list[test_index:test_index+1]

In [16]:
# train_pd = read_merged_data(train_file_list)
# train_pd.fillna(0, inplace=True)
# val_pd = read_merged_data(val_file_list)
# val_pd.fillna(0, inplace=True)
# test_pd = read_merged_data(test_file_list)
# test_pd.fillna(0, inplace=True)

# multi_name_list = train_pd.columns[-128:].tolist()
# multi_name_list.extend(label_name_list)
# print 'multi_name_list ', multi_name_list

# X_train, y_train = extract_feature_and_label(train_pd,
#                                              feature_name='Fingerprints',
#                                              label_name_list=multi_name_list)
# X_val, y_val = extract_feature_and_label(val_pd,
#                                          feature_name='Fingerprints',
#                                          label_name_list=multi_name_list)
# X_test, y_test = extract_feature_and_label(test_pd,
#                                            feature_name='Fingerprints',
#                                            label_name_list=multi_name_list)

In [17]:
# Read the JSON configuration and build the multi-task classifier wrapper from it.
multi_conf_path = '../../output/cross_validation/multi_classification_15.json'
with open(multi_conf_path) as conf_file:
    conf = json.load(conf_file)
multi_task = MultiClassification(conf=conf)

In [18]:
# Weight file of the multi-task model for the selected run.
PMTNN_weight_file = (
    '../../output/cross_validation/multi_classification_15/45983730/{}.weight'
    .format(running_index)
)
print(PMTNN_weight_file)
# Disabled: scoring the data splits with the stored weights.
# multi_task.predict_with_existing(X_train, y_train, X_val, y_val, X_test, y_test,
#                                  PMTNN_weight_file=PMTNN_weight_file,
#                                  score_file='plottings/score.csv')

../../output/cross_validation/multi_classification_15/45983730/1.weight


In [19]:
# Build the network through the multi-task wrapper, then load the weights
# stored in PMTNN_weight_file into it.
multi_model = multi_task.setup_model()
multi_model.load_weights(PMTNN_weight_file)

In [20]:
# Truncate the network so that a later predict() call returns an intermediate
# representation instead of the final prediction.
# Pop the last three layers off the model's layer list; the variable names
# assume the order output / dropout / dense -- TODO confirm against setup_model.
output_layer = multi_model.layers.pop()
dropout_2nd = multi_model.layers.pop()
dense_2nd = multi_model.layers.pop()
# Point the model output at the new last layer and clear that layer's
# outbound nodes (presumably so the removed layers are no longer referenced).
multi_model.outputs = [multi_model.layers[-1].output]
multi_model.layers[-1].outbound_nodes = []
# Show the layers that remain after truncation.
print multi_model.layers

[<keras.layers.core.Dense object at 0x7fe5d7627850>, <keras.layers.core.Dropout object at 0x7fe57fd8a8d0>, <keras.layers.core.Dense object at 0x7fe57fd8a150>]


## Save Multi-task Matrix

In [21]:
# Run the same sampled inputs through the truncated multi-task network and
# stack the results with positives first, matching the row order of feature_X.
hidden_pos_X = multi_model.predict(sample_pos_X)
hidden_neg_X = multi_model.predict(sample_neg_X)
hidden_X = np.concatenate((hidden_pos_X, hidden_neg_X), axis=0)
print(hidden_X.shape)

(247, 2000)


In [22]:
# np.cov treats every row as one variable (rowvar=True is its default), so
# the covariance matrix is sample-by-sample.
hidden_covariance_matrix = np.cov(hidden_X, rowvar=True)
hidden_cosine_matrix = get_distance_metrics(hidden_X)

In [23]:
book = Workbook(style_compression=2)
book_set_custom_colour(book)

# One sheet per matrix, written in this order; pos_num is forwarded to
# sheet_write_matrix for every sheet. The input-feature matrices are the ones
# computed earlier in the notebook, since the sampled inputs are unchanged.
multi_task_sheets = [
    ('Covariance_Matrix_Hidden_Layer', hidden_covariance_matrix),
    ('Covariance_Matrix_Input_feature', feature_covariance_matrix),
    ('Cosine_Matrix_Hidden_Layer', hidden_cosine_matrix),
    ('Cosine_Matrix_Input_feature', feature_cosine_matrix),
]
for sheet_name, matrix in multi_task_sheets:
    sheet = book.add_sheet(sheet_name)
    sheet_write_matrix(sheet, matrix, pos_num)

book.save('plottings/multi_task_running_{}.xls'.format(running_index))