In [1]:
from function import *
from data_preparation import *
from evaluation import *

from openpyxl import Workbook
import pandas as pd
import numpy as np
import operator

from rdkit import Chem
from rdkit.Chem import AllChem

# Prepare Data (Data Transformation)

In [2]:
# Run the project's data transformation and write its result to this CSV path.
keck_complete_path = '../dataset/keck_complete.csv'
transform_data(output_file_name=keck_complete_path)

# Analysis

In [48]:
# Load the transformed dataset and show the dtype pandas inferred per column.
complete_df = pd.read_csv('../dataset/keck_complete.csv')

print(complete_df.dtypes)

Molecule                   object
SMILES                     object
Fingerprints               object
Keck_Pria_AS_Retest         int64
Keck_Pria_FP_data           int64
Keck_Pria_Continuous      float64
Keck_RMI_cdd              float64
FP counts % inhibition    float64
dtype: object


In [3]:
# Count the actives of each assay and, per compound, in how many of the three
# assays it is active. Column-wise comparisons replace the per-row iterrows()
# loop. NaN == 1 evaluates to False, so a missing label is counted as
# "not active", exactly as the row-by-row comparison did.
label_columns = ['Keck_Pria_AS_Retest', 'Keck_Pria_FP_data', 'Keck_RMI_cdd']
is_active = complete_df[label_columns] == 1

cnt_pria_retest = int(is_active['Keck_Pria_AS_Retest'].sum())
cnt_pria_fp = int(is_active['Keck_Pria_FP_data'].sum())
cnt_rmi_cdd = int(is_active['Keck_RMI_cdd'].sum())

# cnt_dict maps "number of assays a compound is active in" -> number of compounds.
actives_per_compound = is_active.sum(axis=1).value_counts().sort_index()
cnt_dict = {int(n_active): int(n_compounds)
            for n_active, n_compounds in zip(actives_per_compound.index,
                                             actives_per_compound.values)}

print('pria retest active: {}\tpria fp active: {}\trmi cdd active: {}'.format(cnt_pria_retest, cnt_pria_fp, cnt_rmi_cdd))
print(cnt_dict)


print('')
analysis(complete_df)

pria retest active: 79	pria fp active: 24	rmi cdd active: 230
{0: 72094, 1: 325, 2: 4}

retest: 0, fp: 0, rmi: 0 	--- 49489
retest: 0, fp: 0, rmi: nan 	--- 22605
retest: 0, fp: 1, rmi: 0 	--- 19
retest: 0, fp: 1, rmi: 1 	--- 3
retest: 1, fp: 0, rmi: 0 	--- 58
retest: 1, fp: 0, rmi: nan 	--- 20
retest: 0, fp: 1, rmi: nan 	--- 1
retest: 0, fp: 0, rmi: 1 	--- 227
retest: 1, fp: 1, rmi: nan 	--- 1


# Split data into k folds

In [6]:
# Split the complete dataset into k fold files under a k-specific directory.
k = 3
directory = '../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]
greedy_multi_splitting(complete_df, k, directory=directory, file_list=file_list)

9
(24141, 8)
(24142, 8)
(24140, 8)


In [7]:
# Split the complete dataset into k fold files under a k-specific directory.
k = 4
directory = '../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = ['file_{}.csv'.format(i) for i in range(k)]
greedy_multi_splitting(complete_df, k, directory=directory, file_list=file_list)

9
(18104, 8)
(18107, 8)
(18107, 8)
(18105, 8)


# Merge the split folds to form the training and test sets

In [60]:
# Column dtypes for the fold CSV files. The builtin `str` is used instead of
# the `np.str` alias, which newer NumPy releases no longer provide.
# NOTE(review): dtype_list is not passed to read_merged_data in this cell.
dtype_list = {'Molecule': str,
              'SMILES': str,
              'Fingerprints': str,
              'Keck_Pria_AS_Retest': np.int64,
              'Keck_Pria_FP_data': np.int64,
              'Keck_Pria_Continuous': np.float64,
              'Keck_RMI_cdd': np.float64}
output_file_list = [directory + f_ for f_ in file_list]

# Train on every fold except the last and hold the last fold out for testing.
# This matches the previous [:3] / [3] slicing for four folds and keeps
# working for any other number of folds.
train_file_list = output_file_list[:-1]
test_file = output_file_list[-1]

print(train_file_list)
train_pd = read_merged_data(train_file_list)
print(test_file)
test_pd = read_merged_data([test_file])

['../dataset/fixed_dataset/fold_4/file_0.csv', '../dataset/fixed_dataset/fold_4/file_1.csv', '../dataset/fixed_dataset/fold_4/file_2.csv']
../dataset/fixed_dataset/fold_4/file_3.csv


In [57]:
# Print the label-combination breakdown for the training and the test split.
print('This is training set')
analysis(train_pd)
print('')
print('This is test set')
analysis(test_pd)

This is training set
retest: 0, fp: 0, rmi: 0 	--- 37117
retest: 0, fp: 0, rmi: nan 	--- 16954
retest: 1, fp: 0, rmi: 0 	--- 44
retest: 0, fp: 1, rmi: 1 	--- 2
retest: 0, fp: 0, rmi: 1 	--- 170
retest: 0, fp: 1, rmi: 0 	--- 14
retest: 0, fp: 1, rmi: nan 	--- 1
retest: 1, fp: 0, rmi: nan 	--- 15
retest: 1, fp: 1, rmi: nan 	--- 1

This is test set
retest: 0, fp: 0, rmi: 0 	--- 12372
retest: 0, fp: 0, rmi: nan 	--- 5651
retest: 1, fp: 0, rmi: nan 	--- 5
retest: 0, fp: 1, rmi: 1 	--- 1
retest: 1, fp: 0, rmi: 0 	--- 14
retest: 0, fp: 0, rmi: 1 	--- 57
retest: 0, fp: 1, rmi: 0 	--- 5


# Test feature and label extraction

In [70]:
def extract_feature_and_label(data_pd,
                              feature_name,
                              label_name_list):
    """Build dense float feature and label matrices from a DataFrame.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        One sample per row. Its index may hold arbitrary labels; rows are
        written to the output by position, not by index label.
    feature_name : str
        Column whose entries are iterable fingerprints with one element per
        feature (presumably a string of '0'/'1' characters -- confirm
        against the data preparation step).
    label_name_list : list of str
        Label columns to extract, in output column order.

    Returns
    -------
    X_data : numpy.ndarray of float, shape (n_samples, n_features)
        n_features is the length of the first fingerprint, or 1024 when
        data_pd has no rows.
    y_data : numpy.ndarray of float, shape (n_samples, len(label_name_list))
        Always 2-D, including when a single label is requested.
    """
    label_name_list = list(label_name_list)
    fingerprints = data_pd[feature_name].tolist()
    n_samples = len(fingerprints)
    # Infer the fingerprint width from the data; 1024 is only the fallback
    # used for an empty frame.
    n_features = len(fingerprints[0]) if n_samples else 1024

    X_data = np.zeros(shape=(n_samples, n_features), dtype=float)
    # enumerate() gives the row position, so frames whose index is not
    # 0..n-1 (e.g. after filtering or merging) are handled correctly.
    for position, fingerprint in enumerate(fingerprints):
        X_data[position] = [float(bit) for bit in fingerprint]

    # Selecting with a list of columns always yields a 2-D block, so a single
    # label still produces an (n, 1) matrix.
    y_data = (data_pd[label_name_list].values
              .astype(float)
              .reshape(n_samples, len(label_name_list)))

    print(y_data.shape)
    return X_data, y_data


In [None]:
print(train_pd.dtypes)

# Both splits are converted with the same feature column and label columns.
target_columns = ['Keck_Pria_AS_Retest', 'Keck_Pria_FP_data']
X_train, y_train = extract_feature_and_label(
    train_pd,
    feature_name='Fingerprints',
    label_name_list=target_columns)
X_test, y_test = extract_feature_and_label(
    test_pd,
    feature_name='Fingerprints',
    label_name_list=target_columns)

Molecule                   object
SMILES                     object
Fingerprints               object
Keck_Pria_AS_Retest         int64
Keck_Pria_FP_data           int64
Keck_Pria_Continuous      float64
Keck_RMI_cdd              float64
FP counts % inhibition    float64
dtype: object


# Test evaluation metrics

In [8]:
# Toy check of the metric helpers: the ground truth is the 4x4 identity and
# the predictions differ from it only in rows 2 and 3.
n_classes = 4
y_actual = np.eye(n_classes)
y_pred = np.eye(n_classes)
y_pred[2] = [0, 0, 1, 1]
y_pred[3] = [0, 0, 1, 0]

for i in range(n_classes):
    # Per-column ROC AUC, reduced first with the mean and then the median.
    for reducer in (np.mean, np.median):
        print(roc_auc_multi(y_actual, y_pred, [i], reducer))
    print(enrichment_factor_multi(y_actual, y_pred, 0.5))
    print('')

1.0
1.0
[[1.0, 2.0], [1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]

1.0
1.0
[[1.0, 2.0], [1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]

0.833333333333
0.833333333333
[[1.0, 2.0], [1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]

0.333333333333
0.333333333333
[[1.0, 2.0], [1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]



# Get updated data, and give some analysis

In [9]:
# Open both screening workbooks and list the sheets each one contains.
discrete_file = pd.ExcelFile('../dataset/screening_smsf_actives.xlsx')
continuous_file = pd.ExcelFile('../dataset/screening_smsf_continuous.xlsx')

print(discrete_file.sheet_names)
print(continuous_file.sheet_names)

[u'Keck_Pria_Retest', u'Keck_Pria_FP', u'Keck_RMI', u'Xing_MTDH_Retest', u'Xing_MTDH_DR']
[u'Keck_Pria_Primary', u'Keck_RMI_cdd', u'Xing_MTDH_cdd']


In [10]:
# Binary labels come from the 'Keck_Pria_Retest' sheet of the actives workbook.
binary_data = discrete_file.parse('Keck_Pria_Retest')
print('reset binary labels:')
print(binary_data.dtypes)

print('')

# Continuous labels come from the 'Keck_Pria_Primary' sheet of the continuous workbook.
continuous_data = continuous_file.parse('Keck_Pria_Primary')
print('cts labels:')
print(continuous_data.dtypes)

reset binary labels:
Molecule               object
Keck_Pria_AS_Retest     int64
dtype: object

cts labels:
Molecule                 object
Keck_Pria_Continuous    float64
dtype: object


# List all active compounds (updated version)

In [11]:
# Keep only the rows whose retest label is greater than zero and print them.
is_retest_active = binary_data['Keck_Pria_AS_Retest'] > 0
print(binary_data[is_retest_active])

            Molecule  Keck_Pria_AS_Retest
576    SMSSF-0015261                    1
7315   SMSSF-0021761                    1
13763  SMSSF-0027944                    1
16138  SMSSF-0030209                    1
16803  SMSSF-0030826                    1
16837  SMSSF-0030860                    1
16901  SMSSF-0030922                    1
16907  SMSSF-0030928                    1
19073  SMSSF-0032984                    1
25686  SMSSF-0039184                    1
26232  SMSSF-0039974                    1
26511  SMSSF-0040244                    1
29310  SMSSF-0042907                    1
29965  SMSSF-0043541                    1
30010  SMSSF-0043585                    1
30569  SMSSF-0044122                    1
30603  SMSSF-0044155                    1
30614  SMSSF-0044166                    1
30661  SMSSF-0044210                    1
30755  SMSSF-0044297                    1
30756  SMSSF-0044297                    1
30888  SMSSF-0044424                    1
31008  SMSSF-0044540              