In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from scipy import stats

In [2]:
# Index -> name lookup tables (tab-separated, no header row).
genes = pd.read_csv(
    "../data/gene2ind.txt", sep="\t", header=None, names=["I", "G"]
)["G"]
drugs = pd.read_csv(
    "../data/drug2ind.txt", sep="\t", header=None, names=["I", "D"]
)["D"]
cell_index = pd.read_csv(
    "../data/cell2ind.txt", sep="\t", header=None, names=["I", "C"]
)

# Comma-separated numeric matrix; later indexed as [cell index, gene position].
cell_gene_matrix = np.loadtxt("../data/cell2mutation.txt", delimiter=",")

# Rows of (C, D, AUC) and the prediction file; predictions are later selected
# with test_df row indices, so the two are presumably row-aligned -- TODO confirm.
test_df = pd.read_csv(
    "../data/drugcell_all.txt", sep="\t", header=None, names=["C", "D", "AUC"]
)
predicted_vals = np.loadtxt("../rlipp/drugcell_all.predict")

# Ontology rows of (S, T, I); the distinct values of column S, in order of
# first appearance, are the terms analysed below.
ontology = pd.read_csv(
    "../data/drugcell_ont.txt", sep="\t", header=None, names=["S", "T", "I"]
)
terms = ontology["S"].unique().tolist()

In [3]:
# Write one '<gene>.hidden' file per gene for the RLIPP step. Each file holds
# that gene's column of cell_gene_matrix, expanded to one value per test_df row
# (the row's cell line selects the matrix row).

cell_id_map = dict(zip(cell_index['C'], cell_index['I']))
cell_line_ids = np.array([cell_id_map[cell_name] for cell_name in test_df['C'].tolist()])
for gene_pos, gene in enumerate(genes):
    gene_column = cell_gene_matrix[cell_line_ids, gene_pos].ravel()
    out_path = '../rlipp/hidden/' + gene + '.hidden'
    np.savetxt(out_path, gene_column, fmt='%.3f')

In [5]:
# Group the rows of test_df by drug: drug_pos_map[drug] is the list of test_df
# index labels whose 'D' value is that drug, in row order. Every entry of
# `drugs` gets a key, even if no row mentions it; a drug that appears in
# test_df but not in `drugs` raises KeyError.
drug_pos_map = {d: [] for d in drugs}
# Walk the index and the 'D' column directly: DataFrame.iterrows() would build
# a full Series for every row only to read a single field from it.
for i, drug in zip(test_df.index, test_df['D']):
    drug_pos_map[drug].append(i)


In [8]:
def get_features(term, index_list):
    """Load the '.hidden' file of `term` and return the rows in `index_list`.

    Parameters
    ----------
    term : str
        Name whose file '../rlipp/hidden/<term>.hidden' is read.
    index_list : sequence of int
        Row positions to select from the loaded data.

    Returns
    -------
    numpy.ndarray
        The selected rows. The first six columns are read when `term` is in
        the module-level `terms` list; otherwise only the first column is read.
    """
    n_cols = 6 if term in terms else 1
    loaded = np.loadtxt('../rlipp/hidden/' + term + '.hidden', usecols=range(n_cols))
    return np.take(loaded, index_list, axis=0)


def get_child_features(term, index_list):
    """Stack the features of every direct child of `term` column-wise.

    A child is the 'T' value of each `ontology` row whose 'S' value equals
    `term`; children are processed in ontology row order.

    Parameters
    ----------
    term : str
        Parent name to look up in the 'S' column of `ontology`.
    index_list : sequence of int
        Row positions forwarded to `get_features` for every child.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per entry of `index_list` and the children's
        feature columns placed side by side.

    Raises
    ------
    ValueError
        From `np.column_stack` when `term` has no children in `ontology`.
    """
    # A boolean mask selects the matching rows in one vectorised pass instead
    # of walking the whole ontology with iterrows() on every call.
    children = ontology.loc[ontology['S'] == term, 'T'].tolist()
    child_features = [get_features(child, index_list) for child in children]
    # column_stack needs a real sequence: passing a generator has been
    # deprecated since NumPy 1.16 and newer releases reject it with TypeError.
    return np.column_stack(child_features)


def exec_lm(X, y):
    """Fit a cross-validated ridge model on (X, y) and score its in-sample fit.

    The model is `RidgeCV(fit_intercept=False, cv=5)`; it is fitted on X and y
    and then asked to predict the same X.

    Returns
    -------
    float
        First element of `scipy.stats.spearmanr(predictions, y)`, i.e. the
        Spearman correlation between the fitted predictions and `y`.
    """
    model = RidgeCV(fit_intercept=False, cv=5)
    model.fit(X, y)
    fitted_values = model.predict(X)
    spearman = stats.spearmanr(fitted_values, y)
    return spearman[0]

In [None]:
# Compute RLIPP for every (drug, term) pair and write a tab-separated table to
# '../rlipp/rlipp.out'. For each pair, P_rho is exec_lm on the term's own
# features, C_rho is exec_lm on the stacked features of its children, and
# RLIPP = (P_rho - C_rho) / C_rho.
#
# The `with` block guarantees the output file is flushed and closed even if a
# model fit or a feature load raises part-way through the loops.
with open('../rlipp/rlipp.out', "w") as f:
    f.write('Drug\tTerm\tP_rho\tC_rho\tRLIPP\n')

    for d in drugs:
        # Row positions for this drug are the same for every term; look them up once.
        positions = drug_pos_map[d]
        y = np.take(predicted_vals, positions)
        for t in terms:
            X_parent = get_features(t, positions)
            X_child = get_child_features(t, positions)
            p_rho = exec_lm(X_parent, y)
            c_rho = exec_lm(X_child, y)
            rlipp = (p_rho - c_rho)/c_rho
            result = '{}\t{}\t{:.3f}\t{:.3f}\t{:.3f}\n'.format(d, t, p_rho, c_rho, rlipp)
            f.write(result)