In [2]:
import os
import pickle
import scipy.stats
import random
import sys
import json
import numpy as np
import re
import csv
import pandas as pd
os.chdir("/Users/leobrueggeman/GitHub/LINCS/l1ktools/python")
import cmap.io.gct as gct
import cmap.io.plategrp as grp
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import itertools

In [3]:
# set path to LINCS gctx data
path_to_gctx_file = '/Volumes/HimmelWD/lincs/modzs.gctx'
GCTObject = gct.GCT(path_to_gctx_file)

In [4]:
# create list of bing probes 
path_to_bing_file = '/Users/leobrueggeman/GitHub/LINCS/isbing_pr_genelist.json'
json_bing = json.loads(open(path_to_bing_file).read())
bing_list = list(json_bing)
print(bing_list)

[u'218075_at', u'218434_s_at', u'202852_s_at', u'205434_s_at', u'201511_at', u'201000_at', u'222064_s_at', u'202169_s_at', u'202170_s_at', u'210852_s_at', u'214829_at', u'209165_at', u'205986_at', u'209460_at', u'209459_s_at', u'203505_at', u'212772_s_at', u'204343_at', u'213353_at', u'219577_s_at', u'204719_at', u'209993_at', u'203192_at', u'209620_s_at', u'206317_s_at', u'214209_s_at', u'202804_at', u'202805_s_at', u'213485_s_at', u'206155_at', u'208161_s_at', u'209641_s_at', u'203196_at', u'209380_s_at', u'214033_at', u'202850_at', u'203981_s_at', u'201872_s_at', u'201873_s_at', u'200045_at', u'207622_s_at', u'209246_at', u'209247_s_at', u'202394_s_at', u'204567_s_at', u'211113_s_at', u'209735_at', u'218633_x_at', u'221927_s_at', u'210006_at', u'205566_at', u'221815_at', u'63825_at', u'87100_at', u'213017_at', u'218581_at', u'213805_at', u'213935_at', u'218739_at', u'221552_at', u'45288_at', u'209027_s_at', u'209028_s_at', u'207268_x_at', u'209856_x_at', u'211793_s_at', u'202123_s_a

In [4]:
# load and process drugbank to pubchem matching file
path_to_drugbank = '/Users/leobrueggeman/GitHub/LINCS/drugbank_pubchem_mapping.tsv'
drugbank_df = pd.read_csv(path_to_drugbank, sep = '\t', header=0)
drugbank_df = drugbank_df.drop_duplicates()
drugbank_df['pubchem_id'] = drugbank_df['pubchem_id'].astype(str)

In [5]:
# create dictionary of LINCS small molecule drugs with sig_ids and pubchem_cids
path_to_json = '/Users/leobrueggeman/GitHub/LINCS/lincs_small_molecules.json'
json_data = json.loads(open(path_to_json).read())

ref_dict = {}
for temp_dict in json_data:
    temp_pert = temp_dict['pert_id']
    temp_sig_gold = temp_dict['sig_id_gold']
    temp_sig = temp_dict['sig_id']
    if 'pubchem_cid' in temp_dict:
        temp_pubchem = temp_dict['pubchem_cid']
    else:
        temp_pubchem = 'NA'
    ref_dict[temp_pert] = {'sig_id_gold':temp_sig_gold,'sig_id':temp_sig, 'pubchem_id':temp_pubchem}

In [6]:
# create dataframe from ref_dict and delete entries without pubchem_ids
ref_frame = pd.DataFrame.from_dict(ref_dict, orient='index')
ref_frame['pert_id'] = ref_frame.index
ref_frame = ref_frame[ref_frame['pubchem_id'] != 'NA']
ref_frame['pubchem_id'] = ref_frame['pubchem_id'].astype(str)


In [7]:
# create data frame with only small molecule data in drugbank
merge_frame = ref_frame.merge(drugbank_df ,how='inner',on='pubchem_id')
merge_frame

Unnamed: 0,pubchem_id,sig_id_gold,sig_id,pert_id,drugbank_id
0,92151,[CPC015_MCF7_6H:BRD-A00546892-001-01-8:10],"[CPC004_A375_6H:BRD-A00546892-001-01-8:10, CPC...",BRD-A00546892,DB00810
1,3182,[BRAF001_A375_6H:BRD-A00827783-001-18-8:0.15625],"[CPD001_MCF7_6H:BRD-A00827783-001-18-8:10, CPD...",BRD-A00827783,DB00651
2,66368,"[CPC004_HA1E_24H:BRD-A00993607-003-15-4:10, CP...","[CPC004_A375_6H:BRD-A00993607-003-15-4:10, CPC...",BRD-A00993607,DB00866
3,3606,"[CPD002_MCF7_24H:BRD-A01078468-001-10-6:10, CP...","[CPD002_MCF7_24H:BRD-A01078468-001-10-6:10, CP...",BRD-A01078468,DB07931
4,3606,"[CPD002_MCF7_24H:BRD-A01078468-001-10-6:10, CP...","[CPD002_MCF7_24H:BRD-A01078468-001-10-6:10, CP...",BRD-A01078468,DB08959
5,46783268,"[CPC006_A375_24H:BRD-A01145011-001-01-4:11.1, ...","[CPC006_A375_6H:BRD-A01145011-001-01-4:11.1, C...",BRD-A01145011,DB03068
6,71684472,"[CPC014_A375_6H:BRD-A01317026-001-01-2:10, CPC...","[CPC014_A375_6H:BRD-A01317026-001-01-2:10, CPC...",BRD-A01317026,DB03271
7,71684472,"[CPC014_A375_6H:BRD-A01317026-001-01-2:10, CPC...","[CPC014_A375_6H:BRD-A01317026-001-01-2:10, CPC...",BRD-A01317026,DB04400
8,5152,"[CPC003_HA1E_24H:BRD-A01320529-001-05-9:10, CP...","[CPC009_A375_6H:BRD-A01320529-001-08-3:10, CPC...",BRD-A01320529,DB00938
9,5702178,"[CPC003_HA1E_24H:BRD-A01346607-001-03-4:10, CP...","[CPC003_HA1E_6H:BRD-A01346607-001-03-4:10, CPC...",BRD-A01346607,DB00223


In [8]:
# create dictionary from the merge_frame and collect cell line data
final_dict = {}
for i in range(len(merge_frame)):
    temp_cell_list = []
    temp_drugbank_id = merge_frame.iloc[i]['drugbank_id']
    temp_pert_id = merge_frame.iloc[i]['pert_id']
    temp_sig_id_gold = merge_frame.iloc[i]['sig_id_gold']
    temp_sig_id = merge_frame.iloc[i]['sig_id']
    
    for element in temp_sig_id_gold:
        temp_str = str(element)
        matchObj = re.match( r'(.*)_(.*)_.*', temp_str)
        temp_cell = matchObj.group(2)
        temp_cell_list.append(temp_cell)
                  
    temp_dict = final_dict.setdefault(temp_drugbank_id, {'pert_id': [], 'sig_id_gold': [], 'sig_id': [], 'gold_cells': []})
    temp_dict['pert_id'].append(temp_pert_id)
    temp_dict['sig_id_gold'].extend(temp_sig_id_gold)
    temp_dict['sig_id'].extend(temp_sig_id)
    temp_dict['gold_cells'].extend(temp_cell_list)
    temp_dict[temp_pert_id] = {'sig_id_gold' : temp_sig_id_gold, 'sig_id' : temp_sig_id}

In [5]:
#superset_list = ['A375', 'MCF7', 'VCAP', 'SKB', 'HCC515', 'ASC', 'HEPG2', 'NEU', 'PC3', 'NPC', 'HA1E', 'A549', 'HT29']
#superset_final_dict = final_dict.copy()
#for element in superset_final_dict:
    



NameError: name 'final_dict' is not defined

False

In [12]:
# save final_dict
#with open('final_dict.json', 'w') as fp:
#    json.dump(final_dict, fp)
with open('final_dict.dat', 'w') as outfile:
    pickle.dump(final_dict, outfile)


In [10]:
# load final_dict
with open('final_dict.dat', 'r') as fp:
    final_dict = json.load(fp)

In [9]:
# write this to take columns IDs, create column ids elsewhere
def pert_to_cid(pert_id_here):
    column_ids = ref_dict[pert_id_here]['sig_id_gold']
    return column_ids
        
def cid_to_matrix(cid_here):
    ## extract only the specified columns from the matrix
    GCTObject = gct.GCT(path_to_gctx_file)
    GCTObject.read_gctx_matrix(cid = cid_here, rid = bing_list)
    pert_matrix = GCTObject.matrix
    return pert_matrix

def matrix_to_corr_matrix(matrix_here):
    corr_matrix = scipy.stats.spearmanr(matrix_here)
    corr_matrix_2 = corr_matrix[0]
    return corr_matrix_2

def corr_matrix_to_sig_corr_value_and_total_corr(corr_matrix):
    output_list = []
    temp_len = len(corr_matrix)
    for i in range(temp_len):
        temp_sum = corr_matrix[:,i].sum() - 1
        temp_mean = temp_sum / (temp_len - 1)
        output_list.append(temp_mean)
    total_corr = np.mean(output_list)  
    output_list /= sum(output_list)
    return output_list, total_corr

def sig_corr_value_to_expr_sig(matrix_here, sig_corr_value):
    temp_len = matrix_here.shape[1]
    matrix_here *= sig_corr_value
    matrix_here = np.sum(matrix_here, axis=1)
    return matrix_here.tolist()
    

In [10]:
# fill in final_dict with consensus signatures, correlation matrices, signature correlation values
# and total correlation values
#NEED TO FIX THIS, the TEMP_MATRIX.TOLIST() DOESN"T WORK AS INTENDED

for key in final_dict:
    temp_cid = final_dict[key]['sig_id_gold']
    if len(temp_cid) > 0:
        temp_matrix = cid_to_matrix(temp_cid)
        if temp_matrix.shape[1] > 2:
            temp_corr_matrix = matrix_to_corr_matrix(temp_matrix)
            temp_sig_corr_value , temp_total_corr = corr_matrix_to_sig_corr_value_and_total_corr(temp_corr_matrix)
            temp_cons_expr_sig = sig_corr_value_to_expr_sig(temp_matrix, temp_sig_corr_value)
        elif temp_matrix.shape[1] > 1:
            # if there are only two gold id's then each contribute equally to the temp_cons_expr_sig hence the 0.5
            temp_corr_matrix = matrix_to_corr_matrix(temp_matrix)
            temp_total_corr = temp_corr_matrix
            temp_cons_expr_sig = sig_corr_value_to_expr_sig(temp_matrix, 0.5)
            temp_sig_corr_value = "NA"
        else:
            temp_cons_expr_sig = temp_matrix.tolist()
            temp_corr_matrix = "NA"
            temp_sig_corr_value = "NA"
            temp_total_corr = "NA"
    else: 
        temp_matrix = 'NA'
        temp_cons_expr_sig = "NA"
        temp_corr_matrix = "NA"
        temp_sig_corr_value = "NA"
        temp_total_corr = "NA"
    final_dict[key]['cons_expr_sig'] = temp_cons_expr_sig
    final_dict[key]['corr_matrix'] = temp_corr_matrix
    final_dict[key]['sig_corr_value'] = temp_sig_corr_value
    final_dict[key]['total_corr'] = temp_total_corr

                                                                                

In [17]:
temper = final_dict['DB06694']['cons_expr_sig']
temper

[[-0.5714874267578125],
 [0.318004310131073],
 [1.1675957441329956],
 [-0.6283757090568542],
 [-0.7968820929527283],
 [0.1025078296661377],
 [0.04888811707496643],
 [0.3217694163322449],
 [0.8748001456260681],
 [0.37216824293136597],
 [-0.2545062303543091],
 [0.3906972408294678],
 [-0.44795113801956177],
 [-0.6795207262039185],
 [-0.4136042594909668],
 [-0.493194580078125],
 [-0.606274425983429],
 [0.36428970098495483],
 [0.15276554226875305],
 [-0.8246642351150513],
 [0.46183592081069946],
 [0.019741788506507874],
 [0.35816922783851624],
 [0.7827053070068359],
 [-0.9311938285827637],
 [0.6581257581710815],
 [0.4512097239494324],
 [-0.03604283183813095],
 [0.40300995111465454],
 [-0.92401522397995],
 [0.20891451835632324],
 [0.013060778379440308],
 [-0.03648493066430092],
 [0.2594090700149536],
 [-0.029857590794563293],
 [0.04280000925064087],
 [-0.3466441333293915],
 [0.4398200809955597],
 [0.42681869864463806],
 [-0.38612639904022217],
 [-0.002735435962677002],
 [-0.7737563252449036]

In [107]:
# create set of approved drugbank drugs, then find number of drugs in final_dict are in this 
# approved drug set
approved_db = pd.read_csv('/Users/leobrueggeman/GitHub/LINCS/drug_links.csv', sep = ',', header=0)
approved_db_set = set(x for x in approved_db['DrugBank ID'])
print(len(approved_db_set))

total_set = approved_db_set & set(final_dict)
len(total_set)

1600


899

In [None]:
# use this to create dict with all of LINCS small molecule data, make sure you don't change the ref_dict
# above to only use drug bank compounds
# Randomly subset ref_dict with new_dict

new_dict = random.sample(ref_dict.keys(), 100)
 
main_dict = {}
for key in new_dict:
    print(key)
    temp_cid = pert_to_cid(key)
    temp_matrix = cid_to_matrix(temp_cid)
    if temp_matrix.shape[1] > 1:
        temp_corr_matrix = matrix_to_corr_matrix(temp_matrix)
        temp_sig_corr_value, temp_total_corr = corr_matrix_to_sig_corr_value_and_total_corr(temp_corr_matrix)
        temp_cons_expr_sig = sig_corr_value_to_expr_sig(temp_matrix, temp_sig_corr_value)
    else:
        temp_corr_matrix = "NA"
        temp_sig_corr_value = "NA"
        temp_total_corr = "NA"
        temp_cons_expr_sig = list(temp_matrix)
    main_dict[key] = {'sig_ids':temp_cid, 'cor_matrix':temp_corr_matrix, 'cons_expr_sig':temp_cons_expr_sig, 'sig_corr':temp_sig_corr_value, 'total_corr':temp_total_corr}

BRD-K28530075
                                                                                BRD-K50186544
                                                                                BRD-K30797133
[33mGCTX_READER[m:reading matrix data

In [109]:
%matplotlib inline

In [93]:
#print(main_dict)
greater_2_list = []
lesser_2_list = []
for element in main_dict:
    temp_sig_test = main_dict[element]['expr_sig']
    temp_sig_test = np.asarray(temp_sig_test)
    print((temp_sig_test > 2).sum())
    #print(sum((temp_sig_test < -2)))


17
14
45
0
16
9
7
24
16
26
10
707
9
8
19
12
8
12
3
11
1
0
5
1155
10
44
7
17
0
41
1212
0
18
17
4
0
5
17
0
636
63
28
1339
12
33
0
1
9
21
34
7
16
12405
4
16
138
1480
10
11
9
24
14
7
9282
4
17
3
1105
1
12
484
2
2338
3
5535
3946
8548
183
6
12546
17
13
31
165
26
0
16
5
8
926
3272
1
9
1051
2
1
17
221
3
8
37
197
31
26
434
47
3
2943
2
5
12
175
1
5
4
0
98
491
3
20
32
23
5282
4
2
768
3
5
11
31
36
9
3
14
30
602
270
92
12
0
18
132
10
0
2
4
12
2
23
534
1124
12148
2
7281
395
9
2250
132
0
9
586
9
13
26
1185
243
40
15
11049
308
14698
524
15
106
59
7
31
0
1
8
11
7004
5321
9
5
5
75
952
11829
3348
16
4414
29
21
4
12
23
38
19
1101
1748
1
18
1
36
15
616
8321
23
5121
10
4310
2
11
0
6
1020
4273
28
17
31
36
1
88
4
43
13
0
1
12
1039
9
13
35
6
98
422
1
2
5324
41
191
8
5
647
14
10
9
10
9
45
98
1205
12
78
69
2
30
23
527
20
5
16
1
127
9
2
2714
13
40
5
23
39
2893
47
19
17
22
3563
12
3
10
1031
0
1
19
26
57
25
1
15
25
0
0
1
13
0
43
35
229
25
2
1848
12
11
12644
981
7
3
34
0
12
54
4
46
13
0
6702
4
5
8
7
17
17
25
19
1
28

In [1]:
#rowlist = GCTObject.get_rids()
#print(rowlist)
#len(rowlist)

['ind', 'id']
['ind', 'id']


In [2]:
#path_to_json = '/Users/leobrueggeman/GitHub/LINCS/lincs_small_molecules.json'
#json_data = json.loads(open(path_to_json).read())
#print(json_data)

In [46]:
sig_id_list = json_data[1]
temp = sig_id_list['pert_id']
print(temp)

BRD-K18814832
