In [1]:
from viz import render_bel_with_igraph
import pandas as pd
import numpy as np
from tqdm import tqdm

# Get target and transcriptomic vectors

In [2]:
transcriptomic_responses_df = pd.read_csv(
    '../data/Transcriptional_data_frames/transcriptional_response_vectors.tsv',
    sep='\t',
    index_col=0,
)
transcriptomic_responses_df.head(4)
transcriptomic_responses_df.shape


(2152, 4938)

In [3]:
transcriptomic_responses_df.head(1)

Unnamed: 0,AAAS,AADAC,AADACL2,AADAT,AASS,AATF,ABAT,ABCA1,ABCB1,ABCB11,...,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN5DP,ZSCAN9,ZXDA,ZXDB,ZXDC,ZZZ3
CID00001,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,-1,0,0,0,-1


In [4]:
targets_df = pd.read_csv(
    '../data/target_data_frames/target_vectors_Chempert.tsv',
    sep='\t',
    index_col=0,
)
targets_df.head()

Unnamed: 0,AAAS,AADAC,AADACL2,AADAT,AASS,AATF,ABAT,ABCA1,ABCB1,ABCB11,...,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN5DP,ZSCAN9,ZXDA,ZXDB,ZXDC,ZZZ3
CID00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Get DEGs for each compound

In [5]:
compound_1 = 'CID00374'
compound_2 = 'CID01756'

In [6]:
transcript_1 = transcriptomic_responses_df.loc[compound_1]
transcript_2 = transcriptomic_responses_df.loc[compound_2]
target_1 = targets_df.loc[compound_1]
target_2 = targets_df.loc[compound_2]

In [7]:
all_genes = set(np.nonzero(list(transcript_1))[0]).union(set(np.nonzero(list(transcript_2))[0]))
all_genes = set(all_genes).union(np.nonzero(list(target_1))[0])
all_genes = set(all_genes).union(np.nonzero(list(target_2))[0])
all_gene_names = [transcriptomic_responses_df.columns[i] for i in all_genes]

# Create dictionary of gene categories

In [8]:
node_categories = {}

for gene in tqdm(all_genes):
    gene_names = transcriptomic_responses_df.columns
    if target_1[gene] != 0 or target_2[gene] != 0:
        node_categories[gene_names[gene]] = 'target'
    elif transcript_1[gene] != 0 and transcript_2[gene] != 0:
        if transcript_1[gene] == transcript_2[gene]:
            node_categories[gene_names[gene]] = 'matching'
        else:
            node_categories[gene_names[gene]] = 'non_matching'
    elif transcript_1[gene] != 0:
            node_categories[gene_names[gene]] = 'present_in_1_but_not_in_2'
    elif transcript_2[gene] != 0:
            node_categories[gene_names[gene]] = 'present_in_2_but_not_in_1'

100%|██████████| 829/829 [00:00<00:00, 61207.54it/s]


# Create data frame that only includes our DEGs

In [9]:
df = pd.read_csv('../data/Protein_pathways/Kegg_network.tsv', sep='\t')

df = df.loc[df['source_database'].apply(lambda x: x == 'KEGG')]
df['source'] = ['hgnc:' + x.split(':')[1] for x in df['source']]
df['target'] = ['hgnc:' + x.split(':')[1] for x in df['target']]

#Get 1st map to map hgnc numbers to NCBI Codes
map_1_df = pd.read_csv('../data/Protein_mappings/proteins_uniprot.tsv', sep='\t', header=None)
map_1_df = map_1_df.rename({0: 'code', 1:'NCBI'}, axis=1)
map_1_keys = list(map_1_df['code'])
map_1_values = list(map_1_df['NCBI'])
map_1_dict = {map_1_keys[i]: map_1_values[i] for i in range(len(map_1_keys))}

#Get 2nd map to map NCBI codes to HGNC symbols
map_2_df = pd.read_csv('../data/Protein_mappings/proteins.tsv', sep='\t', header=None)
map_2_df = map_2_df.rename({0: 'NCBI', 2:'HGNC'}, axis=1)
map_2_keys = list(map_2_df['NCBI'])
map_2_values = list(map_2_df['HGNC'])
map_2_dict = {map_2_keys[i]: map_2_values[i] for i in range(len(map_2_keys))}

def dict_func(x):
    try:
        result = map_2_dict[map_1_dict[x]]
    except:
        result = 'None'
    return result

#Convert directional relationships table to HGNC symbols
df['source'] = df['source'].map(lambda x: dict_func(x))
df['target'] = df['target'].map(lambda x: dict_func(x))
df.shape

(27047, 4)

In [10]:
for index in tqdm(df.index.values):
    if df.loc[index,'source'] not in all_gene_names or df.loc[index,'target'] not in all_gene_names:
        df = df.drop(index, axis=0)

100%|██████████| 27047/27047 [00:34<00:00, 776.38it/s] 


In [11]:
render_bel_with_igraph('../figures/figure_5.png', node_categories, all_gene_names, df)