# Protein Pathways

This notebook uses different protein pathway databases to adjust the ChemPert target data frame produced by the chempert_preprocessing notebook.

In [2]:
#Database used to adjust the target data frame. Only the pathway cell whose
#guard matches this value does any work.
#options: 'kegg_network', 'directional', 'kegg', 'kegg_directional_combined'
VALID_DATABASES = ('kegg_network', 'directional', 'kegg', 'kegg_directional_combined')
current_database = 'kegg'

#Fail early on a typo: the pathway cells in this notebook are guarded by an
#equality test on current_database, so an unknown name would otherwise make
#them all skip without any error.
if current_database not in VALID_DATABASES:
    raise ValueError(
        "current_database must be one of " + str(VALID_DATABASES)
        + ", got " + repr(current_database)
    )

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook, tqdm
from tqdm.notebook import tqdm as notebook_tqdm

# Get Protein Pathway Databases

### Directional Protein Pathways (Inhibits/Activates)

In [4]:
#Directional relationship data set: a headerless TSV whose three columns are
#named protein_1 / relationship / protein_2 here.
#https://github.com/enveda/kgem-ensembles-in-drug-discovery/blob/e237c41e6e6c477b7013519da115800d36609c8b/data/kg/custom_kg/train.tsv
directional_df = (
    pd.read_csv('../data/Protein_pathways/train_2.txt', sep = '\t', header=None)
    .rename(columns = {0: 'protein_1', 1: 'relationship', 2: 'protein_2'})
)

#Keep only the rows where both endpoints are HGNC identifiers. na=False drops
#missing identifiers instead of failing on them, and .copy() makes the result
#an independent frame so the column assignments below never write to a slice.
is_hgnc_pair = (
    directional_df['protein_1'].str.startswith('HGNC', na=False)
    & directional_df['protein_2'].str.startswith('HGNC', na=False)
)
directional_df = directional_df.loc[is_hgnc_pair].copy()

#Rewrite the identifiers as 'hgnc:' followed by the text after the first ':'
for column in ('protein_1', 'protein_2'):
    directional_df[column] = ['hgnc:' + identifier.split(':')[1]
                              for identifier in directional_df[column]]
directional_df.head()

Unnamed: 0,protein_1,relationship,protein_2
9914,hgnc:12630,inhibits,hgnc:11998
9915,hgnc:6871,activates,hgnc:990
9916,hgnc:6367,inhibits,hgnc:3661
9917,hgnc:6973,activates,hgnc:9077
9918,hgnc:6973,activates,hgnc:14402


### KEGG pathways

In [5]:
#KEGG pathway data set: headerless CSV, one row per pathway, gene symbols in
#the columns and NaN padding at the end of shorter rows.
#https://github.com/pathwayforte/results/blob/master/input/gmt_files/merged_geneset_final.gmt
MAX_PATHWAY_SIZE = 300  #pathways with more genes than this are discarded

kegg_df = pd.read_csv('../data/Protein_pathways/kegg_data.csv', sep=',', header=None, low_memory=False)
#NOTE(review): columns 0 and 1 are presumably the pathway name/description
#fields of the GMT layout rather than genes -- confirm against the file.
kegg_df = kegg_df.drop(columns=[0, 1])
print(kegg_df.shape)

#Count the non-missing entries of every row at once and filter in one step,
#instead of dropping oversized rows one at a time (each drop copies the frame).
pathway_sizes = kegg_df.notna().sum(axis=1)
kegg_df = kegg_df.loc[pathway_sizes <= MAX_PATHWAY_SIZE]
print(kegg_df.shape)
kegg_df.head()

(2896, 2712)
(2842, 2712)


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,2704,2705,2706,2707,2708,2709,2710,2711,2712,2713
0,ALDH1A3,PDHA2,PFKM,HK1,ACSS1,GOT1,MDH1,PGAM4,PCK1,GALM,...,,,,,,,,,,
1,PDHA2,FAHD1,MDH1,IDH3B,ACO1,PCK1,NNT,IDH3G,OGDH,DLAT,...,,,,,,,,,,
2,PFKM,PRPS2,SHPK,TKT,RPE,PGM1,TKTL1,RPEL1,RBKS,PGM2,...,,,,,,,,,,
3,UGT2A3,UGT1A6,DCXR,UGT1A1,UGT1A10,RPE,DHDH,UGT1A4,AKR1B10,AKR1A1,...,,,,,,,,,,
4,PFKM,HK1,TKFC,PFKFB1,FUK,PFKFB3,GMPPB,AKR1B10,MPI,GMDS,...,,,,,,,,,,


### KEGG directional network

In [6]:
#KEGG directional relationships: keep only the edges whose source_database is
#'KEGG'. The .copy() makes the filtered rows an independent frame so the
#column assignments below never write to a slice of the original.
kegg_network = pd.read_csv('../data/Protein_pathways/Kegg_network.tsv', sep='\t')
kegg_network = kegg_network.loc[kegg_network['source_database'] == 'KEGG'].copy()

#Rewrite the identifiers as 'hgnc:' followed by the text after the first ':'
for column in ('source', 'target'):
    kegg_network[column] = ['hgnc:' + identifier.split(':')[1]
                            for identifier in kegg_network[column]]
kegg_network.head()

Unnamed: 0,source,target,relation,source_database
25514,hgnc:7835,hgnc:4195,1,KEGG
25515,hgnc:7835,hgnc:5329,1,KEGG
25516,hgnc:7835,hgnc:6081,1,KEGG
25517,hgnc:13633,hgnc:24041,1,KEGG
25518,hgnc:13633,hgnc:9232,1,KEGG


# Get HGNC Symbols for Proteins

### Map protein codes to NCBI codes

In [7]:
#Load the first lookup table (headerless TSV): column 0 is named 'code' and
#column 1 is named 'NCBI'
map_1_df = pd.read_csv('../data/Protein_mappings/proteins_uniprot.tsv', sep='\t', header=None)
map_1_df = map_1_df.rename(columns={0: 'code', 1: 'NCBI'})
map_1_df.head()  #not displayed: only a cell's last expression is rendered

#Build the code -> NCBI dictionary; if a code is repeated, the last row wins
map_1_keys = list(map_1_df['code'])
map_1_values = list(map_1_df['NCBI'])
map_1_dict = dict(zip(map_1_keys, map_1_values))

### Map NCBI codes to HGNC symbols

In [8]:
#Load the second lookup table (headerless TSV): column 0 is named 'NCBI' and
#column 2 is named 'HGNC'
map_2_df = pd.read_csv('../data/Protein_mappings/proteins.tsv', sep='\t', header=None)
map_2_df = map_2_df.rename(columns={0: 'NCBI', 2: 'HGNC'})
map_2_df.head()  #not displayed: only a cell's last expression is rendered

#Build the NCBI -> HGNC dictionary; if an NCBI code is repeated, the last row wins
map_2_keys = list(map_2_df['NCBI'])
map_2_values = list(map_2_df['HGNC'])
map_2_dict = dict(zip(map_2_keys, map_2_values))

### Map all proteins in the databases to HGNC symbols

In [9]:
#Create function that maps the protein to HGNC symbols
def dict_func(x):
    """Translate a protein identifier through the two lookup dictionaries.

    Parameters
    ----------
    x : hashable
        Key looked up in ``map_1_dict``. The value found there is then used
        as the key into ``map_2_dict``.

    Returns
    -------
    object
        The ``map_2_dict`` entry for ``x``, or the string ``'None'`` when
        either lookup fails.
    """
    try:
        return map_2_dict[map_1_dict[x]]
    except (KeyError, TypeError):
        #KeyError: the identifier is missing from one of the two maps.
        #TypeError: the key is unhashable. Anything else (for example an
        #interrupt or an undefined map) is allowed to propagate.
        return 'None'

#Translate both ends of every directional relationship into HGNC symbols,
#stored in the new 'sources' / 'targets' columns
directional_df['sources'] = directional_df['protein_1'].map(dict_func)
directional_df['targets'] = directional_df['protein_2'].map(dict_func)
unmapped_directional = list(directional_df['sources']).count('None')
print("Number of Unmapped proteins from directional_df: " + str(unmapped_directional))
print("Number of proteins in directional_df: " + str(directional_df.shape[0]))
#Preview of unmapped rows (not displayed: this is not the cell's last expression)
directional_df.loc[directional_df['sources'] == 'None'].iloc[:10, [0, 3]]

#Translate the KEGG network endpoints into HGNC symbols, overwriting the
#'source' / 'target' columns.
#NOTE(review): because the columns are overwritten, re-running this cell
#without reloading kegg_network feeds the symbols back into dict_func.
kegg_network['source'] = kegg_network['source'].map(dict_func)
kegg_network['target'] = kegg_network['target'].map(dict_func)
unmapped_kegg = list(kegg_network['source']).count('None')
print("Number of Unmapped proteins from kegg network: " + str(unmapped_kegg))
print("Number of proteins in kegg network: " + str(kegg_network.shape[0]))

Number of Unmapped proteins from directional_df: 50
Number of proteins in directional_df: 50764
Number of Unmapped proteins from kegg network: 0
Number of proteins in kegg network: 27047


# Get ChemPert data

In [10]:
#Load the transcriptomic response vectors; the first TSV column becomes the row index
transcriptomic_path = '../data/Transcriptional_data_frames/transcriptional_response_vectors_original.tsv'
transcriptomic_responses_df = pd.read_csv(transcriptomic_path, sep='\t', index_col=0)
transcriptomic_responses_df.head(4)  #not displayed: only the last expression is rendered
transcriptomic_responses_df.shape

(2152, 4939)

In [11]:
#Load the ChemPert target vectors; the first TSV column becomes the row index
#and the remaining columns are gene symbols
targets_path = '../data/target_data_frames/target_vectors_Chempert.tsv'
targets_df = pd.read_csv(targets_path, sep='\t', index_col=0)
targets_df.head(4)

Unnamed: 0,AAAS,AADAC,AADACL2,AADAT,AASS,AATF,ABAT,ABCA1,ABCB1,ABCB11,...,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN5DP,ZSCAN9,ZXDA,ZXDB,ZXDC,ZZZ3
CID00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Add Protein Pathways to Target Data Frame

### Adjust targets_df based on KEGG protein pathways

In [11]:
if current_database == 'kegg':
    #Expand every drug's target vector with the other members of each KEGG
    #pathway that contains one of its targets. Only entries that are still 0
    #for that drug are filled, so existing non-zero values are never overwritten.
    new_targets_df = targets_df.copy()
    known_proteins = set(targets_df.columns)

    #Pathway membership depends only on the protein, not on the drug, so the
    #full scan of kegg_df is done once per distinct target protein and reused.
    pathway_members = {}

    for drug in notebook_tqdm(targets_df.index.values):
        #Column positions of the non-zero entries of this drug's target vector
        target_positions = np.nonzero(np.array(targets_df.loc[drug]))[0]

        for position in target_positions:
            protein = targets_df.columns[position]

            if protein not in pathway_members:
                pathway_rows = kegg_df[kegg_df.isin([protein]).any(axis=1)]
                if pathway_rows.empty:
                    pathway_members[protein] = []
                else:
                    members = pathway_rows.to_numpy().flatten()
                    members = members[~pd.isna(members)]
                    #Keep only genes that are columns of the target data frame
                    pathway_members[protein] = [member for member in np.unique(members)
                                                if member in known_proteins]

            #Give the target's value to the pathway members not yet set for this drug
            unset_members = [member for member in pathway_members[protein]
                             if new_targets_df.loc[drug, member] == 0]
            if unset_members:
                new_targets_df.loc[drug, unset_members] = targets_df.loc[drug, protein]

    new_targets_df.to_csv(r'../data/target_data_frames/target_vectors_KEGG.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for drug in tqdm_notebook(targets_df.index.values):


  0%|          | 0/2152 [00:00<?, ?it/s]

### Get protein relationships from combined kegg pathways and directional pathways

In [12]:
if current_database == 'kegg_directional_combined':
    #Create new combined dataframe: every cell holds [gene, score], where the
    #score starts at 1 for a gene and 0 for an empty (NaN) slot
    combined = pd.DataFrame(index = kegg_df.index.values, columns = kegg_df.columns)
    for col in tqdm(kegg_df.columns):
        combined[col] = kegg_df[col].copy().map(lambda x: [x,1] if not pd.isna(x) else [x,0])

    #Add directional protein relationships to KEGG pathways: when one pathway
    #contains both ends of an 'inhibits' relationship, the score of the
    #inhibited gene in that pathway becomes -1
    for relationship in notebook_tqdm(directional_df.index.values):
        direction = directional_df.loc[relationship,'relationship']
        if direction == 'inhibits':
            #The HGNC symbols of the two endpoints are held in the
            #'sources' / 'targets' columns of directional_df
            protein_1 = directional_df.loc[relationship,'sources']
            protein_2 = directional_df.loc[relationship,'targets']
            current_kegg_rows = kegg_df[kegg_df.isin([protein_1]).any(axis=1)]
            current_kegg_rows = current_kegg_rows[current_kegg_rows.isin([protein_2]).any(axis=1)]
            for pathway_index in current_kegg_rows.index.values:
                combined.loc[pathway_index] = combined.loc[pathway_index].map(
                    lambda x: [x[0],-1] if (x[0] == protein_2) else x)

    #Set initial variables
    new_targets_df = targets_df.copy()

    #Adjust target vectors based on combined kegg and directional databases
    for drug in notebook_tqdm(targets_df.index.values):
        drug_array = np.nonzero(np.array(targets_df.loc[drug]))[0]

        #Get drug targets
        for target_position in drug_array:
            protein = targets_df.columns[target_position]
            protein_rows = kegg_df[kegg_df.isin([protein]).any(axis=1)]

            #Get related proteins and their protein scores (+/- 1)
            for pathway_index in protein_rows.index.values:
                related_proteins = np.array(protein_rows.loc[pathway_index])
                related_proteins = related_proteins[~(pd.isna(related_proteins))]
                #Dropping the 0 scores (the NaN slots) leaves one score per
                #remaining gene, in the same order as related_proteins
                protein_scores = combined.loc[pathway_index].map(lambda x: x[1])
                protein_scores = list(filter(lambda x: x != 0, protein_scores))

                #Keep only genes that are target columns and still 0 for this drug
                if list(related_proteins):
                    protein_scores = [protein_scores[i] for i in range(len(protein_scores))
                                      if (related_proteins[i] in targets_df.columns and
                                          new_targets_df.loc[drug, related_proteins[i]] == 0)]
                    related_proteins = [x for x in related_proteins if (x in targets_df.columns
                                                                        and new_targets_df.loc[drug, x] == 0)]

                #Adjust target data frame based on combined KEGG and directional pathways;
                #a target value of -1 flips the sign of every propagated score
                if targets_df.loc[drug, protein] == 1:
                    new_targets_df.loc[drug, related_proteins] = [protein_scores[i] for i in
                                                                  range(len(related_proteins))]
                elif targets_df.loc[drug, protein] == -1:
                    new_targets_df.loc[drug, related_proteins] = [-protein_scores[i] for i in
                                                                  range(len(related_proteins))]

    new_targets_df.to_csv(r'../data/target_data_frames/target_vectors_KEGG_directional_combined.csv')

### Get directional protein pathways (apply this multiple times)

In [13]:
if current_database == 'directional':
    #Set initial variables
    sources = list(directional_df['sources'])
    targets = list(directional_df['targets'])
    relationships = list(directional_df['relationship'])
    correlation = True  #not read anywhere in this cell
    new_targets_df = targets_df.copy()

    #Set number of levels to repeatedly apply the directional database
    num_levels = 1

    #Row positions of directional_df grouped by source symbol, built once so
    #each protein lookup is a dictionary access instead of a scan of `sources`
    positions_by_source = {}
    for position, source in enumerate(sources):
        positions_by_source.setdefault(source, []).append(position)

    #Add protein pathways from directional pathways. Num_levels is the number of times to repeat this process
    for level in range(num_levels):
        known_proteins = set(targets_df.columns)

        for protein in notebook_tqdm(targets_df.columns):
            if protein not in positions_by_source:
                continue

            #Row positions of the drugs with a non-zero value for this protein
            drug_positions = np.nonzero(np.array(targets_df[protein]))[0]

            #Downstream proteins (with the relationship type) that were mapped
            #to a symbol and exist as columns of the target data frame
            related = [(targets[position], relationships[position])
                       for position in positions_by_source[protein]
                       if targets[position] != 'None' and targets[position] in known_proteins]
            if not (len(drug_positions) and related):
                continue
            related_names = [name for name, _ in related]

            #Change protein values in the target data frame: 'activates'
            #copies the value, any other relationship writes its negation.
            #Unlike the 'kegg' expansion, this overwrites existing values.
            for drug_position in drug_positions:
                drug = targets_df.index.values[drug_position]
                protein_array_value = targets_df.loc[drug, protein]
                new_targets_df.loc[drug, related_names] = [
                    protein_array_value if relation == 'activates' else -protein_array_value
                    for _, relation in related
                ]

        #The result of this level is the input of the next one and is saved per level
        targets_df = new_targets_df.copy()
        targets_df.to_csv(r'../data/target_data_frames/target_vectors_direction_repeated_' \
                          + str(level+1) + '_times.csv')


### Get KEGG network protein pathways (apply this multiple times) 

In [14]:
if current_database == 'kegg_network':
    #Set initial variables
    sources = list(kegg_network['source'])
    targets = list(kegg_network['target'])
    #The edge sign is read from the 'relation' column by name: positional
    #column 1 of kegg_network is 'target', which never equals 1
    relations = list(kegg_network['relation'])
    correlation = True  #not read anywhere in this cell
    new_targets_df = targets_df.copy()

    #Set number of levels to repeatedly apply the kegg network database
    num_levels = 5

    #Row positions of kegg_network grouped by source symbol, built once so
    #each protein lookup is a dictionary access instead of a scan of `sources`
    positions_by_source = {}
    for position, source in enumerate(sources):
        positions_by_source.setdefault(source, []).append(position)

    #Add protein pathways from the KEGG network. Num_levels is the number of times to repeat this process
    for level in range(num_levels):
        known_proteins = set(targets_df.columns)

        for protein in notebook_tqdm(targets_df.columns):
            if protein not in positions_by_source:
                continue

            #Get the drugs that target this protein and the proteins that
            #this protein points to in the network
            drug_positions = np.nonzero(np.array(targets_df[protein]))[0]
            related = [(targets[position], relations[position])
                       for position in positions_by_source[protein]
                       if targets[position] != 'None' and targets[position] in known_proteins]
            if not (len(drug_positions) and related):
                continue
            related_names = [name for name, _ in related]

            #Change values of neighboring proteins in target data frame:
            #relation 1 copies the value, any other relation writes its negation.
            #Unlike the 'kegg' expansion, this overwrites existing values.
            for drug_position in drug_positions:
                drug = targets_df.index.values[drug_position]
                protein_array_value = targets_df.loc[drug, protein]
                new_targets_df.loc[drug, related_names] = [
                    protein_array_value if relation == 1 else -protein_array_value
                    for _, relation in related
                ]

        #The result of this level is the input of the next one and is saved per level
        targets_df = new_targets_df.copy()
        targets_df.to_csv(r'../data/target_data_frames/target_vectors_KEGG_direction_repeated_' \
                          + str(level+1) + '_times.csv')
