# Protein Pathways

This notebook uses random protein networks to adjust the ChemPert target data frame from the chempert_preprocessing notebook

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook, tqdm
from matplotlib import pyplot as plt

# Get original directional network

In [4]:
#Original directional network
kegg_network = pd.read_csv('../data/Protein_pathways/Kegg_network.tsv', sep='\t')
kegg_network = kegg_network.loc[kegg_network['source_database'].apply(lambda x: x == 'KEGG')]
kegg_network = kegg_network.drop('source_database', axis=1)
kegg_network['source'] = ['hgnc:' + x.split(':')[1] for x in kegg_network['source']]
kegg_network['target'] = ['hgnc:' + x.split(':')[1] for x in kegg_network['target']]

# Get HGNC Symbols for Genes

### Map protein codes to NCBI codes

In [5]:
#Get 1st map to map proteins to NCBI Codes
map_1_df = pd.read_csv('../data/Protein_mappings/proteins_uniprot.tsv', sep='\t', header=None)
map_1_df = map_1_df.rename({0: 'code', 1:'NCBI'}, axis=1)
map_1_df.head()

#Create dictionary mapping protein codes to NCBI codes
map_1_keys = list(map_1_df['code'])
map_1_values = list(map_1_df['NCBI'])
map_1_dict = {map_1_keys[i]: map_1_values[i] for i in range(len(map_1_keys))}

### Map NCBI codes to HGNC symbols

In [6]:
#Get 2nd map to map proteins to HGNC symbols
map_2_df = pd.read_csv('../data/Protein_mappings/proteins.tsv', sep='\t', header=None)
map_2_df = map_2_df.rename({0: 'NCBI', 2:'HGNC'}, axis=1)
map_2_df.head()

#Create dictionary mapping NCBI codes to HGNC codes
map_2_keys = list(map_2_df['NCBI'])
map_2_values = list(map_2_df['HGNC'])
map_2_dict = {map_2_keys[i]: map_2_values[i] for i in range(len(map_2_keys))}

### Map all drugs in databases to HGNC codes

In [7]:
#Create function that maps the protein to HGNC symbols
def dict_func(x):
    try:
        result = map_2_dict[map_1_dict[x]]
    except:
        result = 'None'
    return result

#Convert KEGG directional database to HGNC symbols
kegg_network['source'] = kegg_network['source'].map(lambda x: dict_func(x))
kegg_network['target'] = kegg_network['target'].map(lambda x: dict_func(x))
print("Number of Unmapped proteins from kegg network: " + str(list(kegg_network['source']).count('None')))
print("Number of proteins in kegg network: " + str(kegg_network.shape[0]))

Number of Unmapped proteins from kegg network: 0
Number of proteins in kegg network: 27047


# Get Chempert data

In [9]:
#Get transcriptomic data frame
transcriptomic_responses_df = pd.read_csv(
    '../data/Transcriptional_data_frames/transcriptional_response_vectors.tsv',
    sep='\t',
    index_col=0,
)

In [8]:
#Get targets data frame
targets_df = pd.read_csv(
    '../data/target_data_frames/target_vectors_Chempert.tsv',
    sep='\t',
    index_col=0,
)

# Adjust the target data frames with the random networks 

In [208]:
for j in tqdm_notebook(range(1,101)):
    
    #Get targets data frame
    targets_df = pd.read_csv(
        '../data/target_data_frames/target_vectors_Chempert.tsv',
        sep='\t',
        index_col=0,
    )
    
    # Get random network
    current_network = pd.read_csv('../shuffled/random_' + str(j) + '.tsv', sep='\t')
    current_network['source'] = current_network['source'].map(lambda x: dict_func(x))
    current_network['target'] = current_network['target'].map(lambda x: dict_func(x))
    current_network = pd.merge(current_network,kegg_network, indicator=True, how='outer')\
                                 .query('_merge=="left_only"')\
                                 .drop('_merge', axis=1)
    current_network = current_network.dropna()
    
    #Set initial variables
    sources = list(current_network['source'])
    targets = list(current_network['target'])
    relations = np.array(current_network['relation'])
    protein_relationship_indexes = {protein: [i for i,x in enumerate(sources) if x==protein] 
                                        for protein in targets_df.columns}
    correlation = True
    new_targets_df = targets_df.copy()
    
    #Set number of levels to repeatedly apply the kegg network database
    num_levels = 3

    #Add protein pathways from directional pathways. Num_levels is the number of times to repeat this process
    for i in range(num_levels):
        drugs = np.array(targets_df.index.values)
        columns = targets_df.columns
        columns = [x for x in columns if x in sources]
        
        # Get index of drugs that target each protein
        protein_array = targets_df.apply(np.flatnonzero, axis=0)
        
        # Get proteins that are related to each other
        related_proteins = {protein: np.array([[targets[i], i] for i in protein_relationship_indexes[protein]
                                     if targets[i] in targets_df.columns]) for protein in columns}
        
        # Remove proteins that are not in the dataset or that are related to any other proteins
        proteins_to_remove = [x for x in protein_array.index.values if x not in columns or
                              len(related_proteins[x]) == 0]
        protein_array = protein_array.drop(proteins_to_remove, axis=0)
        
        # Remove proteins that aren't targeted by any drugs
        protein_array = protein_array[protein_array.apply(lambda x: len(x) > 0)]
        
        # Get the relation for each pair of related proteins
        protein_relationships = {protein: relations[np.uint64(related_proteins[protein][:,1])] 
                                 for protein in protein_array.index.values}
        
        # Get drug names for drugs targeting proteins
        drug_names = [drugs[x] for x in protein_array.values]
        drug_names = pd.Series(drug_names, index=protein_array.index.values)
        
        # Determine whether each target protein is upregulated or downregulated
        protein_array_values = [[targets_df.loc[drug, protein] for drug in drug_names[protein]] 
                                for protein in drug_names.index.values]
        protein_array_values = pd.Series(protein_array_values, index=protein_array.index.values)
        
        # Update target dataframe
        for protein in drug_names.index.values:
            current_related = related_proteins[protein][:,0]
            current_values = protein_array_values[protein]
            new_targets_df.loc[drug_names[protein], current_related] \
            = (np.tile(np.array(current_values).reshape((len(current_values),1)), (1, len(current_related)))\
                * protein_relationships[protein])
        
        # Save target data frames
        targets_df = new_targets_df.copy()
        targets_df.to_csv(r'../data/target_data_frames/random_networks/random_' + str(j) + '.csv')
        
        
        
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for j in tqdm_notebook(range(1,101)):


  0%|          | 0/100 [00:00<?, ?it/s]