# Protein Pathways

This notebook generates sets of random pathways and adjusts the ChemPert target data frame from the chempert_preprocessing notebook

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook, tqdm
from matplotlib import pyplot as plt

# Get Protein Pathway Databases

### Pathways

In [4]:
# Pathway data set
#https://github.com/pathwayforte/results/blob/master/input/gmt_files/merged_geneset_final.gmt
kegg_df = pd.read_csv('../data/Protein_pathways/kegg_data.csv', sep=',', header=None, low_memory=False)
kegg_df = kegg_df.drop([0,1], axis=1)
pathway_lengths = [] 

# Remove pathways that are too long or short
for i in tqdm(kegg_df.index.values):
    current_pathway = kegg_df.loc[i]
    current_pathway = [x for x in current_pathway if not pd.isna(x)]
    if len(current_pathway) > 300 or len(current_pathway) < 15:
        kegg_df = kegg_df.drop(i, axis=0)
    else:
        pathway_lengths.append(len(current_pathway))
kegg_df = kegg_df.dropna(axis=1, how='all')       

100%|███████████████████████████████████████| 2896/2896 [01:32<00:00, 31.46it/s]


In [5]:
# Get all genes in pathways
all_genes = kegg_df.to_numpy().flatten()
all_genes = [x for x in all_genes if not pd.isna(x)]

In [23]:
# Create dictionary
import random
random_pathways = {}

# get sets of random pathways
for i in tqdm(range(0,1000)):
    random.shuffle(all_genes)
    pathways = [all_genes[sum(pathway_lengths[:i-1]):sum(pathway_lengths[:i])] 
               for i in range(1,len(pathway_lengths))]
    pathways.append(all_genes[sum(pathway_lengths[:-1]):sum(pathway_lengths)])

    current_pathways = pd.DataFrame(pathways)
    random_pathways[i] = current_pathways

100%|███████████████████████████████████████| 1000/1000 [01:30<00:00, 11.07it/s]


# Get Chempert data

In [10]:
#Get transcriptomic data frame
transcriptomic_responses_df = pd.read_csv(
    '../data/Transcriptional_data_frames/transcriptional_response_vectors.tsv',
    sep='\t',
    index_col=0,
)
transcriptomic_responses_df.head(4)
transcriptomic_responses_df.shape

(2152, 4938)

In [11]:
#Get targets data frame
targets_df = pd.read_csv(
    '../data/target_data_frames/target_vectors_Chempert.tsv',
    sep='\t',
    index_col=0,
)
targets_df.head(4)

Unnamed: 0,AAAS,AADAC,AADACL2,AADAT,AASS,AATF,ABAT,ABCA1,ABCB1,ABCB11,...,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN5DP,ZSCAN9,ZXDA,ZXDB,ZXDC,ZZZ3
CID00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID00007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Add Protein Pathways to Target Data Frame

### Adjust targets_df based on KEGG protein pathways

In [12]:
kegg_dict = {}
for protein in tqdm(targets_df.columns):
    kegg_dict[protein] = kegg_df[kegg_df.isin([protein]).any(axis=1)].to_numpy()

100%|██████████████████████████████████████| 4938/4938 [00:47<00:00, 105.00it/s]


In [13]:
for j in tqdm_notebook(range(1000)):
    
    #Set initial variables
    current_pathways = random_pathways[j]
    new_targets_df = targets_df.copy()
    current_dict = {protein: current_pathways[current_pathways.isin([protein]).any(axis=1)].to_numpy()
                   for protein in targets_df.columns}
    
    # Get list of targeted proteins
    nonzero_vals = new_targets_df.apply(np.flatnonzero, axis=1)
    columns = targets_df.columns
    nonzero_vals = nonzero_vals.apply(lambda x: np.array(x))
    nonzero_vals = nonzero_vals.apply(lambda x: list(columns[x]))

    # Get related proteins
    related_proteins = nonzero_vals.apply(lambda x: [current_dict[protein] for protein in x])
    related_proteins = related_proteins.apply(lambda x: [related.flatten() for related in x])
    related_proteins = related_proteins.apply(lambda x: [related[~(pd.isna(related))] for related in x])
    related_proteins = related_proteins.apply(lambda x: [list(set(related)) for related in x])

    # Create new target data frame
    for drug in related_proteins.index.values:
        for i,protein in enumerate(nonzero_vals[drug]):
            gene_list = related_proteins[drug][i]
            gene_list = list(set(gene_list).intersection(targets_df.columns))
            test = np.array(new_targets_df.loc[drug, gene_list] == 0)
            gene_list = np.array(gene_list)[test]
            new_targets_df.loc[drug, gene_list] = np.ones(len(gene_list))*targets_df.loc[drug, protein] 
    
    new_targets_df.to_csv(r'../test_data/target_data_frames/pathway_permutations/kegg_pathways_' + str(j) + '.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for j in tqdm_notebook(range(1000)):


  0%|          | 0/1000 [00:00<?, ?it/s]