In [1]:
### Generate a map of GO terms for each gene
def GO_map(GO_slim_map, PPI_all, GO_group):
    """Map every gene covered by PPiSeq to its GO-slim terms for one GO group.

    Parameters
    ----------
    GO_slim_map : str
        Path to a tab-delimited GO-slim mapping file. Column 1 is read as the
        gene, column 4 is compared with ``GO_group`` and column 5 is read as
        the GO term.
    PPI_all : str
        Path to a comma-delimited file with one header row whose first column
        holds protein pairs written as ``geneA_geneB``.
    GO_group : str
        Value of column 4 to keep (the notebook passes "C", "P" or "F").

    Returns
    -------
    dict
        ``{gene: [GO term, ...]}`` -- the key is the gene and the value is the
        list of its GO terms, in the order they appear in the mapping file.
    """
    # First column of every row; the first row is the header and is dropped.
    with open(PPI_all, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    # All genes covered by PPiSeq, as a set so the membership test below is O(1).
    covered_genes = set()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row: no protein pair to record
        covered_genes.update(proteins[:2])

    # GO terms that carry no information (aspect roots and placeholders).
    bad_GO = {'cellular_component', 'biological_process', 'molecular_function',
              'not_yet_annotated', 'other'}

    gene_GO_dict = {}
    with open(GO_slim_map, 'r') as mapping_file:
        for line in mapping_file:
            fields = line.strip().split("\t")
            if len(fields) < 5:
                continue  # row too short to hold gene, group and term
            gene, group, term = fields[0], fields[3], fields[4]
            # Keep rows of the requested group, with a meaningful term,
            # for genes that PPiSeq covers.
            if group == GO_group and term not in bad_GO and gene in covered_genes:
                gene_GO_dict.setdefault(gene, []).append(term)
    return gene_GO_dict

In [2]:
### Count protein-protein pairs for each GO_GO pair
def PPiSeq_network_all_count(PPI_all, gene_GO_dict, output_file_1):
    """Count protein-protein pairs for every ordered GO_GO pair and write a table.

    Parameters
    ----------
    PPI_all : str
        Path to a comma-delimited file with one header row whose first column
        holds protein pairs written as ``proteinA_proteinB``.
    gene_GO_dict : dict
        ``{gene: [GO term, ...]}``; pairs with a protein missing from this
        mapping are ignored.
    output_file_1 : str
        Path of the tab-delimited output (header ``GO_1  GO_2  count``).

    Notes
    -----
    For each protein pair both orientations of every GO term combination are
    counted, so (x, y) and (y, x) get the same count, while a combination that
    is its own mirror image is counted only once per protein pair.
    """
    import itertools
    from collections import Counter

    # First column of every row; the first row is the header and is dropped.
    with open(PPI_all, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    GO_GO_count = Counter()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        pro_a, pro_b = proteins[0], proteins[1]
        if pro_a not in gene_GO_dict or pro_b not in gene_GO_dict:
            continue
        GO_A = gene_GO_dict[pro_a]
        GO_B = gene_GO_dict[pro_b]
        GO_B_A = list(itertools.product(GO_B, GO_A))
        seen_B_A = set(GO_B_A)
        # Drop the A->B combinations that B->A already provides, so a
        # combination present in both directions is not counted twice.
        GO_A_B_only = [go_pair for go_pair in itertools.product(GO_A, GO_B)
                       if go_pair not in seen_B_A]
        GO_GO_count.update(GO_A_B_only)
        GO_GO_count.update(GO_B_A)

    # Write GO_term_1, GO_term_2 and the count, in first-seen order.
    with open(output_file_1, 'w') as out:
        out.write('\t'.join(['GO_1', 'GO_2', 'count']) + '\n')
        out.writelines('\t'.join([go_1, go_2, str(count)]) + '\n'
                       for (go_1, go_2), count in GO_GO_count.items())

In [3]:
### Count PPI number for each GO_GO pair
def PPiSeq_network_pos_count(PPI_pos, gene_GO_dict, output_file_2, has_header=None):
    """Count PPIs for every ordered GO_GO pair and write a table.

    Parameters
    ----------
    PPI_pos : str
        Path to a comma-delimited file whose first column holds PPIs written
        as ``proteinA_proteinB``.
    gene_GO_dict : dict
        ``{gene: [GO term, ...]}``; PPIs with a protein missing from this
        mapping are ignored.
    output_file_2 : str
        Path of the tab-delimited output (header ``GO_1  GO_2  count``).
    has_header : bool or None, optional
        ``True`` drops the first row, ``False`` keeps it. With the default
        ``None`` the first row is dropped only when its first field contains
        no "_" (i.e. it cannot be a protein pair). This keeps the first PPI of
        header-less files such as the random networks written by
        ``PPiSeq_random_network``, which an unconditional drop would lose.
        A header that does contain "_" is kept but is normally ignored because
        its two halves are not genes in ``gene_GO_dict``.

    Notes
    -----
    For each PPI both orientations of every GO term combination are counted;
    a combination that is its own mirror image is counted once per PPI.
    """
    import itertools
    from collections import Counter

    with open(PPI_pos, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file]
    if has_header is None:
        has_header = bool(pairs) and "_" not in pairs[0]
    if has_header:
        pairs = pairs[1:]

    # Put the positive PPIs into a GO_GO counter.
    GO_GO_pos_count = Counter()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        pos_a, pos_b = proteins[0], proteins[1]
        if pos_a not in gene_GO_dict or pos_b not in gene_GO_dict:
            continue
        GO_pos_A = gene_GO_dict[pos_a]
        GO_pos_B = gene_GO_dict[pos_b]
        GO_pos_B_A = list(itertools.product(GO_pos_B, GO_pos_A))
        seen_B_A = set(GO_pos_B_A)
        # Drop the A->B combinations that B->A already provides.
        GO_pos_A_B_only = [go_pair for go_pair in itertools.product(GO_pos_A, GO_pos_B)
                           if go_pair not in seen_B_A]
        GO_GO_pos_count.update(GO_pos_A_B_only)
        GO_GO_pos_count.update(GO_pos_B_A)

    with open(output_file_2, 'w') as out:
        out.write('\t'.join(['GO_1', 'GO_2', 'count']) + '\n')
        out.writelines('\t'.join([go_1, go_2, str(count)]) + '\n'
                       for (go_1, go_2), count in GO_GO_pos_count.items())

In [4]:
## Here for PPI_all, I should use deduplicated PPPs without merging the two SD replicates.
## Merged SD gave us better power to detect PPIs, but for a new environment we will not get replicate data.
## Therefore, our search space for PPIs is all PPIs that have at least 2 replicates in any of the 9 environments.
## To make a fair comparison, I should only keep one orientation for each PPP.
## That is, protein A - protein B is the same as protein B - protein A.
# Inputs and outputs for the GO_group "C" run (output files are tagged CC).
GO_slim_map = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Outside_datasets/GO_term_files/go_slim_mapping_tab_20190405.txt"
GO_group = "C"
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
PPI_all = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/All_PPP_for_GO_GO_map.csv"
output_file_1 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_all_count_PPI_CC_new.txt"
output_file_2 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_pos_count_PPI_CC_new.txt"

In [5]:
# Build the gene -> GO term map for the selected GO_group, then tabulate the
# GO_GO counts for all tested pairs (output_file_1) and for the positive PPIs
# (output_file_2).
gene_GO_dict = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group=GO_group)
PPiSeq_network_all_count(PPI_all=PPI_all, gene_GO_dict=gene_GO_dict, output_file_1=output_file_1)
PPiSeq_network_pos_count(PPI_pos=PPI_pos, gene_GO_dict=gene_GO_dict, output_file_2=output_file_2)

In [6]:
# Inputs and outputs for the GO_group "P" run (output files are tagged BP).
GO_slim_map = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Outside_datasets/GO_term_files/go_slim_mapping_tab_20190405.txt"
GO_group = "P"
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
PPI_all = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/All_PPP_for_GO_GO_map.csv"
output_file_1 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_all_count_PPI_BP_new.txt"
output_file_2 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_pos_count_PPI_BP_new.txt"

In [7]:
# Build the gene -> GO term map for the selected GO_group, then tabulate the
# GO_GO counts for all tested pairs (output_file_1) and for the positive PPIs
# (output_file_2).
gene_GO_dict = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group=GO_group)
PPiSeq_network_all_count(PPI_all=PPI_all, gene_GO_dict=gene_GO_dict, output_file_1=output_file_1)
PPiSeq_network_pos_count(PPI_pos=PPI_pos, gene_GO_dict=gene_GO_dict, output_file_2=output_file_2)

In [8]:
# Inputs and outputs for the GO_group "F" run (output files are tagged MF).
GO_slim_map = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Outside_datasets/GO_term_files/go_slim_mapping_tab_20190405.txt"
GO_group = "F"
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
PPI_all = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/All_PPP_for_GO_GO_map.csv"
output_file_1 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_all_count_PPI_MF_new.txt"
output_file_2 = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/Network_pos_count_PPI_MF_new.txt"

In [9]:
# Build the gene -> GO term map for the selected GO_group, then tabulate the
# GO_GO counts for all tested pairs (output_file_1) and for the positive PPIs
# (output_file_2).
gene_GO_dict = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group=GO_group)
PPiSeq_network_all_count(PPI_all=PPI_all, gene_GO_dict=gene_GO_dict, output_file_1=output_file_1)
PPiSeq_network_pos_count(PPI_pos=PPI_pos, gene_GO_dict=gene_GO_dict, output_file_2=output_file_2)

#### Create 1000 random networks that preserve the degree of each protein in the PPI network

In [10]:
## Create similar network with the degree distribution of network of a specific environment
### Define a function to write a random network with the same number of positive interactions
### based on the gene list and positive PPIs in the PPiseq data (all_environments)
### The idea will be replace all one protein with the other protein, keep the distribution of degree
def PPiSeq_random_network(PPiSeq_all_PPI, PPI_pos, number_random, output_file):
    """Write random networks that keep the degree distribution of the positive PPIs.

    Every gene is swapped for another gene through one random one-to-one
    relabeling per network, so the wiring (and therefore every degree) is kept
    while the identity of the proteins is shuffled.

    Parameters
    ----------
    PPiSeq_all_PPI : str
        Path to a comma-delimited file with one header row whose first column
        holds all covered protein pairs written as ``proteinA_proteinB``.
    PPI_pos : str
        Path to a comma-delimited file with one header row whose first column
        holds the positive PPIs written as ``proteinA_proteinB``.
    number_random : int
        Number of random networks to write.
    output_file : str
        Path prefix; network ``k`` (1-based) is written to
        ``output_file + str(k) + ".csv"``, one ``proteinA_proteinB`` per line
        and without a header row.

    Notes
    -----
    Names containing "HO" (control / fragment strains) are excluded from the
    shuffle. A protein of a positive PPI that is not in the shuffled gene list
    keeps its own name. The gene list is sorted so that a fixed
    ``random.seed`` reproduces the same networks.
    """
    import random

    # First column of every row; the first row is the header and is dropped.
    with open(PPiSeq_all_PPI, 'r') as ppi_file:
        all_pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    # All genes covered by PPiSeq.
    covered = set()
    for pair in all_pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        covered.update(proteins[:2])
    # Remove control genes and ORFxDHFR fragment strains. A plain substring
    # test is used because a Python 3 `filter` object is a one-shot iterator:
    # testing membership against it removes nothing after the first lookup.
    PPiSeq_gene = sorted(gene for gene in covered if 'HO' not in gene)

    with open(PPI_pos, 'r') as pos_file:
        pos_pairs = [line.strip().split(",")[0] for line in pos_file][1:]
    positive = []
    for pair in pos_pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        positive.append((proteins[0], proteins[1]))

    for m in range(number_random):
        # One random one-to-one relabeling of the gene list per network.
        all_reorder = random.sample(PPiSeq_gene, len(PPiSeq_gene))
        relabel = dict(zip(PPiSeq_gene, all_reorder))
        sample_PPI = [relabel.get(protein_a, protein_a) + "_" + relabel.get(protein_b, protein_b)
                      for protein_a, protein_b in positive]
        with open(output_file + str(m + 1) + ".csv", 'w') as output:
            output.writelines(ppi + '\n' for ppi in sample_PPI)

In [11]:
# Inputs: all covered protein pairs and the positive PPIs.
PPiSeq_all_PPI = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/All_PPP_for_GO_GO_map.csv"
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
# Number of random networks and the path prefix they are written under.
number_random = 1000
output_file = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/random_network/random_network_all_"
PPiSeq_random_network(
    PPiSeq_all_PPI=PPiSeq_all_PPI,
    PPI_pos=PPI_pos,
    number_random=number_random,
    output_file=output_file,
)

#### Get the positive PPI count for 1000 random networks

In [12]:
# Inputs: path prefix of the random network files and the GO-slim mapping file.
PPI_random_front = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/random_network/random_network_all_"
GO_slim_map = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Outside_datasets/GO_term_files/go_slim_mapping_tab_20190405.txt"
# Output path prefixes, one per GO group.
output_file_C_front = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/random_network/random_network_density/random_network_CC_pos_count_"
output_file_P_front = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/random_network/random_network_density/random_network_BP_pos_count_"
output_file_F_front = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/random_network/random_network_density/random_network_MF_pos_count_"
# One gene -> GO term map per GO group. PPI_all is not assigned in this cell;
# it must already be defined by an earlier cell.
gene_GO_dict_C = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group="C")
gene_GO_dict_P = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group="P")
gene_GO_dict_F = GO_map(GO_slim_map=GO_slim_map, PPI_all=PPI_all, GO_group="F")
# NOTE(review): the files written by PPiSeq_random_network carry no header
# row -- confirm how PPiSeq_network_pos_count treats the first line.
for i in range(1000):
    index = i + 1  # the random network files are numbered from 1
    PPI_random = "".join([PPI_random_front, str(index), ".csv"])
    output_file_C = "".join([output_file_C_front, str(index), ".txt"])
    output_file_P = "".join([output_file_P_front, str(index), ".txt"])
    output_file_F = "".join([output_file_F_front, str(index), ".txt"])
    PPiSeq_network_pos_count(PPI_pos=PPI_random, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
    PPiSeq_network_pos_count(PPI_pos=PPI_random, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
    PPiSeq_network_pos_count(PPI_pos=PPI_random, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

## Make the same heatmap for each environment

In [13]:
### Generate a map of GO terms for each gene
def GO_map(GO_slim_map, PPI_all, GO_group):
    """Map every gene covered by PPiSeq to its GO-slim terms for one GO group.

    Parameters
    ----------
    GO_slim_map : str
        Path to a tab-delimited GO-slim mapping file. Column 1 is read as the
        gene, column 4 is compared with ``GO_group`` and column 5 is read as
        the GO term.
    PPI_all : str
        Path to a comma-delimited file with one header row whose first column
        holds protein pairs written as ``geneA_geneB``.
    GO_group : str
        Value of column 4 to keep (the notebook passes "C", "P" or "F").

    Returns
    -------
    dict
        ``{gene: [GO term, ...]}`` -- the key is the gene and the value is the
        list of its GO terms, in the order they appear in the mapping file.
    """
    # First column of every row; the first row is the header and is dropped.
    with open(PPI_all, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    # All genes covered by PPiSeq, as a set so the membership test below is O(1).
    covered_genes = set()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row: no protein pair to record
        covered_genes.update(proteins[:2])

    # GO terms that carry no information (aspect roots and placeholders).
    bad_GO = {'cellular_component', 'biological_process', 'molecular_function',
              'not_yet_annotated', 'other'}

    gene_GO_dict = {}
    with open(GO_slim_map, 'r') as mapping_file:
        for line in mapping_file:
            fields = line.strip().split("\t")
            if len(fields) < 5:
                continue  # row too short to hold gene, group and term
            gene, group, term = fields[0], fields[3], fields[4]
            # Keep rows of the requested group, with a meaningful term,
            # for genes that PPiSeq covers.
            if group == GO_group and term not in bad_GO and gene in covered_genes:
                gene_GO_dict.setdefault(gene, []).append(term)
    return gene_GO_dict

In [14]:
### Count protein-protein pairs for each GO_GO pair
def PPiSeq_network_all_count(PPI_all, gene_GO_dict, output_file_1):
    """Count protein-protein pairs for every ordered GO_GO pair and write a table.

    Parameters
    ----------
    PPI_all : str
        Path to a comma-delimited file with one header row whose first column
        holds protein pairs written as ``proteinA_proteinB``.
    gene_GO_dict : dict
        ``{gene: [GO term, ...]}``; pairs with a protein missing from this
        mapping are ignored.
    output_file_1 : str
        Path of the tab-delimited output (header ``GO_1  GO_2  count``).

    Notes
    -----
    For each protein pair both orientations of every GO term combination are
    counted, so (x, y) and (y, x) get the same count, while a combination that
    is its own mirror image is counted only once per protein pair.
    """
    import itertools
    from collections import Counter

    # First column of every row; the first row is the header and is dropped.
    with open(PPI_all, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    GO_GO_count = Counter()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        pro_a, pro_b = proteins[0], proteins[1]
        if pro_a not in gene_GO_dict or pro_b not in gene_GO_dict:
            continue
        GO_A = gene_GO_dict[pro_a]
        GO_B = gene_GO_dict[pro_b]
        GO_B_A = list(itertools.product(GO_B, GO_A))
        seen_B_A = set(GO_B_A)
        # Drop the A->B combinations that B->A already provides, so a
        # combination present in both directions is not counted twice.
        GO_A_B_only = [go_pair for go_pair in itertools.product(GO_A, GO_B)
                       if go_pair not in seen_B_A]
        GO_GO_count.update(GO_A_B_only)
        GO_GO_count.update(GO_B_A)

    # Write GO_term_1, GO_term_2 and the count, in first-seen order.
    with open(output_file_1, 'w') as out:
        out.write('\t'.join(['GO_1', 'GO_2', 'count']) + '\n')
        out.writelines('\t'.join([go_1, go_2, str(count)]) + '\n'
                       for (go_1, go_2), count in GO_GO_count.items())

In [15]:
### Count PPI number for each GO_GO pair
def PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict, output_file_2):
    """Count the PPIs of one environment for every ordered GO_GO pair.

    Parameters
    ----------
    PPI_pos : str
        Path to a comma-delimited file whose first column holds PPIs written
        as ``proteinA_proteinB`` and which has one column per environment.
    environment : int
        1-based column number of the environment; a row is used only when that
        column equals the string "1". No row is skipped as a header; a header
        row is simply not selected unless that column is "1".
    gene_GO_dict : dict
        ``{gene: [GO term, ...]}``; PPIs with a protein missing from this
        mapping are ignored.
    output_file_2 : str
        Path of the tab-delimited output (header ``GO_1  GO_2  count``).

    Notes
    -----
    For each PPI both orientations of every GO term combination are counted;
    a combination that is its own mirror image is counted once per PPI.
    """
    import itertools
    from collections import Counter

    # Positive PPIs of the requested environment.
    PPI_pos_list = []
    with open(PPI_pos, 'r') as ppi_file:
        for line in ppi_file:
            PPI_data = line.strip().split(",")
            if len(PPI_data) < environment:
                continue  # blank or truncated row: the environment column is missing
            if PPI_data[environment - 1] == "1":
                PPI_pos_list.append(PPI_data[0])

    # Put the positive PPIs into a GO_GO counter.
    GO_GO_pos_count = Counter()
    for pair in PPI_pos_list:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # malformed PPI name
        pos_a, pos_b = proteins[0], proteins[1]
        if pos_a not in gene_GO_dict or pos_b not in gene_GO_dict:
            continue
        GO_pos_A = gene_GO_dict[pos_a]
        GO_pos_B = gene_GO_dict[pos_b]
        GO_pos_B_A = list(itertools.product(GO_pos_B, GO_pos_A))
        seen_B_A = set(GO_pos_B_A)
        # Drop the A->B combinations that B->A already provides.
        GO_pos_A_B_only = [go_pair for go_pair in itertools.product(GO_pos_A, GO_pos_B)
                           if go_pair not in seen_B_A]
        GO_GO_pos_count.update(GO_pos_A_B_only)
        GO_GO_pos_count.update(GO_pos_B_A)

    with open(output_file_2, 'w') as out:
        out.write('\t'.join(['GO_1', 'GO_2', 'count']) + '\n')
        out.writelines('\t'.join([go_1, go_2, str(count)]) + '\n'
                       for (go_1, go_2), count in GO_GO_pos_count.items())

In [16]:
### Count PPI number for each GO_GO pair
def PPiSeq_network_pos_count_random(PPI_pos, gene_GO_dict, output_file_2, has_header=None):
    """Count PPIs for every ordered GO_GO pair and write a table.

    Parameters
    ----------
    PPI_pos : str
        Path to a comma-delimited file whose first column holds PPIs written
        as ``proteinA_proteinB``.
    gene_GO_dict : dict
        ``{gene: [GO term, ...]}``; PPIs with a protein missing from this
        mapping are ignored.
    output_file_2 : str
        Path of the tab-delimited output (header ``GO_1  GO_2  count``).
    has_header : bool or None, optional
        ``True`` drops the first row, ``False`` keeps it. With the default
        ``None`` the first row is dropped only when its first field contains
        no "_" (i.e. it cannot be a protein pair). This keeps the first PPI of
        header-less files such as the random networks written by
        ``PPiSeq_random_network``, which an unconditional drop would lose.
        A header that does contain "_" is kept but is normally ignored because
        its two halves are not genes in ``gene_GO_dict``.

    Notes
    -----
    For each PPI both orientations of every GO term combination are counted;
    a combination that is its own mirror image is counted once per PPI.
    """
    import itertools
    from collections import Counter

    with open(PPI_pos, 'r') as ppi_file:
        pairs = [line.strip().split(",")[0] for line in ppi_file]
    if has_header is None:
        has_header = bool(pairs) and "_" not in pairs[0]
    if has_header:
        pairs = pairs[1:]

    # Put the positive PPIs into a GO_GO counter.
    GO_GO_pos_count = Counter()
    for pair in pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        pos_a, pos_b = proteins[0], proteins[1]
        if pos_a not in gene_GO_dict or pos_b not in gene_GO_dict:
            continue
        GO_pos_A = gene_GO_dict[pos_a]
        GO_pos_B = gene_GO_dict[pos_b]
        GO_pos_B_A = list(itertools.product(GO_pos_B, GO_pos_A))
        seen_B_A = set(GO_pos_B_A)
        # Drop the A->B combinations that B->A already provides.
        GO_pos_A_B_only = [go_pair for go_pair in itertools.product(GO_pos_A, GO_pos_B)
                           if go_pair not in seen_B_A]
        GO_GO_pos_count.update(GO_pos_A_B_only)
        GO_GO_pos_count.update(GO_pos_B_A)

    with open(output_file_2, 'w') as out:
        out.write('\t'.join(['GO_1', 'GO_2', 'count']) + '\n')
        out.writelines('\t'.join([go_1, go_2, str(count)]) + '\n'
                       for (go_1, go_2), count in GO_GO_pos_count.items())

In [17]:
## Create similar network with the degree distribution of network of a specific environment
### Define a function to write a random network with the same number of positive interactions
### based on the gene list and positive PPIs in the PPiseq data (all_environments)
### The idea will be replace all one protein with the other protein, keep the distribution of degree
def PPiSeq_random_network(PPiSeq_all_PPI, PPI_pos, environment, number_random, output_file):
    """Write random networks that keep the degree distribution of one environment.

    Every gene is swapped for another gene through one random one-to-one
    relabeling per network, so the wiring (and therefore every degree) of the
    environment's positive PPIs is kept while protein identity is shuffled.

    Parameters
    ----------
    PPiSeq_all_PPI : str
        Path to a comma-delimited file with one header row whose first column
        holds all covered protein pairs written as ``proteinA_proteinB``.
    PPI_pos : str
        Path to a comma-delimited file whose first column holds PPIs written
        as ``proteinA_proteinB`` and which has one column per environment.
    environment : int
        1-based column number of the environment; a row is used only when that
        column equals the string "1".
    number_random : int
        Number of random networks to write.
    output_file : str
        Path prefix; network ``k`` (1-based) is written to
        ``output_file + str(k) + ".csv"``, one ``proteinA_proteinB`` per line
        and without a header row.

    Notes
    -----
    Names containing "HO" (control / fragment strains) are excluded from the
    shuffle. A protein of a positive PPI that is not in the shuffled gene list
    keeps its own name. The gene list is sorted so that a fixed
    ``random.seed`` reproduces the same networks.
    """
    import random

    # First column of every row; the first row is the header and is dropped.
    with open(PPiSeq_all_PPI, 'r') as ppi_file:
        all_pairs = [line.strip().split(",")[0] for line in ppi_file][1:]

    # All genes covered by PPiSeq.
    covered = set()
    for pair in all_pairs:
        proteins = pair.split("_")
        if len(proteins) < 2:
            continue  # blank or malformed row
        covered.update(proteins[:2])
    # Remove control genes and ORFxDHFR fragment strains. A plain substring
    # test is used because a Python 3 `filter` object is a one-shot iterator:
    # testing membership against it removes nothing after the first lookup.
    PPiSeq_gene = sorted(gene for gene in covered if 'HO' not in gene)

    # Positive PPIs of the requested environment.
    positive = []
    with open(PPI_pos, 'r') as pos_file:
        for line in pos_file:
            PPI_data = line.strip().split(",")
            if len(PPI_data) < environment:
                continue  # blank or truncated row: the environment column is missing
            if PPI_data[environment - 1] != "1":
                continue
            proteins = PPI_data[0].split("_")
            if len(proteins) < 2:
                continue  # malformed PPI name
            positive.append((proteins[0], proteins[1]))

    for m in range(number_random):
        # One random one-to-one relabeling of the gene list per network.
        all_reorder = random.sample(PPiSeq_gene, len(PPiSeq_gene))
        relabel = dict(zip(PPiSeq_gene, all_reorder))
        sample_PPI = [relabel.get(protein_a, protein_a) + "_" + relabel.get(protein_b, protein_b)
                      for protein_a, protein_b in positive]
        with open(output_file + str(m + 1) + ".csv", 'w') as output:
            output.writelines(ppi + '\n' for ppi in sample_PPI)

In [18]:
# Build one gene -> GO term map per GO group from the same two input files.
# The directory is spelled "Paper_data" as in every other path of this
# notebook; the lower-case "paper_data" spelling only resolves on a
# case-insensitive filesystem.
PPI_all = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/All_PPP_for_GO_GO_map.csv"
GO_slim_map = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Outside_datasets/GO_term_files/go_slim_mapping_tab_20190405.txt"

# Create a map for cellular component
GO_group = "C"
gene_GO_dict_C = GO_map(GO_slim_map, PPI_all, GO_group)

# Create a map for biological process
GO_group = "P"
gene_GO_dict_P = GO_map(GO_slim_map, PPI_all, GO_group)

# Create a map for molecular function
GO_group = "F"
gene_GO_dict_F = GO_map(GO_slim_map, PPI_all, GO_group)

# DMSO

In [19]:
# Count the PPIs of one environment for every GO_GO pair, once per GO group.
# `environment` is the column number of that environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 3  # DMSO
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/DMSO/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/DMSO/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/DMSO/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# H2O2

In [22]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 4  # H2O2
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/H2O2/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/H2O2/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/H2O2/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# Hydroxyurea

In [23]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 5  # HU
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/HU/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/HU/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/HU/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# Doxorubicin

In [24]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 6  # Dox
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Dox/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Dox/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Dox/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# Forskolin

In [25]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 7  # Forskolin
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Forskolin/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Forskolin/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Forskolin/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# Raffinose

In [26]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 8  # Raffinose
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Raffinose/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Raffinose/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/Raffinose/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# NaCl

In [27]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 9  # NaCl
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/NaCl/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/NaCl/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/NaCl/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# 16C

In [28]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 10  # 16C
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/16C/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/16C/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/16C/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)

# FK506

In [29]:
# Column number of each environment in PPI_pos:
# DMSO: 3, H2O2: 4, HU: 5, Dox: 6, Forskolin: 7, Raffinose: 8, NaCl: 9, 16C: 10, FK506: 11
environment = 11  # FK506
PPI_pos = "/Users/Zhimin/Dropbox/PPiSeq_02/Paper_data/Useful_datasets/PPI_environment_count_summary_SD_merge_filter.csv"
output_file_C = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/FK506/Network_pos_count_PPI_CC_new.txt"
output_file_P = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/FK506/Network_pos_count_PPI_BP_new.txt"
output_file_F = "/Users/Zhimin/Dropbox/PPiSeq_02/Working_data_2/PPI_pair_GO/environment/FK506/Network_pos_count_PPI_MF_new.txt"
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_C, output_file_2=output_file_C)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_P, output_file_2=output_file_P)
PPiSeq_network_pos_count(PPI_pos, environment, gene_GO_dict=gene_GO_dict_F, output_file_2=output_file_F)