In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import pickle as pk
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [2]:
create_term_mapping_list("data/go_labels/dream1_go.tsv", "data/go_labels/d1_term_mapping.tsv")
create_term_mapping_list("data/go_labels/dream2_go.tsv", "data/go_labels/d2_term_mapping.tsv")
create_term_mapping_list("data/go_labels/dream3_go.tsv", "data/go_labels/d3_term_mapping.tsv")
background_go_terms = vocabs.getTerms(['GO'])


In [3]:
results = pd.DataFrame({'label': [], '3_sqrt': [], '3_half': [], '3_quarter': [], '3_tenth': []})

In [4]:
for dream in {"3", "2", "1"}:
    # occurs for a single dream:
    term_mapping_filepath = "data/go_labels/d" + dream + "_term_mapping.tsv"

    for spectral in {"ic3", "ic10", "ic50", "ic100"}:
        # unpickle:
        data_string = "d" + dream + "_" + spectral 
        pickle_file = "data/pickles/" + data_string + ".pickle"
        picke_dict = dict()
        with open(pickle_file, 'rb') as f: # must be rb for "read binary"
            pickle_dict = pk.load(f)

        # print background proteins to file:
        background_proteins_filepath = "data/FE/" + data_string + "/protein_background.txt"
        # print_protein_background_to_file(pickle_dict["matrix"], background_proteins_filepath)

        # next steps: sort clusters into bins based on size, then do functional enrichment on each bin
        clusters = AllClusters(cluster_dict=pickle_dict["clusters_dict"])
        
        # first bin: 3-7, second bin: 8-15, third bin: 16-31, fourth bin: 32-63, fifth bin: 64-100, overall bin: 3-100
        for bin_label, min, max in {("bin1", 3, 8), ("bin2", 9, 16), ("bin3", 17, 32), ("bin4", 33, 64), ("bin5", 65, 100), ("allBins", 3, 100)}:
            bin_label = data_string + "/three_prots_" + bin_label
            
            # # original functional enrichment:
            # og_query_filepath = "data/FE/" + bin_label + "_original_clusters.tsv"
            # print_querylist_of_clusters_to_file(clusters, clusters_to_print=(clusters.filter_clusters_by_size(min, max).keys()), query_filepath=og_query_filepath)
            # original_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
            # original_fe.importFiles({'query': og_query_filepath })
            # original_fe.run(cluster=False)

            # sqrt functional enrichment:
            try:
                sqrt_query_filepath = "data/FE/" + bin_label + "_sqrt_clusters.tsv"
                sqrt_proteins_to_add = {key: pickle_dict["sqrt_proteins"][key][:3] for key in (pickle_dict["sqrt_proteins"]).keys() if key in clusters.filter_clusters_by_size(min, max).keys()}
                print_querylist_of_clusters_to_file(clusters, proteins_to_add=sqrt_proteins_to_add, clusters_to_print=(clusters.filter_clusters_by_size(min, max).keys()), query_filepath=sqrt_query_filepath)
                sqrt_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
                sqrt_fe.importFiles({'query': sqrt_query_filepath })
                sqrt_fe.run(cluster=False)
                
                # half functional enrichment:
                half_query_filepath = "data/FE/" + bin_label + "_half_clusters.tsv"
                half_proteins_to_add = {key: pickle_dict["half_proteins"][key][:3] for key in (pickle_dict["half_proteins"]).keys() if key in clusters.filter_clusters_by_size(min, max).keys()}
                print_querylist_of_clusters_to_file(clusters, proteins_to_add=half_proteins_to_add, clusters_to_print=(clusters.filter_clusters_by_size(min, max).keys()), query_filepath=half_query_filepath)
                half_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
                half_fe.importFiles({'query': half_query_filepath })
                half_fe.run(cluster=False)

                # quarter functional enrichment:
                quarter_query_filepath = "data/FE/" + bin_label + "_quarter_clusters.tsv"
                quarter_proteins_to_add = {key: pickle_dict["quarter_proteins"][key][:3] for key in (pickle_dict["quarter_proteins"]).keys() if key in clusters.filter_clusters_by_size(min, max).keys()}
                print_querylist_of_clusters_to_file(clusters, proteins_to_add=quarter_proteins_to_add, clusters_to_print=(clusters.filter_clusters_by_size(min, max).keys()), query_filepath=quarter_query_filepath)
                quarter_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
                quarter_fe.importFiles({'query': quarter_query_filepath })
                quarter_fe.run(cluster=False)

                # tenth functional enrichment:
                tenth_query_filepath = "data/FE/" + bin_label + "_tenth_clusters.tsv"
                tenth_proteins_to_add = {key: pickle_dict["tenth_proteins"][key][:3] for key in (pickle_dict["tenth_proteins"]).keys() if key in clusters.filter_clusters_by_size(min, max).keys()}
                print_querylist_of_clusters_to_file(clusters, proteins_to_add=tenth_proteins_to_add, clusters_to_print=(clusters.filter_clusters_by_size(min, max).keys()), query_filepath=tenth_query_filepath)
                tenth_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
                tenth_fe.importFiles({'query': tenth_query_filepath })
                tenth_fe.run(cluster=False)
            
                # add FE results to results dataframe
                print(f"label: {bin_label}, 'sqrt': {sqrt_fe.enrichment['Module'].nunique()}, 'half': {half_fe.enrichment['Module'].nunique()}, 'quarter': {quarter_fe.enrichment['Module'].nunique()}, 'tenth': {tenth_fe.enrichment['Module'].nunique()}")
                results = results.append({'label': bin_label, '3_sqrt': sqrt_fe.enrichment['Module'].nunique(), '3_half': half_fe.enrichment['Module'].nunique(), '3_quarter': quarter_fe.enrichment['Module'].nunique(), '3_tenth': tenth_fe.enrichment['Module'].nunique()}, ignore_index=True)
            except:
                print("smth failed for " + bin_label)
        

label: d3_ic50/three_prots_bin5, 'sqrt': 31, 'half': 31, 'quarter': 31, 'tenth': 32
label: d3_ic50/three_prots_bin4, 'sqrt': 31, 'half': 31, 'quarter': 31, 'tenth': 32
label: d3_ic50/three_prots_bin1, 'sqrt': 1, 'half': 1, 'quarter': 1, 'tenth': 1
label: d3_ic50/three_prots_bin3, 'sqrt': 20, 'half': 20, 'quarter': 20, 'tenth': 20
label: d3_ic50/three_prots_allBins, 'sqrt': 89, 'half': 89, 'quarter': 89, 'tenth': 91
label: d3_ic50/three_prots_bin2, 'sqrt': 6, 'half': 6, 'quarter': 6, 'tenth': 6
label: d3_ic3/three_prots_bin5, 'sqrt': 31, 'half': 31, 'quarter': 31, 'tenth': 30
label: d3_ic3/three_prots_bin4, 'sqrt': 30, 'half': 30, 'quarter': 30, 'tenth': 30
label: d3_ic3/three_prots_bin1, 'sqrt': 3, 'half': 3, 'quarter': 3, 'tenth': 3
label: d3_ic3/three_prots_bin3, 'sqrt': 20, 'half': 20, 'quarter': 20, 'tenth': 21
label: d3_ic3/three_prots_allBins, 'sqrt': 88, 'half': 88, 'quarter': 88, 'tenth': 89
label: d3_ic3/three_prots_bin2, 'sqrt': 4, 'half': 4, 'quarter': 4, 'tenth': 5
label: d

In [6]:
results

Unnamed: 0,label,3_sqrt,3_half,3_quarter,3_tenth,q3_uarter
0,d3_ic50/three_prots_bin5,31.0,31.0,,32.0,31.0
1,d3_ic50/three_prots_bin4,31.0,31.0,,32.0,31.0
2,d3_ic50/three_prots_bin1,1.0,1.0,,1.0,1.0
3,d3_ic50/three_prots_bin3,20.0,20.0,,20.0,20.0
4,d3_ic50/three_prots_allBins,89.0,89.0,,91.0,89.0
...,...,...,...,...,...,...
66,d1_ic100/three_prots_bin4,121.0,121.0,,126.0,121.0
67,d1_ic100/three_prots_bin1,4.0,4.0,,4.0,4.0
68,d1_ic100/three_prots_bin3,64.0,64.0,,68.0,64.0
69,d1_ic100/three_prots_allBins,311.0,311.0,,321.0,312.0


In [7]:
results = results.sort_values('label').reset_index(drop=True)
results.to_csv('data/FE/three_protein_results.tsv', index=False, sep='\t')
