
# recipe_ic50_medium_clusters
- This file is for dream1 data, with medium clusters generated using the ic_50 parameter (spectral clustering).
- We consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster.
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster.


In [6]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # provides functional enrichment analysis
import func_e.vocabs.all as vocabs
# The filter below silences FutureWarnings raised inside the third-party funcE code, which cannot be fixed from this notebook.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [7]:
# Load the PPI interactions and the medium ic_50 clusters, and build the three
# objects every later cell works with (presumably a ProteinMatrix, an
# AllClusters and a DegreeList -- confirm against recipe_utils).
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath="data/interactions/1_ppi_string_v2.txt",
    clusters_filepath="data/ic_50_medium/dream1_medium-cluster.json",
)

In [8]:
# linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=False, protein_ratio=.5, protein_constant=0)
# linear_qualifying_proteins

In [9]:
# sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=True, protein_ratio=1, protein_constant=0)
# sqrt_qualifying_proteins

In [10]:
# The funcE package takes its inputs as files, so everything it needs is
# written to disk here before any enrichment is run.

# Locations of the files produced below.
background_proteins_filepath = "data/ic_50_medium/background_proteinlist.txt"
term_mapping_filepath = "data/go_labels/dream1_term_mapping.txt"
og_query_filepath = 'data/ic_50_medium/original_clusters.txt'

# 1) background protein list used for the functional enrichment analysis
print_protein_background_to_file(matrix, background_proteins_filepath)

# 2) protein -> GO term mapping, derived from the dream1 GO label table
create_term_mapping_list(
    "data/go_labels/dream1_go.tsv",
    term_mapping_filepath,
)

# 3) not a file: the background of all GO terms, kept in memory
background_go_terms = vocabs.getTerms(['GO'])

# 4) the original clusters, written out as the enrichment query list
all_cluster_labels = clusters.get_all_cluster_labels()
print_querylist_of_clusters_to_file(
    clusters,
    all_cluster_labels,
    og_query_filepath,
)

# # 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
# sqrt_query_filepath = 'data/ic_50_medium/sqrt_clusters.txt'
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), sqrt_query_filepath, sqrt_qualifying_proteins)

# linear_query_filepath = 'data/ic_50_medium/linear_clusters.txt'
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), linear_query_filepath, linear_qualifying_proteins)

In [19]:
# Display the GO term background built by vocabs.getTerms(['GO']); the table
# printed after this cell is the notebook's rendered output of this expression.
background_go_terms

Unnamed: 0,Vocabulary,Term,Name
0,GO,GO:0000001,mitochondrion inheritance
1,GO,GO:0000002,mitochondrial genome maintenance
2,GO,GO:0000003,reproduction
3,GO,GO:0000005,obsolete ribosomal chaperone activity
4,GO,GO:0000006,high-affinity zinc transmembrane transporter a...
...,...,...,...
47491,GO,GO:2001312,lysobisphosphatidic acid biosynthetic process
47492,GO,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...
47493,GO,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...
47494,GO,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...


In [22]:
# Functional enrichment on the original clusters, before any proteins are
# added back.
# NOTE(review): ecut is presumably funcE's enrichment p-value cutoff -- confirm
# against the func_e documentation before interpreting the counts below.
original_fe = get_initialized_fe(
    background_proteins_filepath,
    term_mapping_filepath,
    termlist=background_go_terms,
    ecut=0.5,
)
original_fe.importFiles({'query': og_query_filepath})
original_fe.run(cluster=False)



In [16]:

# do functional enrichment on updated clusters!!

# sqrt_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
# sqrt_fe_all_proteins.importFiles({'query': sqrt_query_filepath })
# sqrt_fe_all_proteins.run(cluster=False)


# linear_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
# linear_fe_all_proteins.importFiles({'query': linear_query_filepath })
# linear_fe_all_proteins.run(cluster=False)



In [23]:
# Report for the square-root cutoff. Only the original-cluster figure is live;
# every line that depends on the sqrt re-clustering results stays commented out
# while the cells that compute them are disabled.
print("::::::::::::::::::::::::::SQRT::::::::::::::::::::::::::\n")


# # print data by the numbers!
# print(f"number of clusters that were updated with proteins: {len(sqrt_qualifying_proteins.keys())}\n")

# # clusters that are functionally enriched
# print(f"number of functionally enriched clusters (modules) out of {len(sqrt_qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
# print(f"number of functionally enriched clusters after addition of all proteins: {sqrt_fe_all_proteins.enrichment['Module'].nunique()}")

# print(f"")


# significant P-values!!!
# Assuming `enrichment` is a pandas DataFrame, count() tallies the non-null
# entries of its 'Fishers_pvalue' column.
original_pvalue_count = original_fe.enrichment['Fishers_pvalue'].count()
print(f"number of significant pvals in original clusters before updating: {original_pvalue_count}")
# print(f"number of significant pvals in clusters with all proteins added: {sqrt_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")


# print(f"::::::::::::::::::::::::::LINEAR::::::::::::::::::::::::::\n")


# # print data by the numbers!
# print(f"number of clusters that were updated with proteins: {len(linear_qualifying_proteins.keys())}\n")

# # clusters that are functionally enriched
# print(f"number of functionally enriched clusters (modules) out of {len(linear_qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
# print(f"number of functionally enriched clusters after addition of all proteins: {linear_fe_all_proteins.enrichment['Module'].nunique()}")

# print(f"")

# # significant P-values!!!
# print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
# print(f"number of significant pvals in clusters with all proteins added: {linear_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")


::::::::::::::::::::::::::SQRT::::::::::::::::::::::::::

number of significant pvals in original clusters before updating: 0
