
# d1_ic10_medium
- this file is for dream1 data, with medium clusters generated using the ic_10 parameter (spectral clustering)
- we consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster


In [None]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **must change filenames:**

In [None]:
# Input files for this run: the PPI interaction list and the cluster assignments (JSON).
# Both paths need to be changed when pointing this notebook at a different dataset.
interactions_file = "data/interactions/1_ppi_string_v2.txt"
clusters_file = "data/d1_ic10_medium/dream1_medium-cluster.json"

In [None]:
# Build the three core objects used by every later cell from the two input files.
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath=interactions_file,
    clusters_filepath=clusters_file,
)

In [None]:
# Linear cutoff run: use_sqrt=False with protein_ratio=.5
# (the "0.5 * num components" rule described in the notebook header).
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=.5,
    protein_constant=0,
)

In [None]:
# Square-root cutoff run: use_sqrt=True with protein_ratio=1
# (the "sqrt(num components)" rule described in the notebook header).
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)

In [None]:
# Keep at most the first three sqrt-qualifying proteins for every cluster.
three_sqrt_qualifying_proteins = {
    cluster_label: candidates[0:3]
    for cluster_label, candidates in sqrt_qualifying_proteins.items()
}

In [1]:
from recipe_utils import get_cluster_connectivity

In [None]:
# Connectivity of the original clusters (no added proteins are passed here);
# sorted=True asks the helper for sorted output.
sorted_cluster_connectivity = get_cluster_connectivity(
    matrix,
    degreelist,
    clusters,
    sorted=True,
)
sorted_cluster_connectivity

*may want to change parameters below:*

In [None]:
# TODO: could modify get_cluster_connectivity so that it handles both plain cluster connectivity
# AND connectivity with added proteins when the user passes an added_proteins parameter
def get_updated_cluster_connectivity(
    matrix: "ProteinMatrix",
    degreelist: "DegreeList",
    clusters: "AllClusters",
    added_proteins: dict,
) -> dict:
    """Compute per-cluster connectivity after adding extra proteins to each cluster.

    For every cluster label, a SubMatrix is built over the union of the cluster's
    own proteins and the proteins listed for that label in ``added_proteins``.
    Connectivity is ``(num_proteins - num_components) / num_proteins``, where
    ``num_components`` is counted on that union but ``num_proteins`` is the size
    of the ORIGINAL cluster (added proteins only influence the component count).

    Args:
        matrix: the full PPI matrix the submatrix is cut from.
        degreelist: not used by the computation; kept so the signature matches
            the call sites in this notebook.
        clusters: source of the cluster labels and their proteins.
        added_proteins: maps cluster label -> list of proteins to add; labels
            that are missing are treated as having no added proteins.

    Returns:
        dict mapping cluster label -> connectivity value.

    Raises:
        ZeroDivisionError: if a cluster has no proteins.
    """
    # NOTE(review): SubMatrix is not imported anywhere else in this notebook; it is
    # assumed to live in matrix_class next to ProteinMatrix — confirm the module.
    from matrix_class import SubMatrix

    cluster_connectivity = {}

    for cluster_num in clusters.get_all_cluster_labels():
        # proteins originally assigned to the cluster
        cluster_proteins = clusters.get_cluster_proteins(cluster_num)
        # proteins selected for addition (none if the cluster did not qualify)
        added_cluster_proteins = added_proteins.get(cluster_num, [])
        # de-duplicate before building the submatrix
        submatrix = SubMatrix(list(set(cluster_proteins + added_cluster_proteins)), matrix)
        # first element of the returned pair is the number of components
        num_components = submatrix.get_num_components_and_labels()[0]
        num_proteins = len(cluster_proteins)
        cluster_connectivity[cluster_num] = (num_proteins - num_components) / num_proteins

    return cluster_connectivity


# first things to look at would be either: clusters initially 0, OR clusters w/ most improvement

In [None]:
# Variant using ALL sqrt-qualifying proteins (disabled):
# sqrt_cluster_connectivity = get_updated_cluster_connectivity(matrix, degreelist, clusters, sqrt_qualifying_proteins)
# sqrt_sorted_cluster_connectivity = {k: v for k, v in sorted(sqrt_cluster_connectivity.items(), key=lambda item: item[1])}
# sqrt_sorted_cluster_connectivity

# Connectivity after adding the capped (three-per-cluster) sqrt proteins,
# then re-ordered ascending by connectivity value.
three_sqrt_cluster_connectivity = get_updated_cluster_connectivity(
    matrix, degreelist, clusters, three_sqrt_qualifying_proteins
)
three_sqrt_cluster_connectivity = dict(
    sorted(three_sqrt_cluster_connectivity.items(), key=lambda entry: entry[1])
)
three_sqrt_cluster_connectivity

In [None]:
# Baseline (pre-addition) connectivity. The name `cluster_connectivity` used here before was
# never defined in this notebook; the baseline actually computed earlier is
# `sorted_cluster_connectivity`.
# NOTE(review): the return type of get_cluster_connectivity(sorted=True) is not visible here;
# dict(...) accepts either a mapping or a sequence of (label, value) pairs — confirm.
baseline_cluster_connectivity = dict(sorted_cluster_connectivity)

# Variant using ALL sqrt-qualifying proteins (disabled):
# difference_in_connectivity = {x: sqrt_cluster_connectivity[x] - baseline_cluster_connectivity[x] for x in baseline_cluster_connectivity if x in sqrt_cluster_connectivity}
# sorted_difference_in_connectivity = {k: v for k, v in sorted(difference_in_connectivity.items(), key=lambda item: item[1], reverse=True)}

# x = 53
# print("before:")
# print(baseline_cluster_connectivity[x])
# print("after:")
# print(sqrt_cluster_connectivity[x])

# sorted_difference_in_connectivity

# Per-cluster change in connectivity (after - before), restricted to labels present in both.
three_difference_in_connectivity = {
    label: three_sqrt_cluster_connectivity[label] - baseline
    for label, baseline in baseline_cluster_connectivity.items()
    if label in three_sqrt_cluster_connectivity
}
# Largest improvement first.
three_sorted_difference_in_connectivity = dict(
    sorted(three_difference_in_connectivity.items(), key=lambda item: item[1], reverse=True)
)

three_sorted_difference_in_connectivity






In [None]:
# Proteins added to cluster 53 under the capped sqrt method.
# .get avoids a KeyError when cluster 53 did not receive any proteins (prints [] instead).
print(three_sqrt_qualifying_proteins.get(53, []))



In [None]:
# Look up the degree of a single protein of interest.
degreelist.get_degree_of_protein("KLHL20")

In [None]:
# Display the proteins selected for each cluster by the sqrt run.
sqrt_qualifying_proteins

In [None]:
# Display the proteins selected for each cluster by the linear run.
linear_qualifying_proteins

## Functional Enrichment Analysis

In [None]:
# Disabled setup calls, one per DREAM dataset.
# NOTE(review): presumably these generate the *_term_mapping.tsv files from the GO label files
# and only need to be run once — confirm against recipe_utils.create_term_mapping_list.
# create_term_mapping_list("data/go_labels/dream1_go.tsv", "data/go_labels/d1_term_mapping.tsv")
# create_term_mapping_list("data/go_labels/dream2_go.tsv", "data/go_labels/d2_term_mapping.tsv")
# create_term_mapping_list("data/go_labels/dream3_go.tsv", "data/go_labels/d3_term_mapping.tsv")

**will need to change below**

In [None]:
# Output directory for the files written below; the trailing slash matters because
# the file paths are built by plain string concatenation.
directory = "data/d1_ic10_medium/"

In [None]:
# Paths of the files exchanged with the functional-enrichment step, all under `directory`.
background_proteins_filepath = f"{directory}background_proteinlist.txt"
og_query_filepath = f"{directory}original_clusters.txt"
sqrt_query_filepath = f"{directory}sqrt_clusters.txt"
linear_query_filepath = f"{directory}linear_clusters.txt"

*print to files*

In [None]:
# Write everything the funcE package needs to disk (it consumes files, not in-memory objects).

# 1) background protein list for the functional enrichment analysis
print_protein_background_to_file(matrix, background_proteins_filepath)

# 2) not a file: the background of all GO terms used by the analysis
background_go_terms = vocabs.getTerms(['GO'])

# 3) original clusters, without any added proteins
print_querylist_of_clusters_to_file(
    clusters, clusters.get_all_cluster_labels(), og_query_filepath
)

# 4) updated clusters: same call, plus the dictionary of qualifying proteins for each cutoff
for updated_query_filepath, qualifying_proteins in (
    (sqrt_query_filepath, sqrt_qualifying_proteins),
    (linear_query_filepath, linear_qualifying_proteins),
):
    print_querylist_of_clusters_to_file(
        clusters,
        clusters.get_all_cluster_labels(),
        updated_query_filepath,
        qualifying_proteins,
    )

*run functional enrichment*

In [None]:
# `term_mapping_filepath` was used below without ever being defined in this notebook.
# NOTE(review): this is the dream1 mapping path that appears in the (disabled)
# create_term_mapping_list call earlier in the notebook — confirm it is the intended file.
term_mapping_filepath = "data/go_labels/d1_term_mapping.tsv"


def _run_functional_enrichment(query_filepath):
    """Initialize a FUNC-E object, load one query file, run it, and return the object.

    Args:
        query_filepath: path of the cluster query file to analyze.

    Returns:
        The object returned by get_initialized_fe after importFiles and
        run(cluster=False) have been called on it.
    """
    fe = get_initialized_fe(
        background_proteins_filepath,
        term_mapping_filepath,
        termlist=background_go_terms,
    )
    fe.importFiles({'query': query_filepath})
    fe.run(cluster=False)
    return fe


# One enrichment run per cluster set: original, sqrt-updated, linear-updated.
original_fe = _run_functional_enrichment(og_query_filepath)
sqrt_fe_all_proteins = _run_functional_enrichment(sqrt_query_filepath)
linear_fe_all_proteins = _run_functional_enrichment(linear_query_filepath)

In [None]:
# Summary numbers for the square-root cutoff.
# NOTE(review): assumes `.enrichment` is a pandas DataFrame (nunique/count are used on its columns).
original_enrichment = original_fe.enrichment
sqrt_enrichment = sqrt_fe_all_proteins.enrichment

print("::::::::::::::::::::::::::SQRT::::::::::::::::::::::::::\n")
print(f"number of clusters that were updated with proteins: {len(sqrt_qualifying_proteins)}\n")

# distinct modules that appear in the enrichment results
print(f"number of functionally enriched clusters (modules) out of {clusters.get_num_clusters()} original clusters: {original_enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {sqrt_enrichment['Module'].nunique()}")
print()

# number of Fishers_pvalue entries in each result table
# NOTE(review): labelled "significant" — presumably the table only holds significant rows; confirm.
print(f"number of significant pvals in original clusters before updating: {original_enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {sqrt_enrichment['Fishers_pvalue'].count()}")

print()
print()

In [None]:
# Summary numbers for the linear cutoff.
# NOTE(review): assumes `.enrichment` is a pandas DataFrame (nunique/count are used on its columns).
original_enrichment = original_fe.enrichment
linear_enrichment = linear_fe_all_proteins.enrichment

print("::::::::::::::::::::::::::LINEAR::::::::::::::::::::::::::\n")
print(f"number of clusters that were updated with proteins: {len(linear_qualifying_proteins)}\n")

# distinct modules that appear in the enrichment results
print(f"number of functionally enriched clusters (modules) out of {clusters.get_num_clusters()} original clusters: {original_enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {linear_enrichment['Module'].nunique()}")
print()

# number of Fishers_pvalue entries in each result table
# NOTE(review): labelled "significant" — presumably the table only holds significant rows; confirm.
print(f"number of significant pvals in original clusters before updating: {original_enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {linear_enrichment['Fishers_pvalue'].count()}")