In [27]:

from matrix_class import *
from cluster_class import *
from degreelist_class import *
import json 

In [2]:
testing_matrix_file = "../data/testing_data/fake_cluster_dream.txt"
testing_cluster_file = "../data/testing_data/fake_cluster.txt"

dream3_matrix_file = "../data/networks/DREAM_files/dream_3.txt"
dream3_cluster_file = "../data/results/DREAM-3-cc/d3_5_100.json-cluster.json" 

dream3_clusters_dict = {}
# convert actual cluster file to a dictionary!!
with open(dream3_cluster_file,"r") as cluster_dict_file:
    dream3_clusters_dict = json.load(cluster_dict_file)

In [3]:
testing_matrix = ProteinMatrix(testing_matrix_file)
testing_clusters = AllClusters(testing_cluster_file)
testing_degreelist = DegreeList(testing_matrix)

In [4]:
clusters = AllClusters(protein_to_cluster_dict=dream3_clusters_dict)
matrix = ProteinMatrix(dream3_matrix_file)
degreelist = DegreeList(matrix)

In [5]:
def find_clusters_and_proteins_together(matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, cluster_ratio: float = .5, cluster_constant: int = 0, protein_ratio: float = .5, protein_constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> list() and dict():
    """
    function is a version of find_clusters_that_match_criteria, that, once it finds the cluster, finds corresponding proteins at the same time so that the submatrix doesn't need to be reconstructed

    Parameters: 
        matrix - a ProteinMatrix of all protein interactions
        clusters - an AllClusters containing proteins grouped into clusters
        cluster_ratio and cluster_constant - used together to determine which clusters qualify, with the output of the function being cluster_ratio * input + cluster_constant
        TODO: remaining parameters
    Purpose:    determines clusters that are mostly highly connected, then 
                determines which proteins that, when added to the cluster, will 
                increase it's connectedness
    Returns:    a list containing the numbers of the clusters that qualify, and 
                a dictionary linking each cluster, to a list of the qualifying 
                proteins
    """
    
    cluster_nums_that_qualify = list()
    qualifying_proteins_dict = dict()

    for cluster_num in clusters.get_all_clusters():
        # create a submatrix out of the proteins in the cluster
        submatrix = SubMatrix(clusters.get_cluster_proteins(cluster_num), matrix)
        num_components, labels = submatrix.get_num_components_and_labels()
        # print(f"num components is {num_components}. num proteins is {len(submatrix.get_list_of_proteins())}")
        if num_components < cluster_ratio * len(submatrix.get_list_of_proteins()) + cluster_constant:

            # add cluster to list showing that it qualifies, 
            cluster_nums_that_qualify.append(cluster_num)
            # then do analysis on the cluster
            qualifying_proteins_dict[cluster_num] = qualifying_proteins_using_submatrix(cluster_num, submatrix, clusters, degreelist, ratio=protein_ratio, constant=protein_constant, min_components_that_protein_connects=min_components_that_protein_connects, max_degree=max_degree)


    return cluster_nums_that_qualify, qualifying_proteins_dict


def qualifying_proteins_using_submatrix(cluster_num: int, submatrix: SubMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> list():
    """
    TODO : a revised version of the find_proteins_that_match_criteria function that takes in a submatrix as a parameter, and therefore doesn't need to construct one. 
    """
    if (min_components_that_protein_connects == -1):
            min_components_that_protein_connects = constant + ratio * len(clusters.get_cluster_proteins(cluster_num))
        
    num_components, labels = submatrix.get_num_components_and_labels()

    ### POPULATE COMPONENT DICTIONARY ###
    component_dictionary = dict() # protein : component_num
    j = 0
    for array in [(np.array(submatrix.get_list_of_proteins())[np.nonzero(labels == i)]) for i in range(num_components)]:
        for protein in array:
            component_dictionary[protein] = j
        j += 1
    
    ## FIND CONNECTED PROTEINS AND DETERMINE IF THEY QUALIFY 
    qualifying_proteins = list()

    for protein in (degreelist.get_list_of_proteins_sorted_by_degree()):   
        num_edges, which_proteins = degreelist.determine_num_edges_to_cluster(protein, clusters.get_cluster_proteins(cluster_num), also_return_which_proteins=True)
                
        if (num_edges >= min_components_that_protein_connects):
            set_of_components_that_protein_connects = degreelist.which_components_of_a_cluster_would_a_protein_connect(protein, clusters.get_cluster_proteins(cluster_num), component_dictionary, connected_proteins_within_cluster=which_proteins)

            if len(set_of_components_that_protein_connects) >= min_components_that_protein_connects:
                qualifying_proteins.append(protein)

    return qualifying_proteins


def pick_ratio(num_clusters: int):
    """
    will determine an approximate ratio to start with based on the total number of clusters
    """
    if (num_clusters > 1000):
        return .5
    elif num_clusters > 500:
        return .7
    elif num_clusters > 200:
        return .9
    elif num_clusters > 100:
        return .925
    elif num_clusters > 50:
        return .995
    else: 
        return 1

In [6]:
qualifying_clusters, qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=pick_ratio(clusters.get_num_clusters()), protein_ratio=.05, protein_constant=2)

In [7]:
print(f"{qualifying_clusters}")

[25, 29, 30, 36, 49, 73, 75, 99, 152, 161, 164, 174, 185]


In [8]:
print(f"{qualifying_proteins}")

{25: ['AKT1', 'SRC'], 29: ['CDK1', 'PRKACA'], 30: [], 36: [], 49: ['EGFR', 'MAPK1', 'SRC'], 73: ['TP53', 'MAPK1', 'PRKCA', 'SRC'], 75: [], 99: [], 152: ['CSNK2A1', 'PRKACA'], 161: ['CSNK2A1'], 164: ['BCL2', 'MAPK8', 'MAPK1'], 174: [], 185: ['TSC1', 'IKBKB']}


In [13]:
import copy

In [29]:
def create_new_clusters(clusters_to_qualifying_proteins: dict(), csv_filename: str = "", protein_to_cluster_dict: dict() = {}, original_clusters: AllClusters = AllClusters(), ) -> AllClusters:
    """
    csv_filename: str = "", protein_to_cluster_dict: dict() ={}, original_clusters: AllClusters = AllClusters(), are all different ways to pass in info to make new clusters (and you should choose one of them). Please note, that if you use Original Clusters, the original clusters will be modified to include the new qualifying proteins
    """
    modified_clusters = AllClusters()
    
    if csv_filename != "":
        modified_clusters = AllClusters(csv_filename=csv_filename)
    elif protein_to_cluster_dict: # dictionary not empty
        modified_clusters = AllClusters(protein_to_cluster_dict=protein_to_cluster_dict)
    else:
        modified_clusters = original_clusters
    
    for key in clusters_to_qualifying_proteins:
        if clusters_to_qualifying_proteins[key]: # will return true if this cluster has qualifying proteins (list not empty)
            for protein in clusters_to_qualifying_proteins[key]:
                modified_clusters.add_protein_to_cluster(protein, key)
    return modified_clusters

ERROR! please specify a [csv_filename] or a [protein_to_cluster_dict] not found.


In [34]:
new_clusters = create_new_clusters(qualifying_proteins)

ERROR! please specify a [csv_filename] or a [protein_to_cluster_dict] not found.


In [38]:
print(new_clusters)

AllClusters has 199 clusters (use the print_all method to see them)


In [37]:
print(clusters.get_cluster_proteins(25))
print(new_clusters.get_cluster_proteins(25))

['TRPV3', 'KHDRBS2', 'PDZD2', 'TP73', 'TICAM1', 'PHKG1', 'KITLG', 'TBK1', 'RAC1', 'RUSC1', 'CBS', 'MAPK11', 'MAP2K6', 'PAWR', 'CDK10', 'SPTAN1', 'TRPV3', 'KHDRBS2', 'PDZD2', 'TP73', 'TICAM1', 'PHKG1', 'KITLG', 'TBK1', 'RAC1', 'RUSC1', 'CBS', 'MAPK11', 'MAP2K6', 'PAWR', 'CDK10', 'SPTAN1', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC']
['TRPV3', 'KHDRBS2', 'PDZD2', 'TP73', 'TICAM1', 'PHKG1', 'KITLG', 'TBK1', 'RAC1', 'RUSC1', 'CBS', 'MAPK11', 'MAP2K6', 'PAWR', 'CDK10', 'SPTAN1', 'TRPV3', 'KHDRBS2', 'PDZD2', 'TP73', 'TICAM1', 'PHKG1', 'KITLG', 'TBK1', 'RAC1', 'RUSC1', 'CBS', 'MAPK11', 'MAP2K6', 'PAWR', 'CDK10', 'SPTAN1', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC', 'AKT1', 'SRC']


In [12]:
print(clusters.get_cluster_proteins(185))
print(new_clusters.get_cluster_proteins(185))

['PPP1R15A', 'MMP14', 'BCL2L12', 'PTX3', 'HLA-C', 'KAT8', 'EIF2S2', 'RASSF7', 'AKT2', 'PRKAA2', 'PIP5K1A', 'RAB11A', 'CASP3', 'PPP1R15A', 'MMP14', 'BCL2L12', 'PTX3', 'HLA-C', 'KAT8', 'EIF2S2', 'RASSF7', 'AKT2', 'PRKAA2', 'PIP5K1A', 'RAB11A', 'CASP3']
['PPP1R15A', 'MMP14', 'BCL2L12', 'PTX3', 'HLA-C', 'KAT8', 'EIF2S2', 'RASSF7', 'AKT2', 'PRKAA2', 'PIP5K1A', 'RAB11A', 'CASP3', 'PPP1R15A', 'MMP14', 'BCL2L12', 'PTX3', 'HLA-C', 'KAT8', 'EIF2S2', 'RASSF7', 'AKT2', 'PRKAA2', 'PIP5K1A', 'RAB11A', 'CASP3']
