In [3]:

from matrix_class import *
from cluster_class import *
from degreelist_class import *
import json 

In [4]:
    # Inputs for the small test case (the "fake" files under testing_data).
    testing_matrix_file = "../data/testing_data/fake_cluster_dream.txt"
    testing_cluster_file = "../data/testing_data/fake_cluster.txt"

    # Inputs for the DREAM-3 run: the network file and a JSON cluster file
    # (presumably a clustering result for that network -- confirm provenance).
    dream3_matrix_file = "../data/networks/DREAM_files/dream_3.txt"
    dream3_cluster_file = "../data/results/DREAM-3-cc/d3_5_100.json-cluster.json" 

    # Placeholder value; it is replaced by the parsed JSON just below.
    dream3_clusters_dict = {}
    # Parse the DREAM-3 cluster JSON file into a dictionary. It is later handed
    # to AllClusters through its protein_to_cluster_dict keyword.
    with open(dream3_cluster_file,"r") as cluster_dict_file:
        dream3_clusters_dict = json.load(cluster_dict_file)

In [5]:
# Build the test-case objects from the testing files. Here AllClusters is
# constructed from a file path; the DREAM-3 cell constructs it from a dict.
# The cell that calls find_clusters_and_proteins_together(testing_matrix, ...)
# needs these names to exist in the running kernel.
testing_matrix = ProteinMatrix(testing_matrix_file)
testing_clusters = AllClusters(testing_cluster_file)
testing_degreelist = DegreeList(testing_matrix)

In [16]:
# Build the DREAM-3 objects used by the analysis cells below.
# AllClusters is constructed from the already-parsed JSON dictionary rather
# than from a file path.
# NOTE: these global names (clusters, matrix, degreelist) are identical to the
# parameter names of the functions defined below; inside those functions the
# parameters take precedence.
clusters = AllClusters(protein_to_cluster_dict=dream3_clusters_dict)
matrix = ProteinMatrix(dream3_matrix_file)
degreelist = DegreeList(matrix)

In [50]:
def qualifying_proteins_using_submatrix(cluster_num: int, submatrix: SubMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> list:
    """
    A version of find_proteins_that_match_criteria that takes the cluster's
    submatrix as a parameter, and therefore doesn't need to construct one.

    Parameters:
        cluster_num - the number of the cluster to analyse
        submatrix - the SubMatrix already built from that cluster's proteins
        clusters - an AllClusters containing proteins grouped into clusters
        degreelist - a DegreeList; supplies the candidate proteins and their
                     edges into the cluster
        ratio and constant - only used when
                     min_components_that_protein_connects is -1; the threshold
                     is then constant + ratio * (number of proteins in the
                     cluster)
        min_components_that_protein_connects - explicit threshold; the default
                     of -1 means "derive it from ratio and constant"
        max_degree - accepted for signature parity with
                     find_proteins_that_match_criteria but NOT applied here:
                     this function is not given the ProteinMatrix that the
                     original uses for its degree filter
    Purpose:    finds the proteins that have at least the threshold number of
                edges into the cluster AND touch at least the threshold number
                of distinct components of the cluster
    Returns:    a list of the qualifying proteins, in the order produced by
                degreelist.get_list_of_proteins_sorted_by_degree()
    """
    # The cluster's proteins do not change during the search, so fetch them
    # once instead of once per candidate protein.
    cluster_proteins = clusters.get_cluster_proteins(cluster_num)

    if min_components_that_protein_connects == -1:
        min_components_that_protein_connects = constant + ratio * len(cluster_proteins)

    num_components, labels = submatrix.get_num_components_and_labels()

    ### POPULATE COMPONENT DICTIONARY ###
    component_dictionary = dict() # protein : component_num
    # Build the array once; labels[i] is taken to be the component of the i-th
    # protein returned by submatrix.get_list_of_proteins().
    submatrix_proteins = np.array(submatrix.get_list_of_proteins())
    for component_num in range(num_components):
        for protein in submatrix_proteins[np.nonzero(labels == component_num)]:
            component_dictionary[protein] = component_num

    ## FIND CONNECTED PROTEINS AND DETERMINE IF THEY QUALIFY ###
    qualifying_proteins = list()

    for protein in degreelist.get_list_of_proteins_sorted_by_degree():
        num_edges, which_proteins = degreelist.determine_num_edges_to_cluster(protein, cluster_proteins, also_return_which_proteins=True)

        # Pre-filter on the edge count so the component lookup is only done
        # for proteins with enough edges into the cluster.
        if num_edges < min_components_that_protein_connects:
            continue

        set_of_components_that_protein_connects = degreelist.which_components_of_a_cluster_would_a_protein_connect(protein, cluster_proteins, component_dictionary, connected_proteins_within_cluster=which_proteins)

        if len(set_of_components_that_protein_connects) >= min_components_that_protein_connects:
            qualifying_proteins.append(protein)

    return qualifying_proteins

In [29]:
def find_clusters_and_proteins_together(matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, cluster_ratio: float = .5, cluster_constant: int = 0, protein_ratio: float = .5, protein_constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> tuple:
    """
    function is a version of find_clusters_that_match_criteria that is meant
    to find the corresponding proteins at the same time as it finds a cluster,
    so that the submatrix doesn't need to be reconstructed.

    NOTE: in this draft the per-cluster protein analysis is switched off, so
    the returned dictionary is always empty and the protein-related parameters
    are ignored.

    Parameters:
        matrix - a ProteinMatrix of all protein interactions
        clusters - an AllClusters containing proteins grouped into clusters
        degreelist - a DegreeList (unused while the protein analysis is off)
        cluster_ratio and cluster_constant - used together to determine which
            clusters qualify: a cluster qualifies when its number of
            components is less than
            cluster_ratio * (number of proteins in the submatrix) + cluster_constant
        protein_ratio, protein_constant, min_components_that_protein_connects,
        max_degree - intended for the protein analysis; unused in this draft
    Purpose:    determines clusters that are mostly highly connected
    Returns:    a tuple of (a list containing the numbers of the clusters that
                qualify, a dictionary meant to link each cluster to a list of
                its qualifying proteins -- empty in this draft)
    """

    cluster_nums_that_qualify = list()
    qualifying_proteins_dict = dict()

    for cluster_num in clusters.get_all_clusters():
        # create a submatrix out of the proteins in the cluster
        submatrix = SubMatrix(clusters.get_cluster_proteins(cluster_num), matrix)
        # only the component count is needed to decide whether the cluster qualifies
        num_components, _ = submatrix.get_num_components_and_labels()

        if num_components < cluster_ratio * len(submatrix.get_list_of_proteins()) + cluster_constant:
            # add cluster to list showing that it qualifies
            print(f"adding cluster {cluster_num}")
            cluster_nums_that_qualify.append(cluster_num)
            # the protein analysis (qualifying_proteins_using_submatrix) is
            # deliberately not run here; qualifying_proteins_dict stays empty

    return cluster_nums_that_qualify, qualifying_proteins_dict


In [43]:
def find_clusters_that_match_criteria(matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0) -> list:
    """
    Parameters: 
        matrix - a ProteinMatrix of all protein interactions
        clusters - an AllClusters containing proteins grouped into clusters
        degreelist - a DegreeList; not used by this function, kept so the
                     signature lines up with the other functions here
        ratio and constant - used together to determine which clusters
                     qualify, the threshold being ratio * input + constant
    Purpose:    function goes through all clusters and finds those that are 
                'connected enough'. to qualify, the number of components in the 
                cluster must be strictly less than 
                ratio * (number of proteins in the cluster's submatrix) + constant
    Returns:    a list containing the numbers of the clusters that qualify, in 
                the order given by clusters.get_all_clusters()
    """

    cluster_nums_that_qualify = list()

    for cluster_num in clusters.get_all_clusters():
        # create a submatrix out of the proteins in the cluster
        submatrix = SubMatrix(clusters.get_cluster_proteins(cluster_num), matrix)
        # only the component count matters here; the labels are not needed
        num_components, _ = submatrix.get_num_components_and_labels()

        threshold = ratio * len(submatrix.get_list_of_proteins()) + constant
        if num_components < threshold:
            cluster_nums_that_qualify.append(cluster_num)

    return cluster_nums_that_qualify


def find_proteins_that_match_criteria(cluster_num: int, matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> list:
    """
    Parameters:
        cluster_num - the number of the cluster to analyse
        matrix - a ProteinMatrix of all protein interactions
        clusters - an AllClusters containing proteins grouped into clusters
        degreelist - a DegreeList; supplies the candidate proteins and their
                     edges into the cluster
        ratio and constant - only used when
                     min_components_that_protein_connects is -1; the threshold
                     is then constant + ratio * (number of proteins in the
                     cluster)
        min_components_that_protein_connects - explicit threshold; the default
                     of -1 means "derive it from ratio and constant"
        max_degree - proteins whose degree in matrix exceeds this are skipped
    Purpose:    can choose a ratio and a constant, or a min num components.
                a protein qualifies when its degree lies between the threshold
                and max_degree (inclusive), it has at least the threshold
                number of edges into the cluster, and it touches at least the
                threshold number of distinct components of the cluster
    Returns:    a list of the qualifying proteins, in the order produced by
                degreelist.get_list_of_proteins_sorted_by_degree()
    TODO : function could be improved by passing (submatrix) info from the find_cluster_that_match function, but for now, this is ok. (so that it doesn't need to be reconstructed)
    """
    # The cluster's proteins do not change during the search, so fetch them
    # once instead of once per candidate protein.
    cluster_proteins = clusters.get_cluster_proteins(cluster_num)

    if min_components_that_protein_connects == -1:
        min_components_that_protein_connects = constant + ratio * len(cluster_proteins)

    submatrix = SubMatrix(cluster_proteins, matrix)
    num_components, labels = submatrix.get_num_components_and_labels()

    ### POPULATE COMPONENT DICTIONARY ###
    component_dictionary = dict() # protein : component_num
    # Build the array once; labels[i] is taken to be the component of the i-th
    # protein returned by submatrix.get_list_of_proteins().
    submatrix_proteins = np.array(submatrix.get_list_of_proteins())
    for component_num in range(num_components):
        for protein in submatrix_proteins[np.nonzero(labels == component_num)]:
            component_dictionary[protein] = component_num

    ## FIND CONNECTED PROTEINS AND DETERMINE IF THEY QUALIFY ###
    qualifying_proteins = list()

    for protein in degreelist.get_list_of_proteins_sorted_by_degree():
        degree = matrix.find_degree(protein)

        # degree filter: skip proteins with too few edges overall, or too many
        if (degree < min_components_that_protein_connects) or (degree > max_degree):
            continue

        num_edges, which_proteins = degreelist.determine_num_edges_to_cluster(protein, cluster_proteins, also_return_which_proteins=True)

        # Pre-filter on the edge count so the component lookup is only done
        # for proteins with enough edges into the cluster.
        if num_edges < min_components_that_protein_connects:
            continue

        set_of_components_that_protein_connects = degreelist.which_components_of_a_cluster_would_a_protein_connect(protein, cluster_proteins, component_dictionary, connected_proteins_within_cluster=which_proteins)

        if len(set_of_components_that_protein_connects) >= min_components_that_protein_connects:
            qualifying_proteins.append(protein)

    return qualifying_proteins


In [35]:
# Sanity check: with the protein analysis switched off, the cluster list
# returned here should equal what find_clusters_that_match_criteria returns
# for the same ratio (compare with the next cell's output).
find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=.95) ### -> PASS! does the same as the original fxn

adding cluster 7
adding cluster 25
adding cluster 29
adding cluster 30
adding cluster 36
adding cluster 49
adding cluster 61
adding cluster 73
adding cluster 75
adding cluster 86
adding cluster 99
adding cluster 110
adding cluster 121
adding cluster 132
adding cluster 150
adding cluster 151
adding cluster 152
adding cluster 161
adding cluster 164
adding cluster 174
adding cluster 185
adding cluster 189
adding cluster 194
adding cluster 195


([7,
  25,
  29,
  30,
  36,
  49,
  61,
  73,
  75,
  86,
  99,
  110,
  121,
  132,
  150,
  151,
  152,
  161,
  164,
  174,
  185,
  189,
  194,
  195],
 {})

In [36]:
# Reference result from the original function at the same ratio (.95).
find_clusters_that_match_criteria(matrix, clusters, degreelist, ratio=.95)

[7,
 25,
 29,
 30,
 36,
 49,
 61,
 73,
 75,
 86,
 99,
 110,
 121,
 132,
 150,
 151,
 152,
 161,
 164,
 174,
 185,
 189,
 194,
 195]

In [37]:
# Run on the testing data. This needs testing_matrix, testing_clusters and
# testing_degreelist to exist in the running kernel; the recorded NameError
# shows they did not when this cell was executed, so run the cell that builds
# them first.
find_clusters_and_proteins_together(testing_matrix, testing_clusters, testing_degreelist, cluster_ratio=1.1)

NameError: name 'testing_matrix' is not defined

In [75]:
def find_clusters_and_proteins_together(matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, cluster_ratio: float = .5, cluster_constant: int = 0, protein_ratio: float = .5, protein_constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> tuple:
    """
    function is a version of find_clusters_that_match_criteria, that, once it
    finds the cluster, finds corresponding proteins at the same time so that
    the submatrix doesn't need to be reconstructed

    Parameters: 
        matrix - a ProteinMatrix of all protein interactions
        clusters - an AllClusters containing proteins grouped into clusters
        degreelist - a DegreeList, passed on to the protein analysis
        cluster_ratio and cluster_constant - used together to determine which
            clusters qualify: a cluster qualifies when its number of
            components is less than
            cluster_ratio * (number of proteins in the submatrix) + cluster_constant
        protein_ratio and protein_constant - passed on as ratio and constant
            to qualifying_proteins_using_submatrix
        min_components_that_protein_connects and max_degree - passed on
            unchanged to qualifying_proteins_using_submatrix
    Purpose:    determines clusters that are mostly highly connected, then 
                determines which proteins, when added to the cluster, will 
                increase its connectedness
    Returns:    a tuple of (a list containing the numbers of the clusters that 
                qualify, a dictionary linking each qualifying cluster to a 
                list of its qualifying proteins)
    """

    cluster_nums_that_qualify = list()
    qualifying_proteins_dict = dict()

    for cluster_num in clusters.get_all_clusters():
        # create a submatrix out of the proteins in the cluster
        submatrix = SubMatrix(clusters.get_cluster_proteins(cluster_num), matrix)
        # only the component count is needed to decide whether the cluster
        # qualifies; the protein analysis re-reads the labels from the submatrix
        num_components, _ = submatrix.get_num_components_and_labels()

        if num_components >= cluster_ratio * len(submatrix.get_list_of_proteins()) + cluster_constant:
            continue  # not connected enough

        # add cluster to list showing that it qualifies, 
        cluster_nums_that_qualify.append(cluster_num)
        # then do analysis on the cluster, reusing the submatrix built above
        qualifying_proteins_dict[cluster_num] = qualifying_proteins_using_submatrix(
            cluster_num,
            submatrix,
            clusters,
            degreelist,
            ratio=protein_ratio,
            constant=protein_constant,
            min_components_that_protein_connects=min_components_that_protein_connects,
            max_degree=max_degree,
        )

    return cluster_nums_that_qualify, qualifying_proteins_dict


In [52]:
# Cluster threshold ratio .90, with an explicit protein threshold of 3;
# because it is not -1, protein_ratio / protein_constant are not used.
find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=.90, min_components_that_protein_connects=3)

adding cluster 25
adding cluster 152
adding cluster 161
adding cluster 185


([25, 152, 161, 185],
 {25: ['AKT1', 'SRC'],
  152: ['PPM1A',
   'PPP2CA',
   'CDK2',
   'GSK3B',
   'MAPK3',
   'CSNK2A1',
   'PRKACA',
   'SRC'],
  161: ['SNCA',
   'LRP6',
   'RPS6KA1',
   'NFE2L2',
   'ESR1',
   'GRB2',
   'PLK1',
   'STAT3',
   'MAPK8',
   'ABL1',
   'MAPK14',
   'GSK3B',
   'TP53',
   'AKT1',
   'CSNK2A1',
   'CDK1',
   'PRKCA'],
  185: ['TSC1', 'IKBKB']})

In [73]:
# Cluster threshold ratio .95; the protein threshold is derived per cluster as
# 2 + .05 * (number of proteins in the cluster).
find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=.95, protein_ratio=.05, protein_constant=2)

adding cluster 7
adding cluster 25
adding cluster 29
adding cluster 30
adding cluster 36
adding cluster 49
adding cluster 61
adding cluster 73
adding cluster 75
adding cluster 86
adding cluster 99
adding cluster 110
adding cluster 121
adding cluster 132
adding cluster 150
adding cluster 151
adding cluster 152
adding cluster 161
adding cluster 164
adding cluster 174
adding cluster 185
adding cluster 189
adding cluster 194
adding cluster 195


([7,
  25,
  29,
  30,
  36,
  49,
  61,
  73,
  75,
  86,
  99,
  110,
  121,
  132,
  150,
  151,
  152,
  161,
  164,
  174,
  185,
  189,
  194,
  195],
 {7: ['CSNK2A1', 'MAPK1', 'SRC'],
  25: ['AKT1', 'SRC'],
  29: ['CDK1', 'PRKACA'],
  30: [],
  36: [],
  49: ['EGFR', 'MAPK1', 'SRC'],
  61: ['GSK3B'],
  73: ['TP53', 'MAPK1', 'PRKCA', 'SRC'],
  75: [],
  86: [],
  99: [],
  110: ['PRKACA', 'SRC'],
  121: [],
  132: ['TP53', 'CDK1'],
  150: [],
  151: ['CSNK2A1', 'CDK1', 'PRKACA', 'SRC'],
  152: ['CSNK2A1', 'PRKACA'],
  161: ['CSNK2A1'],
  164: ['BCL2', 'MAPK8', 'MAPK1'],
  174: [],
  185: ['TSC1', 'IKBKB'],
  189: ['AKT1', 'SRC'],
  194: ['PRKCA', 'SRC'],
  195: []})

In [54]:
# Cluster threshold ratio .90; the protein threshold is
# 3 + .2 * (number of proteins in the cluster). In the recorded output no
# protein qualifies for any of the four clusters.
find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=.90, protein_ratio=.2, protein_constant=3)

adding cluster 25
adding cluster 152
adding cluster 161
adding cluster 185


([25, 152, 161, 185], {25: [], 152: [], 161: [], 185: []})

In [62]:
# Spot check with the original per-cluster function, using the same
# ratio/constant as the combined run (.05 / 2). The recorded result matches
# the combined run's entry for cluster 25.
find_proteins_that_match_criteria(25, matrix, clusters, degreelist, ratio=.05, constant=2)

['AKT1', 'SRC']

In [63]:
# Same spot check for cluster 29; the recorded result matches the combined
# run's entry for cluster 29.
find_proteins_that_match_criteria(29, matrix, clusters, degreelist, ratio=.05, constant=2)

['CDK1', 'PRKACA']

In [67]:
# NOTE: this call uses ratio=.01, not .05, so its result is not directly
# comparable with the combined run's entry for cluster 30.
find_proteins_that_match_criteria(30, matrix, clusters, degreelist, ratio=.01, constant=2)

['TP53']

In [71]:
# Same spot check for cluster 49; the recorded result matches the combined
# run's entry for cluster 49.
find_proteins_that_match_criteria(49, matrix, clusters, degreelist, ratio=.05, constant=2)

['EGFR', 'MAPK1', 'SRC']

In [76]:
# Re-run of the combined search with cluster_ratio=.95, protein_ratio=.05,
# protein_constant=2, this time without the "adding cluster" progress prints
# in the recorded output.
find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=.95, protein_ratio=.05, protein_constant=2)

([7,
  25,
  29,
  30,
  36,
  49,
  61,
  73,
  75,
  86,
  99,
  110,
  121,
  132,
  150,
  151,
  152,
  161,
  164,
  174,
  185,
  189,
  194,
  195],
 {7: ['CSNK2A1', 'MAPK1', 'SRC'],
  25: ['AKT1', 'SRC'],
  29: ['CDK1', 'PRKACA'],
  30: [],
  36: [],
  49: ['EGFR', 'MAPK1', 'SRC'],
  61: ['GSK3B'],
  73: ['TP53', 'MAPK1', 'PRKCA', 'SRC'],
  75: [],
  86: [],
  99: [],
  110: ['PRKACA', 'SRC'],
  121: [],
  132: ['TP53', 'CDK1'],
  150: [],
  151: ['CSNK2A1', 'CDK1', 'PRKACA', 'SRC'],
  152: ['CSNK2A1', 'PRKACA'],
  161: ['CSNK2A1'],
  164: ['BCL2', 'MAPK8', 'MAPK1'],
  174: [],
  185: ['TSC1', 'IKBKB'],
  189: ['AKT1', 'SRC'],
  194: ['PRKCA', 'SRC'],
  195: []})