In [31]:

from matrix_class import *
from cluster_class import *
from degreelist_class import *

In [32]:
# Path of the network file that is passed to ProteinMatrix in a later cell.
actual_matrix_file = "../data/networks/DREAM_files/dream_3.txt"
# Path of the cluster file that is passed to AllClusters in a later cell.
cluster_file = "../data/testing_data/3344522.7320912.1_ppi_anonym_v2.txt"

In [33]:
# Build the ProteinMatrix for the network stored at `actual_matrix_file`.
matrix = ProteinMatrix(actual_matrix_file)

In [34]:
# Build the AllClusters object from the cluster file at `cluster_file`.
clusters = AllClusters(cluster_file)

In [35]:
# Build the DegreeList from the same `matrix` object used by the rest of the notebook.
degreelist = DegreeList(matrix)

In [36]:
# Print the string form of `clusters` (a one-line summary, per the output below).
print(f"Clusters:\n{clusters}")

Clusters:
AllClusters has 1024 clusters (use the print_all method to see them)


#### function to identify clusters that are connected enough

In [95]:
def find_clusters_that_match_criteria(matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0) -> list:
    """
    Purpose:    find the clusters that are 'connected enough', i.e. whose
                submatrix has few connected components relative to the
                number of proteins it contains.
    Parameters:
        matrix - the ProteinMatrix each cluster's SubMatrix is built from
        clusters - the AllClusters whose clusters are examined
        degreelist - not used by this function; kept in the signature so
                     existing calls keep working
        ratio and constant - a cluster qualifies when
                     num_components < ratio * num_proteins + constant
    Returns:    list of the keys (as yielded by clusters.get_all_clusters())
                of the clusters that qualify, in iteration order
    """
    cluster_nums_that_qualify = []

    for key in clusters.get_all_clusters():
        # create a submatrix out of the proteins in the cluster
        submatrix = SubMatrix(clusters.get_cluster_proteins(key), matrix)
        # only the component count matters here; the labels are not needed
        num_components, _ = submatrix.get_num_components_and_labels()
        num_proteins = len(submatrix.get_list_of_proteins())

        # strictly fewer components than the linear bound => connected enough
        if num_components < ratio * num_proteins + constant:
            cluster_nums_that_qualify.append(key)

    return cluster_nums_that_qualify


# With ratio=1 (and constant=0) a cluster qualifies as soon as it has
# fewer components than proteins.
result = find_clusters_that_match_criteria(
    matrix=matrix,
    clusters=clusters,
    degreelist=degreelist,
    ratio=1,
)
print(result)

[0, 7, 8, 12, 14, 17, 21, 25, 26, 29, 30, 31, 35, 36, 37, 39, 40, 43, 44, 47, 49, 61, 63, 69, 73, 75, 84, 85, 86, 90, 95, 96, 98, 99, 104, 110, 111, 114, 115, 117, 121, 122, 123, 125, 126, 127, 131, 132, 139, 141, 142, 148, 149, 150, 151, 152, 153, 154, 156, 158, 159, 161, 164, 171, 174, 175, 179, 185, 189, 194, 195, 202, 206, 212, 215, 226, 233, 235, 239, 242, 247, 253, 257, 258, 269, 272, 273, 297, 306, 325, 333, 338, 360, 371, 372, 382, 392, 401, 403, 405, 409, 411, 418, 419, 420, 432, 433, 443, 448, 454, 461, 464, 468, 479, 493, 495, 500, 507, 516, 533, 534, 539, 540, 551, 554, 557, 558, 562, 571, 573, 575, 578, 589, 595, 596, 598, 602, 605, 606, 608, 614, 615, 616, 617, 619, 624, 627, 632, 633, 638, 640, 649, 650, 661, 666, 673, 674, 681, 688, 689, 695, 698, 700, 704, 711, 712, 717, 723, 728, 735, 738, 740, 741, 745, 750, 757, 764, 770, 774, 775, 776, 779, 780, 781, 788, 795, 803, 804, 805, 807, 815, 818, 822, 823, 829, 832, 837, 839, 848, 849, 860, 864, 873, 878, 880, 881, 883, 8

#### next: a function that takes a cluster and finds the proteins that would connect a sufficient number of its components

In [72]:
def find_proteins_that_match_criteria(cluster_num: int, matrix: ProteinMatrix, clusters: AllClusters, degreelist: DegreeList, ratio: float = .5, constant: int = 0, min_components_that_protein_connects: int = -1, max_degree: int = 500) -> list:
    """
    Purpose:    find the proteins that would connect enough of the
                components of one cluster.
    Parameters:
        cluster_num - key of the cluster to examine
        matrix - the ProteinMatrix used to build the cluster's SubMatrix and
                 to look up each protein's degree
        clusters - the AllClusters containing the cluster
        degreelist - the DegreeList that supplies the candidate proteins and
                     the edge / component queries
        ratio and constant - only used when min_components_that_protein_connects
                     is left at -1; the threshold then becomes
                     constant + ratio * (number of proteins in the cluster)
        min_components_that_protein_connects - explicit threshold; -1 means
                     'derive it from ratio and constant'
        max_degree - proteins with a degree above this are skipped
    Returns:    list of proteins whose degree is between the threshold and
                max_degree (inclusive) and that connect at least `threshold`
                components of the cluster, in the order given by
                degreelist.get_list_of_proteins_sorted_by_degree()

    NOTE(review): the previous docstring described the derived threshold as
    ratio*num_components + constant, but the code has always used the number
    of proteins in the cluster - confirm which one is intended.
    TODO : function could be improved by passing (submatrix) info from the find_cluster_that_match function, but for now, this is ok.
    """
    # fetched once: the cluster's protein list does not change inside this function
    cluster_proteins = clusters.get_cluster_proteins(cluster_num)

    if min_components_that_protein_connects == -1:
        min_components_that_protein_connects = constant + ratio * len(cluster_proteins)

    submatrix = SubMatrix(cluster_proteins, matrix)
    _, labels = submatrix.get_num_components_and_labels()

    ### POPULATE COMPONENT DICTIONARY ###
    # protein : component_num. labels[k] is the component of the k-th protein
    # in the submatrix's protein list, so one pass over the pairs is enough.
    component_dictionary = {
        protein: int(label)
        for protein, label in zip(submatrix.get_list_of_proteins(), labels)
    }

    ### FIND CONNECTED PROTEINS AND DETERMINE IF THEY QUALIFY ###
    qualifying_proteins = []

    for protein in degreelist.get_list_of_proteins_sorted_by_degree():
        degree = matrix.find_degree(protein)

        # cheap pre-filter: a protein cannot connect more components than it has edges
        if not (min_components_that_protein_connects <= degree <= max_degree):
            continue

        num_edges, which_proteins = degreelist.determine_num_edges_to_cluster(protein, cluster_proteins, also_return_which_proteins=True)

        if num_edges < min_components_that_protein_connects:
            continue

        set_of_components_that_protein_connects = degreelist.which_components_of_a_cluster_would_a_protein_connect(protein, cluster_proteins, component_dictionary, connected_proteins_within_cluster=which_proteins)

        if len(set_of_components_that_protein_connects) >= min_components_that_protein_connects:
            qualifying_proteins.append(protein)

    return qualifying_proteins

In [52]:
# Proteins that would connect at least 3 components of the first qualifying cluster.
thing = find_proteins_that_match_criteria(
    cluster_num=result[0],
    matrix=matrix,
    clusters=clusters,
    degreelist=degreelist,
    min_components_that_protein_connects=3,
    max_degree=500,
)
print(thing)

the components that ITGA6 connects: {3, 4}
the components that ITGB4 connects: {3, 4}
[]


In [53]:
# Same query as the previous cell, this time for cluster 37.
thing = find_proteins_that_match_criteria(
    cluster_num=37,
    matrix=matrix,
    clusters=clusters,
    degreelist=degreelist,
    min_components_that_protein_connects=3,
    max_degree=500,
)
print(thing)

the components that ACKR4 connects: {56, 40, 5}
the components that CORT connects: {65, 3, 36, 49, 55}
the components that CXCR3 connects: {24, 5, 45}
the components that MMP9 connects: {27, 51, 38}
the components that GNAO1 connects: {66, 3, 70, 49, 21}
the components that CXCR2 connects: {27, 51, 77, 38}
the components that GNAI3 connects: {66, 3, 4, 70, 49, 82, 21, 60}
['ACKR4', 'CORT', 'CXCR3', 'MMP9', 'GNAO1', 'CXCR2', 'GNAI3']


the following is a loop that takes ~15 minutes to find the qualifying proteins for each (qualifying) cluster

In [73]:
# For every qualifying cluster, look for proteins that would connect at
# least 3 of its components (slow: see the note above this cell).
for cluster_num in result:
    print(f"CLUSTER {cluster_num}")
    res = find_proteins_that_match_criteria(cluster_num, matrix, clusters, degreelist, min_components_that_protein_connects=3, max_degree=500)
    # Check this cluster's own result (`res`), not the outer `result` list,
    # so that clusters without qualifying proteins are not reported.
    if len(res) > 0:
        print(f"- - - - qualifying proteins for cluster {cluster_num}: {res}")


CLUSTER 12


KeyboardInterrupt: 

now i will proceed to find clusters that are decently connected (ratio of .7, so the number of components must be less than .7 * number of proteins)

In [68]:
# Stricter selection: fewer than .7 * (number of proteins) components.
new_result = find_clusters_that_match_criteria(
    matrix=matrix,
    clusters=clusters,
    degreelist=degreelist,
    ratio=.7,
)

In [69]:
# Print the keys of the clusters that passed the ratio=.7 selection.
print(new_result)

[12, 114, 123, 161, 195, 257, 272, 273, 306, 333, 382, 392, 405, 411, 419, 432, 433, 468, 495, 516, 533, 540, 557, 571, 573, 578, 596, 598, 605, 614, 638, 640, 649, 661, 698, 700, 711, 723, 745, 750, 775, 776, 779, 781, 803, 807, 832, 864, 873, 900, 915, 920, 927, 935, 937, 945, 959, 974]


next, i will find the proteins (of degree <= 200) that are pretty connected to the clusters (a protein qualifies if the number of components it connects is at least .25 * the number of proteins in the cluster)

In [77]:
# ratio=.25, degree cutoff 200: report only the clusters that have qualifying proteins.
for cluster_num in new_result:
    print(f"checking cluster {cluster_num}")
    res = find_proteins_that_match_criteria(
        cluster_num=cluster_num,
        matrix=matrix,
        clusters=clusters,
        degreelist=degreelist,
        ratio=.25,
        max_degree=200,
    )
    if not res:
        continue
    print(f"- - - - qualifying proteins for cluster {cluster_num}: {res}")

checking cluster 12
checking cluster 114
checking cluster 123
checking cluster 161
checking cluster 195
checking cluster 257
- - - - qualifying proteins for cluster 257: ['FZD1', 'LRP5', 'FZD3', 'LRP6']
checking cluster 272
- - - - qualifying proteins for cluster 272: ['PKD2']
checking cluster 273
checking cluster 306
checking cluster 333
checking cluster 382
checking cluster 392
checking cluster 405
checking cluster 411
checking cluster 419
checking cluster 432
- - - - qualifying proteins for cluster 432: ['IRAK4', 'IRAK1', 'TRAF6']
checking cluster 433
checking cluster 468
checking cluster 495
- - - - qualifying proteins for cluster 495: ['NOTCH1']
checking cluster 516
checking cluster 533
checking cluster 540
checking cluster 557
checking cluster 571
checking cluster 573
- - - - qualifying proteins for cluster 573: ['SOCS3', 'TYK2', 'JAK1', 'JAK2']
checking cluster 578
checking cluster 596
checking cluster 598
checking cluster 605
checking cluster 614
checking cluster 638
checking c

the one below has a stricter max degree cutoff (80)

In [81]:
# ratio=.25, degree cutoff 80: report only the clusters that have qualifying proteins.
for cluster_num in new_result:
    print(f"checking cluster {cluster_num}")
    res = find_proteins_that_match_criteria(
        cluster_num=cluster_num,
        matrix=matrix,
        clusters=clusters,
        degreelist=degreelist,
        ratio=.25,
        max_degree=80,
    )
    if not res:
        continue
    print(f"- - - - qualifying proteins for cluster {cluster_num}: {res}")

checking cluster 12
checking cluster 114
checking cluster 123
checking cluster 161
checking cluster 195
checking cluster 257
- - - - qualifying proteins for cluster 257: ['FZD1', 'LRP5', 'FZD3', 'LRP6']
checking cluster 272
- - - - qualifying proteins for cluster 272: ['PKD2']
checking cluster 273
checking cluster 306
checking cluster 333
checking cluster 382
checking cluster 392
checking cluster 405
checking cluster 411
checking cluster 419
checking cluster 432
- - - - qualifying proteins for cluster 432: ['IRAK4', 'IRAK1']
checking cluster 433
checking cluster 468
checking cluster 495
checking cluster 516
checking cluster 533
checking cluster 540
checking cluster 557
checking cluster 571
checking cluster 573
- - - - qualifying proteins for cluster 573: ['SOCS3', 'TYK2', 'JAK1', 'JAK2']
checking cluster 578
checking cluster 596
checking cluster 598
checking cluster 605
checking cluster 614
checking cluster 638
checking cluster 640
checking cluster 649
- - - - qualifying proteins for c

the one below requires proteins to be more connected (ratio of .45, with a max degree cutoff of 100)


In [80]:
# ratio=.45, degree cutoff 100: report only the clusters that have qualifying proteins.
for cluster_num in new_result:
    print(f"checking cluster {cluster_num}")
    res = find_proteins_that_match_criteria(
        cluster_num=cluster_num,
        matrix=matrix,
        clusters=clusters,
        degreelist=degreelist,
        ratio=.45,
        max_degree=100,
    )
    if not res:
        continue
    print(f"- - - - qualifying proteins for cluster {cluster_num}: {res}")
    

checking cluster 12
checking cluster 114
checking cluster 123
checking cluster 161
checking cluster 195
checking cluster 257
- - - - qualifying proteins for cluster 257: ['FZD3']
checking cluster 272
checking cluster 273
checking cluster 306
checking cluster 333
checking cluster 382
checking cluster 392
checking cluster 405
checking cluster 411
checking cluster 419
checking cluster 432
checking cluster 433
checking cluster 468
checking cluster 495
checking cluster 516
checking cluster 533
checking cluster 540
checking cluster 557
checking cluster 571
checking cluster 573
- - - - qualifying proteins for cluster 573: ['JAK2']
checking cluster 578
checking cluster 596
checking cluster 598
checking cluster 605
checking cluster 614
checking cluster 638
checking cluster 640
checking cluster 649
checking cluster 661
- - - - qualifying proteins for cluster 661: ['TRAF3', 'TRAF2']
checking cluster 698
checking cluster 700
checking cluster 711
checking cluster 723
checking cluster 745
checking c

In [88]:
def default_linear_fxn(num_proteins: int, ratio: float = .5, constant: int = 0):
    """
    Purpose:    linear upper bound on the number of components a cluster may
                have and still be deemed 'connected enough'.
    Parameters:
        num_proteins - the number of proteins in the cluster
        ratio - slope of the linear bound
        constant - offset added to the bound
    Returns:    ratio * num_proteins + constant
    """
    upper_bound = ratio * num_proteins + constant
    return upper_bound



In [101]:
import json
dream3_cluster_file = "../data/results/DREAM-3-cc/d3_5_100.json-cluster.json"

# Start from an empty dict, then replace it with the parsed JSON cluster file.
dream3_clusters_dict = {}
with open(dream3_cluster_file) as cluster_dict_file:
    dream3_clusters_dict = json.loads(cluster_dict_file.read())

In [104]:
# Build an AllClusters from the dictionary parsed out of the DREAM-3 JSON cluster file.
new_clusters = AllClusters(protein_to_cluster_dict=dream3_clusters_dict)


In [105]:
# Default criteria (ratio=.5, constant=0) applied to the JSON-derived clusters.
clusters_that_are_somewhat_connected = find_clusters_that_match_criteria(
    matrix=matrix,
    clusters=new_clusters,
    degreelist=degreelist,
)


In [106]:
# Display the keys of the clusters that passed the default (ratio=.5, constant=0) selection.
clusters_that_are_somewhat_connected

[272, 273, 411, 419, 468, 571, 598, 698, 711, 745, 750, 803, 915, 959]

In [111]:

dream3_matrix_file = "../data/networks/DREAM_files/dream_3.txt"
dream3_cluster_file = "../data/results/DREAM-3-cc/d3_5_100.json-cluster.json"

# convert actual cluster file to a dictionary!!
with open(dream3_cluster_file, "r") as cluster_dict_file:
    dream3_clusters_dict = json.load(cluster_dict_file)

# Build the matrix once and share it between the criteria check and the
# DegreeList, instead of loading the same network file twice.
dream3_matrix = ProteinMatrix(dream3_matrix_file)

thing2 = find_clusters_that_match_criteria(
    dream3_matrix,
    AllClusters(protein_to_cluster_dict=dream3_clusters_dict),
    DegreeList(dream3_matrix),
)


AllClusters has 1024 clusters (use the print_all method to see them)


KeyboardInterrupt: 

In [110]:
# Display the list that find_clusters_that_match_criteria returned for the freshly loaded DREAM-3 objects.
thing2

[272, 273, 411, 419, 468, 571, 598, 698, 711, 745, 750, 803, 915, 959]