
# d3_ic10_small
- this file is for dream3 data, with small clusters generated using the ic_10 parameter (spectral clustering)
- we consider two different cuttoffs for adding proteins back:
1. Squareroot method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. linear method: a protein must connect at least 0.5 * num components componetns to qualify for addition to a cluster


In [3]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **must change filenames:**

In [4]:
interactions_file = "data/interactions/3_signal_omnipath_directed_v3.txt"
clusters_file = "data/d3_ic10_small/dream3_small-cluster.json"

In [5]:
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(interactions_filepath=interactions_file, clusters_filepath=clusters_file)

*may want to change parameters below:*

In [6]:
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=False, protein_ratio=.5, protein_constant=0)

In [7]:
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=True, protein_ratio=1, protein_constant=0)

In [8]:
sqrt_qualifying_proteins

{0: ['PRKCA'],
 1: ['SRC'],
 2: ['EGFR'],
 3: ['PLK1'],
 4: ['CSNK2A1'],
 5: ['FYN', 'EGFR', 'SRC'],
 6: ['LCP2',
  'HCK',
  'CRKL',
  'CRK',
  'VAV1',
  'ERBB4',
  'PDGFRB',
  'PLCG1',
  'PTPN1',
  'PTPN11',
  'LYN',
  'SHC1',
  'LCK',
  'GRB2',
  'ABL1',
  'EGFR',
  'MAPK1',
  'SRC'],
 7: ['LCP2',
  'LAT',
  'MET',
  'CRKL',
  'CRK',
  'MAP3K1',
  'TGFBR1',
  'SYK',
  'RAF1',
  'GRB2',
  'MAPK8',
  'ABL1',
  'MAPK1'],
 8: ['ABL2', 'ABL1', 'SRC'],
 9: ['KDR',
  'LAT',
  'CTNND1',
  'HCK',
  'MET',
  'RET',
  'PDGFRB',
  'CSK',
  'PLCG1',
  'INSR',
  'CBL',
  'PTPN11',
  'LYN',
  'LCK',
  'GRB2',
  'PIK3R1',
  'FYN',
  'ABL1',
  'EGFR',
  'MAPK3',
  'MAPK1',
  'SRC'],
 10: ['MYC', 'ATM', 'CDK2', 'TP53', 'CDK1'],
 11: ['CSNK2A1'],
 12: ['TRADD',
  'TNFRSF1A',
  'RIPK1',
  'IRAK1',
  'TRAF2',
  'CHUK',
  'IKBKB',
  'TRAF6'],
 13: ['STK3',
  'CDK4',
  'STK4',
  'AURKB',
  'ATR',
  'MAP3K7',
  'MAPK8',
  'ATM',
  'CDK2',
  'CDK1'],
 14: ['MAPK14', 'MAPK3', 'MAPK1'],
 15: ['MAPK14', 'MAPK3'

In [9]:
linear_qualifying_proteins

{0: ['PRKCA'],
 1: ['SRC'],
 2: ['EGFR'],
 3: ['PLK1'],
 4: ['CSNK2A1'],
 5: ['FYN', 'EGFR', 'SRC'],
 6: ['LCP2',
  'HCK',
  'CRKL',
  'CRK',
  'VAV1',
  'ERBB4',
  'PDGFRB',
  'PLCG1',
  'PTPN1',
  'PTPN11',
  'LYN',
  'SHC1',
  'LCK',
  'GRB2',
  'ABL1',
  'EGFR',
  'MAPK1',
  'SRC'],
 7: ['LCP2',
  'LAT',
  'MET',
  'CRKL',
  'CRK',
  'MAP3K1',
  'TGFBR1',
  'SYK',
  'RAF1',
  'GRB2',
  'MAPK8',
  'ABL1',
  'MAPK1'],
 8: ['ABL1', 'SRC'],
 9: ['KDR',
  'LAT',
  'CTNND1',
  'HCK',
  'MET',
  'RET',
  'PDGFRB',
  'CSK',
  'PLCG1',
  'INSR',
  'CBL',
  'PTPN11',
  'LYN',
  'LCK',
  'GRB2',
  'PIK3R1',
  'FYN',
  'ABL1',
  'EGFR',
  'MAPK3',
  'MAPK1',
  'SRC'],
 11: ['CSNK2A1'],
 12: ['TRADD', 'TNFRSF1A', 'TRAF6'],
 13: ['ATR', 'ATM', 'CDK2'],
 14: ['MAPK3', 'MAPK1'],
 15: ['MAPK14', 'MAPK3', 'MAPK1'],
 17: ['EP300'],
 20: ['MAP2K4', 'MAPK14'],
 34: ['CSNK2A1'],
 35: ['EP300', 'MAPK8', 'AR', 'MAPK14', 'TP53', 'MAPK3', 'AKT1', 'MAPK1'],
 36: ['MAPK3', 'MAPK1'],
 37: ['PRKACA'],
 38: ['MA

In [20]:
# from recipe_utils import get_cluster_connectivity

from matrix_class import SubMatrix
def get_cluster_connectivity (
    matrix:ProteinMatrix,
    degreelist:DegreeList,
    clusters:AllClusters,
    added_proteins:dict={},
    sorted:bool=False,

):
    """
    returns a dictionary of cluster_num : percent_connectivity
    note: uses SubMatrix from matrix class

    can specify if you want sorted.
    if added_proteins is specified (not empty), then it will add those proteins to the cluster before calculating connectivity

    """
    proteins = matrix.get_list_of_proteins()
    degree_dict = dict(degreelist.sorted_protein_degree_dict)
    matrix_df = matrix.get_matrix()
    cluster_connectivity = {}

    for cluster_num in clusters.get_all_cluster_labels():
        # get all the proteins associated to a cluster  
        cluster_proteins = clusters.get_cluster_proteins(cluster_num)
        
        # added_cluster_proteins is empty in the case that none have been added, or if added proteins was not specified
        added_cluster_proteins = [] if not added_proteins or cluster_num not in added_proteins else added_proteins[cluster_num]
        # get the list of potential proteins to add to cluster 
        submatrix = SubMatrix(list(set(cluster_proteins + added_cluster_proteins)), matrix)
        components_and_labels = submatrix.get_num_components_and_labels()
        num_components = components_and_labels[0]

        # current ratio of clusters to proteins
        num_proteins = len(cluster_proteins)
        percent_connectivity = (num_proteins - num_components)/num_proteins
        cluster_connectivity[cluster_num] = percent_connectivity

    print(cluster_connectivity)
    if sorted:
        sorted_cluster_connectivity = {k: v for k, v in sorted(cluster_connectivity.items(), key=lambda item: item[1])}
        print(sorted_cluster_connectivity)
        return sorted_cluster_connectivity
    
    return cluster_connectivity


sorted_cluster_connectivity = get_cluster_connectivity(matrix, degreelist, clusters, {}, sorted=True)
sorted_cluster_connectivity


{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.5454545454545454, 5: 0.23076923076923078, 6: 0.6, 7: 0.8461538461538461, 8: 0.0, 9: 0.7647058823529411, 10: 0.2857142857142857, 11: 0.23076923076923078, 12: 0.4117647058823529, 13: 0.38461538461538464, 14: 0.0, 15: 0.0, 16: 0.36666666666666664, 17: 0.0, 18: 0.2, 19: 0.0, 20: 0.18181818181818182, 21: 0.1111111111111111, 22: 0.0, 23: 0.19230769230769232, 24: 0.2647058823529412, 25: 0.08333333333333333, 26: 0.15217391304347827, 27: 0.5, 28: 0.16666666666666666, 29: 0.0, 30: 0.05, 31: 0.0, 32: 0.19047619047619047, 33: 0.5925925925925926, 34: 0.08333333333333333, 35: 0.6296296296296297, 36: 0.5357142857142857, 37: 0.35, 38: 0.23809523809523808, 39: 0.5454545454545454, 40: 0.2, 41: 0.36363636363636365, 42: 0.0, 43: 0.0, 44: 0.0, 45: 0.0, 46: 0.36363636363636365, 47: 0.0, 48: 0.1, 49: 0.0, 50: 0.0, 51: 0.6363636363636364, 52: 0.03571428571428571, 53: 0.0, 54: 0.08333333333333333, 55: 0.1, 56: 0.0625, 57: 0.05263157894736842, 58: 0.0, 59: 0.052631578947368

TypeError: 'bool' object is not callable

In [None]:
# from recipe_utils import get_cluster_connectivity
cluster_connectivity = get_cluster_connectivity(matrix, degreelist, clusters)
sorted_cluster_connectivity

In [None]:
from recipe_utils import top_n_proteins
three_sqrt_qualifying = top_n_proteins(sqrt_qualifying_clusters, n=3)
three_sqrt_qualifying

In [None]:
from recipe_utils import get_cluster_connectivity
sorted_cluster_connectivity = get_cluster_connectivity(matrix, degreelist, clusters, sorted=True)
sorted_cluster_connectivity