
# d2_ic10_small
- this file is for dream2 data, with small clusters generated using the ic_10 parameter (spectral clustering)
- we consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster


In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do functional enrichment analysis!
import func_e.vocabs.all as vocabs
# the following code silences FutureWarnings: some functions used in the FUNC_E class emit them, and they cannot be fixed here because that code is third-party
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **Must change filenames:** update the two paths in the next cell before running.

In [2]:
# Input paths for this run; both are passed to
# initialize_matrix_clusters_degreelist in the next cell.
clusters_file = "data/d2_ic10_small/dream2_small-cluster.json"  # cluster definitions (JSON)
interactions_file = "data/interactions/2_ppi_inweb_v2.txt"  # interactions file

In [3]:
# Build the three working objects from the two input files configured above.
# NOTE(review): presumably these are a ProteinMatrix, an AllClusters and a
# DegreeList (the custom classes imported at the top) -- confirm in recipe_utils.
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath=interactions_file,
    clusters_filepath=clusters_file,
)

*may want to change parameters below:*

In [4]:
# Linear cutoff run: use_sqrt=False with protein_ratio=0.5, matching the
# "0.5 * num components" rule described in the notes at the top of this file.
# NOTE(review): cluster_ratio / cluster_constant presumably decide which
# clusters qualify -- confirm their meaning in recipe_utils.
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.5,
    protein_constant=0,
)

In [5]:
# Square-root cutoff run: use_sqrt=True with protein_ratio=1, matching the
# "sqrt(num components)" rule described in the notes at the top of this file.
# NOTE(review): cluster_ratio / cluster_constant presumably decide which
# clusters qualify -- confirm their meaning in recipe_utils.
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)

In [6]:
# Display the proteins selected by the square-root cutoff run.
# The output is a dict mapping an integer key (presumably the cluster number)
# to a list of protein names.
sqrt_qualifying_proteins

{0: ['LYN', 'PIK3R1'],
 2: ['CRK', 'PLCG1', 'NCK1', 'PIK3R1', 'FGFR1', 'ABL1', 'PTK2'],
 4: ['HDAC11',
  'MEF2C',
  'DLL1',
  'RBPJ',
  'TBL1X',
  'NCOA1',
  'SRF',
  'YY1',
  'VARS2',
  'GTF2H4',
  'NCOA3',
  'TCEA1P2',
  'MTA1',
  'PSMB7',
  'GTF2H3',
  'SNW1',
  'PSMB6',
  'GTF2H2',
  'SUPT4H1',
  'GTF2H1',
  'NFKB1',
  'ERCC2',
  'ESR1',
  'SUPT16H',
  'SUPT5H',
  'GTF2F2',
  'CREBBP',
  'MLLT4',
  'RELA',
  'POLR2G',
  'ARAF',
  'POLR2J',
  'ERCC3',
  'POLR2I',
  'TBP',
  'CCNH',
  'PSMC5',
  'PSMB1',
  'MNAT1',
  'POLR2D',
  'PSMD1',
  'POLR2H',
  'POLR2F',
  'EPRS',
  'POLR2L',
  'POLR2C'],
 5: ['SUMO1P3'],
 7: ['SMAD2', 'ESR1', 'EP300', 'CREBBP', 'HDAC1'],
 10: ['CTNNB1'],
 17: ['EP300', 'CREBBP'],
 19: ['UNC119', 'SETDB1', 'TP53'],
 23: ['ARID4B', 'CASP8', 'CASP3', 'HSP90AA2'],
 24: ['PML'],
 35: ['BMPR1B', 'SMAD4', 'TGFBR1'],
 50: ['FN1'],
 55: ['PIK3R1'],
 57: ['LCK'],
 61: ['ATP2C1'],
 63: ['MAPK3'],
 69: ['HDAC3'],
 73: ['EP300', 'CREBBP', 'HDAC1'],
 78: ['VCL', 'FN1'],
 8

In [7]:
# Display the proteins selected by the linear cutoff run.
# The output is a dict mapping an integer key (presumably the cluster number)
# to a list of protein names.
linear_qualifying_proteins

{4: ['HDAC11',
  'MEF2C',
  'DLL1',
  'RBPJ',
  'TBL1X',
  'NCOA1',
  'SRF',
  'YY1',
  'VARS2',
  'GTF2H4',
  'NCOA3',
  'TCEA1P2',
  'MTA1',
  'PSMB7',
  'GTF2H3',
  'SNW1',
  'PSMB6',
  'GTF2H2',
  'SUPT4H1',
  'GTF2H1',
  'NFKB1',
  'ERCC2',
  'ESR1',
  'SUPT16H',
  'SUPT5H',
  'GTF2F2',
  'CREBBP',
  'MLLT4',
  'RELA',
  'POLR2G',
  'ARAF',
  'POLR2J',
  'ERCC3',
  'POLR2I',
  'TBP',
  'CCNH',
  'PSMC5',
  'PSMB1',
  'MNAT1',
  'POLR2D',
  'PSMD1',
  'POLR2H',
  'POLR2F',
  'EPRS',
  'POLR2L',
  'POLR2C'],
 7: ['EP300'],
 19: ['UNC119', 'SETDB1', 'TP53'],
 50: ['FN1'],
 81: ['PIK3R1'],
 85: ['PQLC1', 'SORCS2', 'SORT1', 'RAB6C', 'RAB6B'],
 103: ['SAPS1',
  'ATM',
  'SUPT6H',
  'SMAD4',
  'SART1',
  'PHLPPL',
  'CREBBP',
  'HLA-B',
  'CHEK2',
  'CBS',
  'CSNK1A1L',
  'KIF3A',
  'ACO2',
  'PRPF8',
  'HIST1H2BD',
  'H2BFS',
  'HIST2H2BA',
  'HIST1H2BJ',
  'HIST3H2BB',
  'HDLBP',
  'HIST1H2BB',
  'LOC440926',
  'H3F3B',
  'UPF1',
  'HIST1H2BM',
  'HIST2H2BF',
  'IMPDH1',
  'CCT2',
  'P