
# d2_ic10_medium
- this file is for dream2 data, with medium clusters generated using the ic_10 parameter (spectral clustering)
- we consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster


In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # class that performs functional enrichment analysis
import func_e.vocabs.all as vocabs
# Some functions used inside the FUNC_E class emit FutureWarnings. That code is third-party
# and cannot be fixed from here, so the warnings are silenced instead.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **must change filenames:**

In [2]:
# Input locations for this run; edit both when pointing the notebook at another dataset.
# Cluster assignments (JSON) for the dream2 medium clusters.
clusters_file = "data/d2_ic10_medium/dream2_medium-cluster.json"
# Interactions file for the dream2 network (presumably the PPI edge list -- confirm format).
interactions_file = "data/interactions/2_ppi_inweb_v2.txt"

In [3]:
# Load the three objects the rest of the notebook works with from the two
# input files (presumably a ProteinMatrix, an AllClusters and a DegreeList,
# going by the helper's name -- confirm against recipe_utils).
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath=interactions_file,
    clusters_filepath=clusters_file,
)

*may want to change parameters below:*

In [4]:
# Linear cutoff: use_sqrt is off and protein_ratio is 0.5, matching the
# "0.5 * num components" rule described at the top of this notebook.
# NOTE(review): the exact meaning of the cluster_* / *_constant arguments is
# defined in recipe_utils and is not visible here -- confirm before tuning.
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.5,
    protein_constant=0,
)

In [5]:
# Square-root cutoff: use_sqrt is on and protein_ratio is 1, matching the
# "sqrt(num components)" rule described at the top of this notebook.
# NOTE(review): the exact meaning of the cluster_* / *_constant arguments is
# defined in recipe_utils and is not visible here -- confirm before tuning.
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)

In [6]:
# Display the proteins selected under the square-root cutoff. The output is a
# dict of integer keys (presumably cluster numbers) to lists of protein names.
sqrt_qualifying_proteins

{2: ['LASP1',
  'THOC3',
  'ICK',
  'ASH2L',
  'VASP',
  'EXOSC9',
  'ENAH',
  'PSEN1',
  'GRPEL1',
  'PAK1IP1',
  'ZNF326',
  'DDX47',
  'SIP1',
  'ELAVL1',
  'ACACA',
  'CORO6',
  'EIF2C1',
  'THOC2',
  'ATXN1',
  'STK38L',
  'EIF2C2',
  'SART3',
  'HIPK3',
  'DHX36',
  'C3orf26',
  'PDHB',
  'TRIP12',
  'DDOST',
  'BRCA1',
  'GEMIN4',
  'CD3EAP',
  'RPL7P9',
  'SRPK2',
  'ILF2',
  'PCBP2',
  'RAB1A',
  'ARHGAP32',
  'DDX50',
  'DGKG',
  'DMD',
  'DHRS2',
  'TEX10',
  'BRSK1',
  'DHX30',
  'EBNA1BP2',
  'ATP6V0D1',
  'S100A9',
  'RPL29',
  'ULK3',
  'PRPF3',
  'HIST1H1C',
  'DICER1',
  'RPL37A',
  'HNRNPL',
  'ACTN2',
  'CSNK1G3',
  'HSPA6',
  'ACTN1',
  'GTF3C4',
  'SLC25A6',
  'PHB2',
  'PHB',
  'KIF3A',
  'PPM1B',
  'BMP2K',
  'H1FX',
  'GTF3C1',
  'TUBA1C',
  'POLR1E',
  'TUBA3D',
  'TUBA3C',
  'FASN',
  'MAP3K1',
  'HDLBP',
  'UPF1',
  'EIF4A3',
  'DDX39B',
  'BAT1',
  'RPL28',
  'EEF1A2',
  'DDX20',
  'RSL1D1',
  'EPRS',
  'COPA',
  'SLC25A5',
  'RPS26P8',
  'RPS26P10',
  'RBM1

In [7]:
# Display the proteins selected under the linear cutoff. The output is a
# dict of integer keys (presumably cluster numbers) to lists of protein names.
linear_qualifying_proteins

{2: ['LASP1',
  'THOC3',
  'ICK',
  'ASH2L',
  'VASP',
  'EXOSC9',
  'ENAH',
  'PSEN1',
  'GRPEL1',
  'PAK1IP1',
  'ZNF326',
  'DDX47',
  'SIP1',
  'ELAVL1',
  'ACACA',
  'CORO6',
  'EIF2C1',
  'THOC2',
  'ATXN1',
  'STK38L',
  'EIF2C2',
  'SART3',
  'HIPK3',
  'DHX36',
  'C3orf26',
  'PDHB',
  'TRIP12',
  'DDOST',
  'BRCA1',
  'GEMIN4',
  'CD3EAP',
  'RPL7P9',
  'SRPK2',
  'ILF2',
  'PCBP2',
  'RAB1A',
  'ARHGAP32',
  'DDX50',
  'DGKG',
  'DMD',
  'DHRS2',
  'TEX10',
  'BRSK1',
  'DHX30',
  'EBNA1BP2',
  'ATP6V0D1',
  'S100A9',
  'RPL29',
  'ULK3',
  'PRPF3',
  'HIST1H1C',
  'DICER1',
  'RPL37A',
  'HNRNPL',
  'ACTN2',
  'CSNK1G3',
  'HSPA6',
  'ACTN1',
  'GTF3C4',
  'SLC25A6',
  'PHB2',
  'PHB',
  'KIF3A',
  'PPM1B',
  'BMP2K',
  'H1FX',
  'GTF3C1',
  'TUBA1C',
  'POLR1E',
  'TUBA3D',
  'TUBA3C',
  'FASN',
  'MAP3K1',
  'HDLBP',
  'UPF1',
  'EIF4A3',
  'DDX39B',
  'BAT1',
  'RPL28',
  'EEF1A2',
  'DDX20',
  'RSL1D1',
  'EPRS',
  'COPA',
  'SLC25A5',
  'RPS26P8',
  'RPS26P10',
  'RBM1