
# d1_ic10_medium
- this file is for dream1 data, with medium clusters generated using the ic_10 parameter (spectral clustering)
- we consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster


In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# the following filter is necessary because some functions used inside the FUNC_E class emit FutureWarnings; they come from third-party code, so they cannot be fixed here and are silenced instead
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **must change filenames:**

In [2]:
# path to the interactions file; passed below as `interactions_filepath`
interactions_file = "data/interactions/1_ppi_string_v2.txt"
# path to the clusters JSON file for this dataset; passed below as `clusters_filepath`
clusters_file = "data/d1_ic10_medium/dream1_medium-cluster.json"

In [3]:
# load the three working objects (matrix, clusters, degreelist) from the two input files chosen above
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath=interactions_file,
    clusters_filepath=clusters_file,
)

*may want to change parameters below:*

In [4]:
# linear cutoff: use_sqrt is off and protein_ratio is 0.5
# (a protein must connect at least 0.5 * num components to qualify)
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.5,
    protein_constant=0,
)

In [5]:
# square-root cutoff: use_sqrt is on and protein_ratio is 1
# (a protein must connect at least sqrt(num components) components to qualify)
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)

In [6]:
# display the qualifying proteins found with the square-root cutoff (dict: cluster number -> list of protein names)
sqrt_qualifying_proteins

{3: ['STAT5A', 'CBL'],
 4: ['KDSR',
  'GCAT',
  'TSTA3',
  'SDR42E2',
  'UXS1',
  'ALAS2',
  'SDR42E1',
  'GLDC',
  'NSDHL',
  'HSD3B7',
  'HSD3B2',
  'HSD3B1',
  'ALAS1',
  'SCD5'],
 5: ['SH3KBP1', 'TCF3'],
 6: ['ACADM', 'ACSM1', 'EHHADH', 'HADHA'],
 10: ['SLU7', 'POLR3K'],
 13: ['PTK2'],
 23: ['KAZN',
  'SYT4',
  'PPFIA4',
  'GAB3',
  'PPFIA2',
  'PPFIA3',
  'GAB4',
  'PPFIA1',
  'SAV1',
  'CDH2',
  'GAB2',
  'NR0B2',
  'GAB1',
  'SYT1',
  'MAP3K4',
  'LPP',
  'GRAPL',
  'PTPRM',
  'NCK2',
  'GRAP',
  'PDK3',
  'PDK4',
  'PDK2',
  'GRAP2',
  'BCKDK',
  'PTPN13',
  'PDK1',
  'TRIP6',
  'PTPRE',
  'PTPRA',
  'SPTBN4',
  'NCK1',
  'MAPK10'],
 33: ['SLC35E2B',
  'TAZ',
  'WDR26',
  'CLGN',
  'PTPLB',
  'STOML3',
  'TUSC3',
  'MAGT1',
  'STOML2',
  'DNAJC4',
  'DNAJC5G',
  'UGCG',
  'DNAJC11',
  'ZMPSTE24',
  'DNAJC5B',
  'DNAJC7',
  'DNAJC30',
  'RPN1',
  'RSPO4',
  'DNAJC24',
  'DNAJC16',
  'DNAJC22',
  'DNAJC3',
  'DNAJC5',
  'HAS3',
  'DNAJB8',
  'RAB20',
  'RSPO3',
  'RSPO1',
  'DNAJ

In [7]:
# display the qualifying proteins found with the linear cutoff (dict: cluster number -> list of protein names)
linear_qualifying_proteins

{4: ['KDSR',
  'GCAT',
  'TSTA3',
  'SDR42E2',
  'UXS1',
  'ALAS2',
  'SDR42E1',
  'GLDC',
  'NSDHL',
  'HSD3B7',
  'HSD3B2',
  'HSD3B1',
  'ALAS1',
  'SCD5'],
 23: ['KAZN',
  'SYT4',
  'PPFIA4',
  'GAB3',
  'PPFIA2',
  'PPFIA3',
  'GAB4',
  'PPFIA1',
  'SAV1',
  'CDH2',
  'GAB2',
  'NR0B2',
  'GAB1',
  'SYT1',
  'MAP3K4',
  'LPP',
  'GRAPL',
  'PTPRM',
  'NCK2',
  'GRAP',
  'PDK3',
  'PDK4',
  'PDK2',
  'GRAP2',
  'BCKDK',
  'PTPN13',
  'PDK1',
  'TRIP6',
  'PTPRE',
  'PTPRA',
  'SPTBN4',
  'NCK1',
  'MAPK10'],
 33: ['SLC35E2B',
  'TAZ',
  'WDR26',
  'CLGN',
  'PTPLB',
  'STOML3',
  'TUSC3',
  'MAGT1',
  'STOML2',
  'DNAJC4',
  'DNAJC5G',
  'UGCG',
  'DNAJC11',
  'ZMPSTE24',
  'DNAJC5B',
  'DNAJC7',
  'DNAJC30',
  'RPN1',
  'RSPO4',
  'DNAJC24',
  'DNAJC16',
  'DNAJC22',
  'DNAJC3',
  'DNAJC5',
  'HAS3',
  'DNAJB8',
  'RAB20',
  'RSPO3',
  'RSPO1',
  'DNAJC18',
  'HAS2',
  'DNAJB14',
  'DNAJB7',
  'DNAJC25',
  'HAS1',
  'DNAJB12',
  'DNAJB2',
  'PFDN4',
  'DNAJC25-GNG10',
  'DNAJB9',
