
# recipe_ic50_small_clusters
- this file is for dream1 data, with small clusters generated using the ic_50 parameter (spectral clustering)
- we consider two different cuttoffs for adding proteins back:
1. Squareroot method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. linear method: a protein must connect at least 0.5 * num components componetns to qualify for addition to a cluster


In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [2]:
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(interactions_filepath="data/interactions/1_ppi_string_v2.txt", clusters_filepath="data/ic_50_small/dream1_small-cluster.json")

In [3]:
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=4,use_sqrt=False, protein_ratio=.5, protein_constant=0)
linear_qualifying_proteins

{8: ['NUDT16',
  'PDE8A',
  'CANT1',
  'PDE6D',
  'QPRT',
  'TYMP',
  'DCTPP1',
  'PDE3A',
  'PDE6H',
  'PDE1C',
  'CDA',
  'PDE1B',
  'ENTPD4',
  'PDE6G',
  'PDE1A',
  'NAMPTL',
  'UPP2',
  'ENTPD6',
  'ENTPD5',
  'UPP1',
  'PNP',
  'NAPRT1',
  'AMPD2',
  'AMPD3',
  'ENTPD8',
  'ENPP3',
  'ENTPD3',
  'ENTPD1',
  'ENPP1',
  'AMPD1',
  'NMNAT3',
  'NMNAT2',
  'DCTD',
  'NUDT12',
  'NMNAT1',
  'AK5',
  'CMPK1',
  'ADK',
  'AK7',
  'HPRT1',
  'AK1',
  'AK8',
  'AK4',
  'ADA',
  'DCK',
  'ADSL',
  'AK2',
  'APRT',
  'ADSSL1',
  'ATIC',
  'ADSS',
  'ITPA',
  'CMPK2'],
 9: ['TGFA', 'MAGI2', 'GRASP', 'VGLL2', 'SFRP1', 'MAGI3', 'PLCG1'],
 13: ['BMI1'],
 58: ['TRIM23'],
 59: ['MCRS1',
  'E2F6',
  'DNMT1',
  'HIRA',
  'ING4',
  'ING2',
  'ING3',
  'ING5',
  'ING1',
  'CFDP1',
  'SUPT4H1',
  'ZNHIT1',
  'YEATS2',
  'TOX4',
  'BAZ1B',
  'BAZ2B',
  'BMI1',
  'DDB1',
  'YEATS4',
  'MLLT3'],
 63: ['HINT1'],
 64: ['PRTFDC1', 'CTU1', 'HPRT1', 'SLMAP'],
 67: ['AGK',
  'TRMT12',
  'PLA2G4F',
  'PLA2G4E',

In [4]:
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=4,use_sqrt=True, protein_ratio=1, protein_constant=0)
sqrt_qualifying_proteins

{0: ['DBN1',
  'GNG10',
  'GNG4',
  'GNG8',
  'GNG5',
  'GNG7',
  'GNG13',
  'GNGT2',
  'GNG11',
  'GNG3',
  'GNG12',
  'CACNA1G',
  'GNB5',
  'GRIN2A',
  'CACNA1B',
  'GRIN2B',
  'GRIN1',
  'CACNA1A',
  'CACNA1D',
  'CACNA1C',
  'CACNA1S',
  'CHUK',
  'IKBKB',
  'CCNT1',
  'GNG2'],
 2: ['ARHGEF2', 'VAV3', 'VAV2', 'VAV1'],
 8: ['NUDT16',
  'PDE8A',
  'CANT1',
  'PDE6D',
  'QPRT',
  'TYMP',
  'DCTPP1',
  'PDE3A',
  'PDE6H',
  'PDE1C',
  'CDA',
  'PDE1B',
  'ENTPD4',
  'PDE6G',
  'PDE1A',
  'NAMPTL',
  'UPP2',
  'ENTPD6',
  'ENTPD5',
  'UPP1',
  'PNP',
  'NAPRT1',
  'AMPD2',
  'AMPD3',
  'ENTPD8',
  'ENPP3',
  'ENTPD3',
  'ENTPD1',
  'ENPP1',
  'AMPD1',
  'NMNAT3',
  'NMNAT2',
  'DCTD',
  'NUDT12',
  'NMNAT1',
  'AK5',
  'CMPK1',
  'ADK',
  'AK7',
  'HPRT1',
  'AK1',
  'AK8',
  'AK4',
  'ADA',
  'DCK',
  'ADSL',
  'AK2',
  'APRT',
  'ADSSL1',
  'ATIC',
  'ADSS',
  'ITPA',
  'CMPK2'],
 9: ['TGFA', 'MAGI2', 'GRASP', 'VGLL2', 'SFRP1', 'MAGI3', 'PLCG1'],
 12: ['CORO6', 'CORO2B', 'CORO2A', 'C

In [11]:
# use functions to print info to files: (annoying, but need to get info into format used for funcE package)
# 1) a background protein list to be used for functional enrichment analysis
background_proteins_filepath = "data/ic_50_small/background_proteinlist.txt"
print_protein_background_to_file(matrix, background_proteins_filepath) 

# 2) file containing protein to go term
term_mapping_filepath = "data/go_labels/dream1_go.tsv"
create_term_mapping_list("data/go_labels/TEST_dream1_go.tsv", term_mapping_filepath)

# 3) not a file, but need to establish a background of all GO terms! (to be used for functional enrichment analysis)
background_go_terms = vocabs.getTerms(['GO'])

# continue printing info to files to be used in functional enrichment analysis:

# 4) print original clusters to a file
og_query_filepath = 'data/ic_50_small/original_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), og_query_filepath)

# 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
sqrt_query_filepath = 'data/ic_50_small/sqrt_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), sqrt_query_filepath, sqrt_qualifying_proteins)

linear_query_filepath = 'data/ic_50_small/linear_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), linear_query_filepath, linear_qualifying_proteins)

In [None]:
original_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

original_fe.importFiles({'query': og_query_filepath })
original_fe.run(cluster=False)

In [None]:
# do functional enrichment on updated clusters!!

updated_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

updated_fe_all_proteins.importFiles({'query': updated_query_filepath })
updated_fe_all_proteins.run(cluster=False)

In [None]:
# print data by the numbers!
print(f"number of clusters that were updated with proteins: {len(qualifying_proteins.keys())}\n")

# clusters that are functionally enriched
print(f"number of functionally enriched clusters (modules) out of {len(qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {updated_fe_all_proteins.enrichment['Module'].nunique()}")

print(f"")


# significant P-values!!!
print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {updated_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")

number of clusters that were updated with proteins: 12

number of functionally enriched clusters (modules) out of 12 original clusters: 155
number of functionally enriched clusters after addition of all proteins: 155

number of significant pvals in original clusters before updating: 405
number of significant pvals in clusters with all proteins added: 411
