In [1]:

# this file is for dream1 data, with SMALL clusters generated using the ic_50 parameter (spectral clustering)
# we add proteins back if they connect > half of all components in the cluster


In [2]:
# importing libraries

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs

In [4]:
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

In [6]:
# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [8]:
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(interactions_filepath="data/interactions/1_ppi_string_v2.txt", clusters_filepath="data/ic_50/dream1_small-cluster.json")

In [9]:
#qualifying_clusters, qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=True, protein_ratio=.5, protein_constant=0)
qualifying_clusters_5, qualifying_proteins_5 = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=False, protein_ratio=.5, protein_constant=0)


In [10]:
qualifying_proteins_5

{1: ['ADD3',
  'ADD2',
  'ADD1',
  'APIP',
  'KYNU',
  'AOX1',
  'COQ3',
  'NT5E',
  'TTR',
  'CNDP1',
  'DPYS',
  'SORD',
  'CNDP2',
  'ADSL',
  'ALDH1L2',
  'ALDH1L1',
  'ALDH1A2',
  'APRT',
  'ALDH2',
  'ALDH1A3',
  'ALDH1B1',
  'ALDH1A1',
  'LDHAL6B'],
 4: ['EPT1',
  'CEPT1',
  'UCK2',
  'ELOVL7',
  'KMO',
  'ELOVL1',
  'ELOVL2',
  'ELOVL4',
  'DHCR7',
  'FASN',
  'ELOVL5',
  'COQ6',
  'LSS',
  'GADL1',
  'AGPAT3',
  'AGPAT4',
  'LCLAT1',
  'SQLE',
  'PEMT',
  'CSAD',
  'AGXT',
  'AGPAT2',
  'TSTA3',
  'SDR42E2',
  'UXS1',
  'SDR42E1',
  'NSDHL',
  'HSD3B7',
  'HSD3B2',
  'HMGCS1',
  'FDPS',
  'HSD3B1',
  'B3GNTL1',
  'IDH3G',
  'ALG5',
  'IDH3B',
  'PIGA',
  'DPM1',
  'IDH3A',
  'PDHB',
  'VAPB',
  'VAPA',
  'ABCB11'],
 6: ['ELK4',
  'ELF1',
  'CEBPB',
  'ID2',
  'ETV6',
  'KSR2',
  'KSR1',
  'BRAF',
  'ARAF',
  'MAPK10',
  'MAPK9'],
 8: ['NUDT16',
  'PDE8A',
  'CANT1',
  'PDE6D',
  'QPRT',
  'TYMP',
  'DCTPP1',
  'PDE3A',
  'PDE6H',
  'PDE1C',
  'CDA',
  'PDE1B',
  'ENTPD4',
  'P

In [11]:
# use functions to print info to files: (annoying, but need to get info into format used for funcE package)


# 1) a background protein list to be used for functional enrichment analysis
background_proteins_filepath = "data/ic_50/background_proteinlist.txt"
print_protein_background_to_file(matrix, background_proteins_filepath) 


In [None]:

# 2) file containing protein to go term
term_mapping_filepath = "data/ic_50/term_mapping.txt"
create_term_mapping_list("demo_data/dream3_go_results.tsv", term_mapping_filepath)

# 3) not a file, but need to establish a background of all GO terms! (to be used for functional enrichment analysis)
background_go_terms = vocabs.getTerms(['GO'])

In [None]:
# continue printing info to files to be used in functional enrichment analysis:

# 4) print original clusters to a file
og_query_filepath = 'demo_data/original_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(),og_query_filepath)

# 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
updated_query_filepath = 'demo_data/updated_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), updated_query_filepath, qualifying_proteins)

In [None]:
original_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

original_fe.importFiles({'query': og_query_filepath })
original_fe.run(cluster=False)

In [None]:
# do functional enrichment on updated clusters!!

updated_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

updated_fe_all_proteins.importFiles({'query': updated_query_filepath })
updated_fe_all_proteins.run(cluster=False)

In [None]:
# print data by the numbers!
print(f"number of clusters that were updated with proteins: {len(qualifying_proteins.keys())}\n")

# clusters that are functionally enriched
print(f"number of functionally enriched clusters (modules) out of {len(qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {updated_fe_all_proteins.enrichment['Module'].nunique()}")

print(f"")


# significant P-values!!!
print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {updated_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")

number of clusters that were updated with proteins: 12

number of functionally enriched clusters (modules) out of 12 original clusters: 155
number of functionally enriched clusters after addition of all proteins: 155

number of significant pvals in original clusters before updating: 405
number of significant pvals in clusters with all proteins added: 411
