In [1]:

# this file is for dream1 data, with MEDIUM clusters generated using the ic_50 parameter (spectral clustering)
# we add proteins back if they connect > Half of all components in the cluster


In [2]:
# importing libraries

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs

In [4]:
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

In [6]:
# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [7]:
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(interactions_filepath="data/interactions/1_ppi_string_v2.txt", clusters_filepath="data/ic_50_medium/dream1_medium-cluster.json")

In [8]:
#qualifying_clusters, qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=True, protein_ratio=.5, protein_constant=0)
qualifying_clusters, qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=False, protein_ratio=0.5, protein_constant=0)


In [9]:
qualifying_proteins

{0: ['PDE4DIP',
  'WDR1',
  'AP2A2',
  'CCNL1',
  'USP34',
  'AP2A1',
  'MAGI1',
  'HECTD3',
  'KIAA0317',
  'PFDN4',
  'ARRB1',
  'SEC13',
  'BCCIP',
  'PRPF4',
  'RNF40'],
 5: ['NLRP4',
  'RIN1',
  'RABGGTA',
  'LRPAP1',
  'RAP1GAP',
  'SAFB2',
  'ZBP1',
  'SCMH1',
  'RCAN2',
  'RCAN3',
  'SCML2',
  'FADD',
  'TGM2',
  'APTX',
  'RCAN1',
  'PHF16',
  'IRF3',
  'PHF15',
  'INHBB',
  'RNMT',
  'PHF17',
  'MLLT4',
  'DCPS',
  'CHURC1-FNTB',
  'ARR3',
  'SMTNL2',
  'CHML',
  'FNTB',
  'CENPF',
  'SPECC1L',
  'SPECC1',
  'SMTNL1',
  'SMTN',
  'MAGIX',
  'CASP8',
  'SNCA',
  'WWC2',
  'CHM',
  'ACVR1',
  'TBK1',
  'HMGCR',
  'VAV2',
  'MED7',
  'GDI1',
  'WWC1',
  'PAPOLB',
  'PAPOLG',
  'UBE3B',
  'UBE3C',
  'STAT2',
  'SPATA13',
  'PDK1',
  'MAGI1',
  'TAF10',
  'CASP3',
  'HECTD3',
  'HECTD1',
  'KIAA0317',
  'PAPOLA',
  'TGFBR1',
  'ARRB1',
  'BTF3L4',
  'PDPK1',
  'TGFB1',
  'CRNKL1',
  'GDI2',
  'MAGI3',
  'DQX1',
  'DHX32',
  'HECTD2',
  'UBR5',
  'EXOSC2',
  'ACAT1',
  'DHX35',
  '

In [10]:
# use functions to print info to files: (annoying, but need to get info into format used for funcE package)


# 1) a background protein list to be used for functional enrichment analysis
background_proteins_filepath = "data/ic_50/background_proteinlist.txt"
print_protein_background_to_file(matrix, background_proteins_filepath) 


FileNotFoundError: [Errno 2] No such file or directory: 'data/ic_50/background_proteinlist.txt'

In [None]:

# 2) file containing protein to go term
term_mapping_filepath = "data/ic_50/term_mapping.txt"
create_term_mapping_list("demo_data/dream3_go_results.tsv", term_mapping_filepath)

# 3) not a file, but need to establish a background of all GO terms! (to be used for functional enrichment analysis)
background_go_terms = vocabs.getTerms(['GO'])

In [None]:
# continue printing info to files to be used in functional enrichment analysis:

# 4) print original clusters to a file
og_query_filepath = 'demo_data/original_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(),og_query_filepath)

# 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
updated_query_filepath = 'demo_data/updated_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), updated_query_filepath, qualifying_proteins)

In [None]:
original_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

original_fe.importFiles({'query': og_query_filepath })
original_fe.run(cluster=False)

In [None]:
# do functional enrichment on updated clusters!!

updated_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

updated_fe_all_proteins.importFiles({'query': updated_query_filepath })
updated_fe_all_proteins.run(cluster=False)

In [None]:
# print data by the numbers!
print(f"number of clusters that were updated with proteins: {len(qualifying_proteins.keys())}\n")

# clusters that are functionally enriched
print(f"number of functionally enriched clusters (modules) out of {len(qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {updated_fe_all_proteins.enrichment['Module'].nunique()}")

print(f"")


# significant P-values!!!
print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {updated_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")

number of clusters that were updated with proteins: 12

number of functionally enriched clusters (modules) out of 12 original clusters: 155
number of functionally enriched clusters after addition of all proteins: 155

number of significant pvals in original clusters before updating: 405
number of significant pvals in clusters with all proteins added: 411
