In [1]:
# creating results after creating new fxn that utilizes the ratio of the number 
# of components, instead of the number of proteins in a cluster

In [2]:
# importing libraries

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs

In [4]:
# following code is necessary as some functions used in the funcE class produce future warnings but i can't fix it as it is not my code
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

In [6]:
# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [7]:
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist("demo_data/dream3.txt", "demo_data/dream3_other.txt")

In [8]:
qualifying_clusters, qualifying_proteins = find_clusters_and_proteins_together(matrix, clusters, degreelist, cluster_ratio=0, cluster_constant=2,use_sqrt=True, protein_ratio=1, protein_constant=0)

# cluster const = 2 -> must have at least 2 componenets



In [9]:
print(qualifying_proteins)

{7: ['IFNAR2', 'IFNAR1'], 8: ['MAPK1'], 21: ['GNAQ', 'SRC'], 36: ['PRKACA'], 49: ['IL20RA', 'IL6ST'], 65: ['MAPK9', 'MAPK8', 'MAPK14', 'MAPK3', 'MAPK1'], 114: ['AR'], 115: ['TICAM2', 'MAP3K4', 'CASP1', 'CDK1'], 123: ['RAC1', 'MAPK1', 'PRKCA'], 150: ['PAK1'], 151: ['IL20', 'IL24', 'IL22', 'IL2RG', 'JAK3', 'TYK2', 'STAT5A', 'JAK1', 'JAK2', 'STAT3'], 171: ['EFNA2', 'EFNA1', 'SRC'], 176: ['STK4'], 192: ['IRF9', 'WAS', 'NOTCH1'], 206: ['GNAQ'], 235: ['TYROBP'], 238: ['FASLG'], 243: ['ARNTL', 'RXRB'], 248: ['INSR', 'LCK', 'FYN', 'EGFR', 'MAPK3', 'MAPK1', 'SRC'], 250: ['PPP2CA'], 251: ['SRC'], 253: ['FGFR4', 'FGFR3', 'FGFR2', 'FGFR1'], 257: ['FZD4', 'FZD1', 'ROR2', 'LRP5', 'FZD3', 'LRP6'], 258: ['BMP2'], 269: ['PER2', 'PER1'], 272: ['PKD2'], 273: ['TCEB1', 'TYK2', 'KIT', 'JAK1', 'JAK2', 'EGFR'], 275: ['PRKCA'], 281: ['CAMK2A', 'FYN', 'PRKCA', 'SRC'], 293: ['FAS'], 296: ['GNAI3'], 297: ['LHCGR', 'TSHR'], 310: ['PRKACA', 'PRKCA'], 316: ['RHOA', 'CDC42', 'RAC1', 'SRC'], 319: ['IGF1'], 321: ['ERB

In [10]:
# use functions to print info to files: (annoying, but need to get info into format used for funcE package)


# 1) a background protein list to be used for functional enrichment analysis
background_proteins_filepath = "demo_data/background_proteinlist.txt"
print_protein_background_to_file(matrix, background_proteins_filepath) 

# 2) file containing protein to go term
term_mapping_filepath = "demo_data/term_mapping.txt"
create_term_mapping_list("demo_data/dream3_go_results.tsv", term_mapping_filepath)

# 3) not a file, but need to establish a background of all GO terms! (to be used for functional enrichment analysis)
background_go_terms = vocabs.getTerms(['GO'])

ConnectionError: HTTPConnectionPool(host='purl.obolibrary.org', port=80): Max retries exceeded with url: /obo/go.obo (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x178120880>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [None]:
# continue printing info to files to be used in functional enrichment analysis:

# 4) print original clusters to a file
og_query_filepath = 'demo_data/original_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(),og_query_filepath)

# 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
updated_query_filepath = 'demo_data/updated_clusters.txt'
print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), updated_query_filepath, qualifying_proteins)

In [None]:
original_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

original_fe.importFiles({'query': og_query_filepath })
original_fe.run(cluster=False)

In [None]:
# do functional enrichment on updated clusters!!

updated_fe_all_proteins = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)

updated_fe_all_proteins.importFiles({'query': updated_query_filepath })
updated_fe_all_proteins.run(cluster=False)

In [None]:
# print data by the numbers!
print(f"number of clusters that were updated with proteins: {len(qualifying_proteins.keys())}\n")

# clusters that are functionally enriched
print(f"number of functionally enriched clusters (modules) out of {len(qualifying_proteins.keys())} original clusters: {original_fe.enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {updated_fe_all_proteins.enrichment['Module'].nunique()}")

print(f"")


# significant P-values!!!
print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {updated_fe_all_proteins.enrichment['Fishers_pvalue'].count()}")

number of clusters that were updated with proteins: 12

number of functionally enriched clusters (modules) out of 12 original clusters: 155
number of functionally enriched clusters after addition of all proteins: 155

number of significant pvals in original clusters before updating: 405
number of significant pvals in clusters with all proteins added: 411
