d3_ic50_small

In [1]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # class that performs functional enrichment analysis
import func_e.vocabs.all as vocabs
# Silence FutureWarnings: some functions used by the func_e class emit them, and they
# cannot be fixed here because that code belongs to the third-party package.
# NOTE: this filter has no module restriction, so it hides every FutureWarning in the notebook.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

 **must change filenames:**

In [2]:
# Input files for this run -- edit both when switching to another dataset.
# They are handed to initialize_matrix_clusters_degreelist as
# interactions_filepath and clusters_filepath respectively.
interactions_file = "data/interactions/3_signal_omnipath_directed_v3.txt"
clusters_file = "data/d3_ic50_small/d3_ic50_small.json"

In [3]:
# Build the three objects every later cell works with (interaction matrix,
# cluster collection, degree list) from the two input files configured above.
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath=interactions_file,
    clusters_filepath=clusters_file,
)

*may want to change parameters below:*

In [4]:
# Select the qualifying clusters and, per cluster, the proteins that qualify to
# be added to it. This run uses use_sqrt=True; the exact meaning of the
# ratio/constant thresholds is defined in recipe_utils.
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)

In [5]:
# Display the selection: a dict mapping cluster label -> list of proteins to add.
sqrt_qualifying_proteins

{30: ['CDK1'],
 40: ['PRKCA'],
 43: ['PRKCA'],
 45: ['PRKCA'],
 54: ['PRKACA'],
 64: ['PRKACA'],
 76: ['CDK1'],
 79: ['PRKACA'],
 89: ['CSNK2A1'],
 91: ['SRC'],
 103: ['MAPK1'],
 104: ['SRC'],
 140: ['CSNK2A1'],
 142: ['CDK1'],
 156: ['PRKACA']}

In [6]:
# from recipe_utils import top_n_proteins

# three_qualifying_proteins = top_n_proteins(sqrt_qualifying_proteins, n=3) 
# three_qualifying_proteins

In [7]:
# from recipe_utils import get_cluster_connectivity

# original_cluster_connectivity = get_cluster_connectivity(matrix, degreelist, clusters, {}, sort_it=False)
# new_cluster_connectivity = get_cluster_connectivity(matrix, degreelist, clusters, added_proteins=sqrt_qualifying_proteins, sort_it=False)

# from recipe_utils import calculate_connectivity_difference

# difference = calculate_connectivity_difference(original_cluster_connectivity, new_cluster_connectivity, sort_it=True)
# difference

### Functional Enrichment Analysis

In [8]:
# create_term_mapping_list("data/go_labels/dream3_go.tsv", "data/go_labels/d3_term_mapping.tsv")

**will need to change the directory below**

In [9]:
# Output directory for the files written by this notebook; the query and
# background file paths below are built by appending a file name to it,
# so it must keep its trailing slash.
directory = "data/d3_ic50_small/"
# GO term mapping file passed to get_initialized_fe for every enrichment run.
term_mapping_filepath = "data/go_labels/d3_term_mapping.tsv"

In [10]:
# Files exchanged with the func_e package, all placed inside `directory`.
background_proteins_filepath = f"{directory}background_proteinlist.txt"  # background protein list
og_query_filepath = f"{directory}original_clusters.txt"  # clusters as loaded
sqrt_query_filepath = f"{directory}sqrt_clusters.txt"  # clusters plus sqrt-qualifying proteins
# three_query_filepath = directory + "three_prot_clusters.txt"

*print to files*

In [11]:
# Write the inputs for functional enrichment to disk: the func_e package
# consumes them as files, so the in-memory objects have to be dumped first.

# 1) background protein list used by the enrichment analysis
print_protein_background_to_file(
    matrix,
    background_proteins_filepath,
)

# 2) not a file: the background of all GO terms used by the enrichment analysis
background_go_terms = vocabs.getTerms(['GO'])

# 3) query list for the original clusters
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    og_query_filepath,
)

# 4) query list for the updated clusters: same call, with the dictionary of
#    qualifying proteins passed as the extra argument
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    sqrt_query_filepath,
    sqrt_qualifying_proteins,
)
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), three_query_filepath, three_qualifying_proteins)

*RUN Functional Enrichment:*

In [12]:
# Baseline: enrichment on the clusters exactly as loaded.
original_fe = get_initialized_fe(
    background_proteins_filepath,
    term_mapping_filepath,
    termlist=background_go_terms,
)
original_fe.importFiles({'query': og_query_filepath})
original_fe.run(cluster=False)

# Comparison: enrichment on the clusters after adding the sqrt-qualifying proteins.
sqrt_fe = get_initialized_fe(
    background_proteins_filepath,
    term_mapping_filepath,
    termlist=background_go_terms,
)
sqrt_fe.importFiles({'query': sqrt_query_filepath})
sqrt_fe.run(cluster=False)

# three_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms)
# three_fe.importFiles({'query': three_query_filepath })
# three_fe.run(cluster=False)

In [13]:
# Summary of the sqrt run, by the numbers.
print("::::::::::::::::::::::::::SQRT::::::::::::::::::::::::::\n")

# How many clusters received at least one added protein.
print(f"number of clusters that were updated with proteins: {len(sqrt_qualifying_proteins)}\n")

# Enriched clusters = distinct values in the 'Module' column of each enrichment table.
print(f"number of functionally enriched clusters (modules) out of {clusters.get_num_clusters()} original clusters: {original_fe.enrichment['Module'].nunique()}")
print(f"number of functionally enriched clusters after addition of all proteins: {sqrt_fe.enrichment['Module'].nunique()}")
print()

# Counts the non-null entries of 'Fishers_pvalue' in each enrichment table.
# NOTE(review): reported as "significant" on the assumption that func_e already
# filtered the table by its p-value cutoff -- confirm.
print(f"number of significant pvals in original clusters before updating: {original_fe.enrichment['Fishers_pvalue'].count()}")
print(f"number of significant pvals in clusters with all proteins added: {sqrt_fe.enrichment['Fishers_pvalue'].count()}")

print()
print()

::::::::::::::::::::::::::SQRT::::::::::::::::::::::::::

number of clusters that were updated with proteins: 15

number of functionally enriched clusters (modules) out of 164 original clusters: 152
number of functionally enriched clusters after addition of all proteins: 153

number of significant pvals in original clusters before updating: 496
number of significant pvals in clusters with all proteins added: 492




In [14]:
# top_n_proteins is imported here rather than in the first cell; the later
# "three_tenth" cell relies on this import as well.
from recipe_utils import top_n_proteins

# Reduce the sqrt selection with top_n_proteins(n=3)
# (presumably keeps at most 3 proteins per cluster -- confirm in recipe_utils).
three_sqrt_qualifying_proteins = top_n_proteins(sqrt_qualifying_proteins, n=3)

# Write the query list for the reduced selection and run enrichment on it.
three_sqrt_query_filepath = directory + "three_prot_clusters.txt"
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    three_sqrt_query_filepath,
    three_sqrt_qualifying_proteins,
)
sqrt_three_fe = get_initialized_fe(
    background_proteins_filepath,
    term_mapping_filepath,
    termlist=background_go_terms,
)
sqrt_three_fe.importFiles({'query': three_sqrt_query_filepath})
sqrt_three_fe.run(cluster=False)

print(f"number of functionally enriched clusters after addition of 3 proteins: {sqrt_three_fe.enrichment['Module'].nunique()}")


number of functionally enriched clusters after addition of 3 proteins: 153


In [15]:
# Same selection as the sqrt run, but with use_sqrt=False and a fixed
# protein_ratio instead: 0.50, 0.25 and 0.10. All other thresholds are unchanged.
half_qualifying_clusters, half_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.50,
    protein_constant=0,
)
quarter_qualifying_clusters, quarter_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.25,
    protein_constant=0,
)
tenth_qualifying_clusters, tenth_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.10,
    protein_constant=0,
)



In [16]:
def _enrich_with_added_proteins(query_filepath, added_proteins):
    """Write the clusters plus `added_proteins` to a query file and run enrichment on it.

    Replaces the write / initialise / import / run sequence that was previously
    copy-pasted once per protein ratio.

    Reads the notebook-level names `clusters`, `background_proteins_filepath`,
    `term_mapping_filepath` and `background_go_terms`.

    Parameters:
        query_filepath: path the query list is written to and then imported from.
        added_proteins: dict of cluster label -> proteins to add, as returned by
            find_clusters_and_proteins_together (may be empty).

    Returns:
        The object returned by get_initialized_fe, after importFiles() and
        run(cluster=False) have been called on it.
    """
    print_querylist_of_clusters_to_file(
        clusters,
        clusters.get_all_cluster_labels(),
        query_filepath,
        added_proteins,
    )
    fe = get_initialized_fe(
        background_proteins_filepath,
        term_mapping_filepath,
        termlist=background_go_terms,
    )
    fe.importFiles({'query': query_filepath})
    fe.run(cluster=False)
    return fe


# protein_ratio = 1/2
half_query_filepath = directory + "half_clusters.txt"
half_fe = _enrich_with_added_proteins(half_query_filepath, half_qualifying_proteins)
print(f"number of functionally enriched clusters after addition of 1/2 proteins: {half_fe.enrichment['Module'].nunique()}")

# protein_ratio = 1/4
quarter_query_filepath = directory + "quarter_clusters.txt"
quarter_fe = _enrich_with_added_proteins(quarter_query_filepath, quarter_qualifying_proteins)
print(f"number of functionally enriched clusters after addition of 1/4 proteins: {quarter_fe.enrichment['Module'].nunique()}")

# protein_ratio = 1/10
tenth_query_filepath = directory + "tenth_clusters.txt"
tenth_fe = _enrich_with_added_proteins(tenth_query_filepath, tenth_qualifying_proteins)
print(f"number of functionally enriched clusters after addition of 1/10 proteins: {tenth_fe.enrichment['Module'].nunique()}")

# Dump the selections themselves so the added proteins can be inspected per ratio.
print(half_qualifying_proteins)
print(quarter_qualifying_proteins)
print(tenth_qualifying_proteins)


number of functionally enriched clusters after addition of 1/2 proteins: 152
number of functionally enriched clusters after addition of 1/4 proteins: 153
number of functionally enriched clusters after addition of 1/10 proteins: 156
{}
{89: ['CSNK2A1'], 91: ['SRC'], 104: ['SRC'], 140: ['CSNK2A1'], 156: ['PRKACA']}
{1: ['ATM', 'CSNK2A1', 'PRKCA', 'SRC'], 2: ['TYK2', 'IRAK1', 'JAK1', 'PRKCA'], 3: ['PPP2CB', 'CSNK2A1', 'CDK1'], 5: ['CSNK2A1', 'CDK1', 'PRKACA'], 6: ['CDK16', 'NR3C1', 'CSNK1A1', 'PRKACA'], 7: ['MAPK3'], 8: ['SMAD2', 'SMAD3', 'CDK2', 'GSK3B', 'PRKACA', 'PRKCA', 'SRC'], 9: ['AKT1', 'SRC'], 11: ['CHEK2', 'MYC', 'PRKCD', 'CASP3', 'CDK2', 'EGFR', 'SRC'], 12: ['EP300'], 13: ['PPP2CA', 'CASP3'], 14: ['MAPK3'], 16: ['MAPK1'], 18: ['CDK2', 'SRC'], 19: ['CBL', 'PTPN11'], 20: ['MAPK9', 'MAPK14', 'PRKACA'], 21: ['SRC'], 22: ['RAC1', 'PRKACA'], 23: ['AKT1', 'MAPK1', 'PRKACA', 'PRKCA'], 24: ['PRKCA'], 25: ['CSNK2A1', 'MAPK1'], 26: ['MAPK1'], 27: ['CASP3', 'AKT1', 'SRC'], 28: ['F2', 'CDC25

In [17]:
# Reduce the 1/10-ratio selection with top_n_proteins(n=3)
# (presumably keeps at most 3 proteins per cluster -- confirm in recipe_utils).
three_tenth_qualifying_proteins = top_n_proteins(tenth_qualifying_proteins, n=3)

# Write the query list for the reduced selection and run enrichment on it.
three_tenth_query_filepath = directory + "three_tenth_clusters.txt"
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    three_tenth_query_filepath,
    three_tenth_qualifying_proteins,
)

# NOTE: despite the "half_" prefix, this variable holds the top-3-of-1/10 run;
# the name is kept so anything referring to it keeps working.
half_tenth_fe = get_initialized_fe(
    background_proteins_filepath,
    term_mapping_filepath,
    termlist=background_go_terms,
)
half_tenth_fe.importFiles({'query': three_tenth_query_filepath})
half_tenth_fe.run(cluster=False)

print(f"number of functionally enriched clusters after addition of 3 1/10 proteins: {half_tenth_fe.enrichment['Module'].nunique()}")


# three_half_qualifying_proteins = top_n_proteins(half_qualifying_proteins, n=3)
# three_half_query_filepath = directory + "three_half_clusters.txt"
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), three_half_query_filepath, three_half_qualifying_proteins)
# half_three_fe = get_initialized_fe(background_proteins_filepath, term_mapping_filepath, termlist = background_go_terms) 
# half_three_fe.importFiles({'query': three_half_query_filepath })
# half_three_fe.run(cluster=False)
# print(f"number of functionally enriched clusters after addition of 3 1/2 proteins: {half_three_fe.enrichment['Module'].nunique()}")



number of functionally enriched clusters after addition of 3 1/10 proteins: 153
