
# d3_ic50_small
- this file is for dream3 data, with small clusters generated using the ic_50 parameter (spectral clustering)
- we consider two different cutoffs for adding proteins back:
1. Square-root method: a protein must connect at least sqrt(num components) components to qualify for addition to a cluster
2. Linear method: a protein must connect at least 0.5 * num components components to qualify for addition to a cluster


In [5]:
# imports:

# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# library used for functional enrichment analysis
from func_e.FUNC_E import FUNC_E  # a method that can do funtional enrichment analysis!
import func_e.vocabs.all as vocabs
# The next two lines silence FutureWarnings: some functions used by the funcE class emit them, and they cannot be fixed here because that code is third-party
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# importing the custom classes i built
from matrix_class import ProteinMatrix # ppi matrix 
from cluster_class import AllClusters # dictionary to hold all clusters (in form number of cluster : list of proteins in that cluster)
from degreelist_class import DegreeList # creates a list of all proteins in order of their degree

# helper functions for setting up program
from recipe_utils import initialize_matrix_clusters_degreelist
from recipe_utils import find_clusters_and_proteins_together

# helper functions for functional enrichment
from recipe_utils import print_querylist_of_clusters_to_file
from recipe_utils import print_protein_background_to_file
from recipe_utils import create_term_mapping_list
from recipe_utils import get_initialized_fe

In [6]:
# Set up the three objects every later cell works with, from the
# interaction list and the dream3 small-cluster JSON file.
matrix, clusters, degreelist = initialize_matrix_clusters_degreelist(
    interactions_filepath="data/interactions/3_signal_omnipath_directed_v3.txt",
    clusters_filepath="data/d3_ic50_small/dream3_small-cluster.json",
)

In [8]:
# Linear cutoff (method 2 in the notebook header): use_sqrt is off and the
# protein ratio is 0.5.
linear_qualifying_clusters, linear_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=False,
    protein_ratio=0.5,
    protein_constant=0,
)
# Bare expression so the notebook displays the cluster -> proteins mapping.
linear_qualifying_proteins

{0: ['CDK2'],
 1: ['PRKCA'],
 2: ['NGFR',
  'NFKBIB',
  'TMEM173',
  'TICAM1',
  'TBK1',
  'IRAK1',
  'MAVS',
  'NFKBIA',
  'CHUK',
  'RELA',
  'IKBKB',
  'TRAF6',
  'MAPK8',
  'TP53'],
 3: ['AR'],
 5: ['TP53'],
 6: ['TP53', 'MAPK3', 'CDK1', 'SRC'],
 7: ['PRKACA'],
 8: ['CSK',
  'PTPN11',
  'NFE2L2',
  'CREBBP',
  'LYN',
  'GRB2',
  'FYN',
  'MAPK1',
  'SRC'],
 10: ['HIST1H3B'],
 12: ['ATM'],
 13: ['SRC'],
 14: ['ABL1'],
 15: ['CSNK2A1'],
 16: ['AKT1'],
 17: ['EGFR'],
 18: ['CDK1'],
 19: ['SRC'],
 20: ['TP53'],
 21: ['SRC'],
 22: ['AURKB'],
 29: ['CBL'],
 30: ['STK4'],
 31: ['GSK3B'],
 32: ['SP1'],
 33: ['LCK', 'FYN'],
 34: ['ATM'],
 35: ['PRKACA', 'PRKCA'],
 36: ['LRP5', 'FZD3', 'LRP6'],
 37: ['CSNK2A1'],
 38: ['PLK1'],
 39: ['MAPK1'],
 46: ['CSNK2A1'],
 47: ['AKT1', 'PRKCA'],
 48: ['PRKACA'],
 49: ['PRKCA', 'SRC'],
 50: ['ESR1'],
 51: ['PRKCD'],
 52: ['ABL1'],
 67: ['PRKACA'],
 68: ['SRC'],
 69: ['PRKCA'],
 70: ['E2F1', 'AR', 'CDK2', 'CSNK2A1', 'CDK1'],
 71: ['RIPK3',
  'AZI2',
  'TN

In [9]:
# Square-root cutoff (method 1 in the notebook header): use_sqrt is on and
# the protein ratio is 1.
sqrt_qualifying_clusters, sqrt_qualifying_proteins = find_clusters_and_proteins_together(
    matrix,
    clusters,
    degreelist,
    cluster_ratio=0,
    cluster_constant=2,
    use_sqrt=True,
    protein_ratio=1,
    protein_constant=0,
)
# Bare expression so the notebook displays the cluster -> proteins mapping.
sqrt_qualifying_proteins

{0: ['CDK2'],
 1: ['PRKCA'],
 2: ['NGFR',
  'NFKBIB',
  'TMEM173',
  'TICAM1',
  'TBK1',
  'IRAK1',
  'MAVS',
  'NFKBIA',
  'CHUK',
  'RELA',
  'IKBKB',
  'TRAF6',
  'MAPK8',
  'TP53'],
 3: ['AR'],
 4: ['MAP2K4', 'MAPK14', 'GSK3B'],
 5: ['TP53'],
 6: ['CHEK2',
  'MDM2',
  'JUN',
  'FYN',
  'ABL1',
  'ATM',
  'CDK2',
  'MAPK14',
  'TP53',
  'MAPK3',
  'AKT1',
  'CSNK2A1',
  'MAPK1',
  'CDK1',
  'SRC'],
 7: ['PRKACA'],
 8: ['CSK',
  'PTPN11',
  'NFE2L2',
  'CREBBP',
  'LYN',
  'GRB2',
  'FYN',
  'MAPK1',
  'SRC'],
 9: ['ATR'],
 10: ['HIST1H3B'],
 11: ['CASP8', 'CASP3'],
 12: ['ATR', 'ATM', 'CDK2', 'CSNK2A1'],
 13: ['SRC'],
 14: ['ABL1'],
 15: ['CSNK2A1'],
 16: ['AKT1'],
 17: ['EGFR'],
 18: ['CDK1'],
 19: ['SRC'],
 20: ['PCNA', 'E2F1', 'CDK2', 'TP53', 'CDK1'],
 21: ['SRC'],
 22: ['AURKB'],
 29: ['CBL', 'EGFR'],
 30: ['STK4'],
 31: ['GSK3B'],
 32: ['SP1'],
 33: ['ZAP70', 'SYK', 'LCK', 'FYN'],
 34: ['ATM'],
 35: ['PRKACA', 'PRKCA'],
 36: ['WIF1', 'FZD6', 'ROR2', 'LRP5', 'FZD3', 'LRP6'],
 37

In [13]:
# The funcE package takes its inputs as files, so the data has to be
# written to disk before functional enrichment can run.

# Output locations:
#   1) background protein list used for functional enrichment analysis
#   2) protein-to-GO-term mapping
background_proteins_filepath = "data/d3_ic50_small/background_proteinlist.txt"
term_mapping_filepath = "data/go_labels/dream3_term_mapping.txt"

# 1) write the background protein list from the matrix
print_protein_background_to_file(matrix, background_proteins_filepath)

# 2) write the term mapping file from the dream3 GO label table
create_term_mapping_list("data/go_labels/dream3_go.tsv", term_mapping_filepath)

# # 3) not a file, but need to establish a background of all GO terms! (to be used for functional enrichment analysis)
# background_go_terms = vocabs.getTerms(['GO'])

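# # continue printing info to files to be used in functional enrichment analysis:
# # NOTE(review): steps 3-5 in this cell are commented out, and their paths point at
# # data/ic_50_medium rather than data/d3_ic50_small -- confirm before re-enabling.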

# # 4) print original clusters to a file
# og_query_filepath = 'data/ic_50_medium/original_clusters.txt'
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), og_query_filepath)

# # 5) print updated clusters to a file (just append the function to include the dictionary of qualifying proteins)
# sqrt_query_filepath = 'data/ic_50_medium/sqrt_clusters.txt'
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), sqrt_query_filepath, sqrt_qualifying_proteins)

# linear_query_filepath = 'data/ic_50_medium/linear_clusters.txt'
# print_querylist_of_clusters_to_file(clusters, clusters.get_all_cluster_labels(), linear_query_filepath, linear_qualifying_proteins)

In [11]:
# 5) write the updated clusters to query files: the same call as for the
#    original clusters, with the dictionary of qualifying proteins passed
#    as an extra argument
sqrt_query_filepath = 'data/d3_ic50_small/sqrt_clusters.txt'
linear_query_filepath = 'data/d3_ic50_small/linear_clusters.txt'

# clusters extended with the square-root-cutoff proteins
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    sqrt_query_filepath,
    sqrt_qualifying_proteins,
)

# clusters extended with the linear-cutoff proteins
print_querylist_of_clusters_to_file(
    clusters,
    clusters.get_all_cluster_labels(),
    linear_query_filepath,
    linear_qualifying_proteins,
)