# MSstats Integration with INDRA Demo

This Python notebook serves as an example of how users can bring their own datasets and integrate them with INDRA to obtain a meaningful interpretation.  We start by loading an MSstats dataset that consists of a list of proteins alongside their p-values, logFCs, and abundances.  We then use p-values, logFCs, and correlations between proteins to filter which genes we query INDRA with.

## STEP 1: Import MSstats Dataset

First, we will import MSstats datasets as pandas dataframes.  These MSstats datasets consist of the outputs of the dataProcess and groupComparison functions.  The dataProcess output consists of protein abundances, while the groupComparison output consists of protein p-values and logFCs.

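We filter the datasets to smaller sizes: the groupComparison output is filtered by adjusted p-value and comparison label, and the dataProcess output is filtered by experimental group.  (A logFC filter is applied later, in the ad hoc visualization section.)  You can adjust those parameters as well.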

In [1]:
import pandas as pd

P_VALUE_LOGFC_PATH = "groupComparisonOutput.csv" # Set this path yourself
LABELS_FILTER = ["DMSO-DbET6"]
P_VALUE_FILTER = 0.05 # Adjust this yourself

def construct_pvalue_logfc_df(filename):
    """Return the filtered groupComparison results read from a CSV file.

    A row is kept only when all of the following hold:

    * its ``issue`` column is empty,
    * its ``adj.pvalue`` is below ``P_VALUE_FILTER``,
    * its ``Label`` is listed in ``LABELS_FILTER``.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original row index kept),
        so that later cells can overwrite or add columns without triggering
        pandas' ``SettingWithCopyWarning``.
    """
    comparison_df = pd.read_csv(filename)
    # One combined mask is equivalent to filtering three times in a row
    keep_row = (
        comparison_df['issue'].isnull()
        & (comparison_df['adj.pvalue'] < P_VALUE_FILTER)
        & comparison_df['Label'].isin(LABELS_FILTER)
    )
    # .copy() detaches the result from the frame that was read from disk
    return comparison_df[keep_row].copy()

# Load the groupComparison results from P_VALUE_LOGFC_PATH and display the filtered table.
pvalue_logfc_df = construct_pvalue_logfc_df(P_VALUE_LOGFC_PATH)
pvalue_logfc_df

Unnamed: 0,Protein,Label,log2FC,SE,Tvalue,DF,pvalue,adj.pvalue,issue,MissingPercentage,ImputationPercentage
19890,BRD2_HUMAN,DMSO-DbET6,2.046185244,0.114339,17.895836,260.0,0.0,0.0,,0.310067,0.0
19935,BRD3_HUMAN,DMSO-DbET6,3.333427936,0.126571,26.336522,257.0,0.0,0.0,,0.252349,0.0
19980,BRD4_HUMAN,DMSO-DbET6,2.668934662,0.101283,26.351317,257.0,0.0,0.0,,0.118121,0.0
27900,CEBPZ_HUMAN,DMSO-DbET6,-0.291058829,0.07434,-3.915236,260.0,0.000115442,0.03005139,,0.05906,0.0
37530,CRNL1_HUMAN,DMSO-DbET6,-0.268053808,0.069816,-3.839419,260.0,0.000154958,0.03832114,,0.027517,0.0
41580,DAZP1_HUMAN,DMSO-DbET6,0.617508071,0.099537,6.203818,260.0,2.15995e-09,1.780518e-06,,0.286577,0.0
66060,FUBP2_HUMAN,DMSO-DbET6,0.291044501,0.077226,3.768747,260.0,0.000203053,0.04782372,,0.01745,0.0
66105,FUBP3_HUMAN,DMSO-DbET6,0.300409541,0.069882,4.298798,260.0,2.430235e-05,0.008941156,,0.095302,0.0
72855,GTPB4_HUMAN,DMSO-DbET6,-0.344028893,0.087806,-3.918077,260.0,0.000114166,0.03005139,,0.020134,0.0
75735,HEAT3_HUMAN,DMSO-DbET6,-0.409028514,0.100056,-4.08798,260.0,5.804139e-05,0.01794204,,0.127517,0.0


In [2]:
ABUNDANCE_PATH = "dataProcessOutput.csv" # Set this path yourself
ABUNDANCE_GROUPS_FILTER = ['DMSO', 'DbET6']
def construct_abundance_df(filename):
    """Read the dataProcess output and keep the rows of the selected groups.

    Only rows whose ``GROUP`` value is listed in ``ABUNDANCE_GROUPS_FILTER``
    are returned; ``filename`` is handed to ``pandas.read_csv`` unchanged.
    """
    abundance_df = pd.read_csv(filename)
    in_selected_groups = abundance_df['GROUP'].isin(ABUNDANCE_GROUPS_FILTER)
    return abundance_df[in_selected_groups]

# Load the dataProcess output from ABUNDANCE_PATH and display the group-filtered table.
protein_abundance_df = construct_abundance_df(ABUNDANCE_PATH)
protein_abundance_df

Unnamed: 0,RUN,Protein,LogIntensities,originalRUN,GROUP,SUBJECT,TotalGroupMeasurements,NumMeasuredFeature,MissingPercentage,more50missing,NumImputedFeature
0,1,1433B_HUMAN,12.873423,230719_THP-1_Chrom_end2end_Plate1_DMSO_A02_DIA,DMSO,2,1210,10,0.0,False,0
1,2,1433B_HUMAN,12.866217,230719_THP-1_Chrom_end2end_Plate1_DMSO_A05_DIA,DMSO,5,1210,10,0.0,False,0
2,3,1433B_HUMAN,12.686827,230719_THP-1_Chrom_end2end_Plate1_DMSO_A10_DIA,DMSO,10,1210,10,0.0,False,0
3,4,1433B_HUMAN,12.625462,230719_THP-1_Chrom_end2end_Plate1_DMSO_A12_DIA,DMSO,12,1210,10,0.0,False,0
4,5,1433B_HUMAN,12.538365,230719_THP-1_Chrom_end2end_Plate1_DMSO_B01_DIA,DMSO,13,1210,10,0.0,False,0
...,...,...,...,...,...,...,...,...,...,...,...
1189700,145,ZZZ3_HUMAN,10.725469,230719_THP-1_Chrom_end2end_Plate3_PF477736_D05,DbET6,233,169,10,0.0,False,0
1189701,146,ZZZ3_HUMAN,10.155338,230719_THP-1_Chrom_end2end_Plate3_DMSO_D06,DbET6,234,169,10,0.0,False,0
1189702,147,ZZZ3_HUMAN,9.700678,230719_THP-1_Chrom_end2end_Plate3_K975_D12,DbET6,240,169,10,0.0,False,0
1189703,148,ZZZ3_HUMAN,10.889323,230719_THP-1_Chrom_end2end_Plate3_VTP50469_F06,DbET6,258,169,10,0.0,False,0


## STEP 2: CORRELATION MATRIX

In [3]:
import numpy as np
def calculate_correlation_matrix(pvalue_df, protein_level_summary):
    """Correlate protein abundances across subjects.

    Parameters
    ----------
    pvalue_df : pandas.DataFrame
        Frame with a ``Protein`` column; its distinct values (in order of
        first appearance) become the rows and columns of the result.
    protein_level_summary : pandas.DataFrame
        Frame with ``Protein``, ``SUBJECT`` and ``LogIntensities`` columns.

    Returns
    -------
    pandas.DataFrame
        The ``DataFrame.corr()`` matrix of a subjects-by-proteins table of
        ``LogIntensities``.  When a (subject, protein) pair occurs more than
        once, the first occurrence is used; a pair that never occurs is
        treated as missing (NaN).
    """
    proteins = pvalue_df['Protein'].unique()
    subjects = protein_level_summary['SUBJECT'].unique()

    # Restrict to the proteins of interest once, instead of re-filtering the
    # whole abundance table for every protein and every subject.
    relevant = protein_level_summary[protein_level_summary['Protein'].isin(proteins)]
    # Keep the first measurement per (subject, protein) pair
    first_measurements = relevant.drop_duplicates(subset=['SUBJECT', 'Protein'], keep='first')

    abundance_by_subject = (
        first_measurements
        .pivot(index='SUBJECT', columns='Protein', values='LogIntensities')
        # Restore subject/protein order and add all-NaN rows/columns for
        # subjects or proteins that have no measurement at all.
        .reindex(index=subjects, columns=proteins)
        # Drop the axis names introduced by pivot so the result is unlabeled
        .rename_axis(index=None, columns=None)
    )
    return abundance_by_subject.corr()

# Correlate the abundances of the proteins that passed the p-value filter and display the matrix.
corr_matrix = calculate_correlation_matrix(pvalue_logfc_df, protein_abundance_df)
corr_matrix

Unnamed: 0,BRD2_HUMAN,BRD3_HUMAN,BRD4_HUMAN,CEBPZ_HUMAN,CRNL1_HUMAN,DAZP1_HUMAN,FUBP2_HUMAN,FUBP3_HUMAN,GTPB4_HUMAN,HEAT3_HUMAN,...,KRR1_HUMAN,NOP2_HUMAN,PP1R8_HUMAN,PRP18_HUMAN,QKI_HUMAN,TIA1_HUMAN,UTP15_HUMAN,WDR12_HUMAN,WDR43_HUMAN,WDR75_HUMAN
BRD2_HUMAN,1.0,0.797708,0.879262,0.01985,-0.207924,0.598796,0.515316,0.557954,-0.09377,-0.429264,...,-0.118297,-0.225047,0.621668,0.255623,0.479966,0.544447,-0.466654,-0.26525,-0.253804,-0.157479
BRD3_HUMAN,0.797708,1.0,0.866766,-0.271567,-0.104151,0.312657,0.307522,0.226802,-0.387214,-0.175765,...,-0.192796,-0.302674,0.422305,0.483052,0.660086,0.185024,-0.322859,-0.183338,-0.316502,-0.223405
BRD4_HUMAN,0.879262,0.866766,1.0,-0.027827,-0.155598,0.552971,0.43537,0.467304,-0.034894,-0.385975,...,-0.068881,-0.274753,0.472644,0.275248,0.499226,0.489967,-0.423467,-0.236744,-0.388817,-0.146948
CEBPZ_HUMAN,0.01985,-0.271567,-0.027827,1.0,0.329054,0.36781,0.363832,0.413635,0.765247,-0.066058,...,0.575716,0.447506,0.250952,-0.262162,-0.105441,0.483017,0.20818,0.236918,0.353536,0.377506
CRNL1_HUMAN,-0.207924,-0.104151,-0.155598,0.329054,1.0,-0.035805,0.107018,0.026768,0.230088,0.231355,...,0.396803,0.527981,-0.047657,0.219325,0.15109,-0.183671,0.600379,0.558789,0.56693,0.510267
DAZP1_HUMAN,0.598796,0.312657,0.552971,0.36781,-0.035805,1.0,0.608002,0.743681,0.329881,-0.311158,...,0.199983,0.006942,0.595334,0.0626,0.372729,0.751419,-0.268644,-0.070602,-0.132739,0.042042
FUBP2_HUMAN,0.515316,0.307522,0.43537,0.363832,0.107018,0.608002,1.0,0.610191,0.168864,-0.218615,...,0.282158,0.194035,0.513343,0.237327,0.469106,0.589631,-0.048102,0.109012,0.165,0.105673
FUBP3_HUMAN,0.557954,0.226802,0.467304,0.413635,0.026768,0.743681,0.610191,1.0,0.419721,-0.322835,...,0.156421,0.0709,0.604436,0.042284,0.303556,0.790524,-0.295023,-0.032122,0.020199,0.132255
GTPB4_HUMAN,-0.09377,-0.387214,-0.034894,0.765247,0.230088,0.329881,0.168864,0.419721,1.0,-0.044,...,0.48497,0.288195,0.11246,-0.407856,-0.330488,0.534385,0.066697,0.030891,0.189041,0.265675
HEAT3_HUMAN,-0.429264,-0.175765,-0.385975,-0.066058,0.231355,-0.311158,-0.218615,-0.322835,-0.044,1.0,...,0.199914,0.096674,-0.343605,0.141172,-0.025871,-0.320925,0.445552,0.254046,0.246794,-0.046507


## STEP 3: ID CONVERSION

At this step, we begin to interact with INDRA's interfaces.  

The dataset provided in the example above contains UniProt mnemonic IDs, whereas INDRA stores information based on HGNC IDs.  Luckily, INDRA has code to convert UniProt mnemonic IDs into HGNC IDs.  For now, we will store this mapping in a separate dictionary.

In [4]:
from indra.databases import uniprot_client

def uniprot_to_hgnc_id(uniprot_mnemonic):
    """Translate a UniProt mnemonic into an HGNC ID.

    Returns ``None`` when ``get_id_from_mnemonic`` yields no (or an empty)
    UniProt ID for the mnemonic.
    """
    resolved_id = uniprot_client.get_id_from_mnemonic(uniprot_mnemonic)
    if not resolved_id:
        return None
    return uniprot_client.get_hgnc_id(resolved_id)

uniprot_to_hgnc_id("BRD2_HUMAN")

  from .autonotebook import tqdm as notebook_tqdm


'1103'

In [5]:
def uniprot_to_hgnc_gene_name(uniprot_mnemonic):
    """Get an HGNC gene name from a UniProt mnemonic.

    The mnemonic is first resolved to a UniProt ID, the same way
    ``uniprot_to_hgnc_id`` does, because INDRA documents ``get_gene_name`` as
    taking a UniProt ID.  If the mnemonic cannot be resolved, the identifier
    is passed through unchanged, which is what this function did before.

    Returns whatever ``uniprot_client.get_gene_name`` returns for the
    identifier (presumably ``None`` when no gene name is known -- confirm
    against the INDRA documentation).
    """
    uniprot_id = uniprot_client.get_id_from_mnemonic(uniprot_mnemonic)
    # `or` keeps the old pass-through behaviour for unresolved identifiers
    gene_name = uniprot_client.get_gene_name(uniprot_id or uniprot_mnemonic)
    return gene_name
uniprot_to_hgnc_gene_name("BRD2_HUMAN")

'BRD2'

In [6]:
def create_uniprot_to_hgnc_id_mapping(pandas_df):
    """Map each distinct ``Protein`` value of ``pandas_df`` to its HGNC ID.

    Values come from ``uniprot_to_hgnc_id`` and may therefore be ``None``.
    """
    return {
        mnemonic: uniprot_to_hgnc_id(mnemonic)
        for mnemonic in pandas_df['Protein'].unique()
    }

uniprot_to_hgnc_id_mapping = create_uniprot_to_hgnc_id_mapping(pvalue_logfc_df)
uniprot_to_hgnc_id_mapping

{'BRD2_HUMAN': '1103',
 'BRD3_HUMAN': '1104',
 'BRD4_HUMAN': '13575',
 'CEBPZ_HUMAN': '24218',
 'CRNL1_HUMAN': '15762',
 'DAZP1_HUMAN': '2683',
 'FUBP2_HUMAN': '6316',
 'FUBP3_HUMAN': '4005',
 'GTPB4_HUMAN': '21535',
 'HEAT3_HUMAN': '26087',
 'HNRPD_HUMAN': '5036',
 'KRR1_HUMAN': '5176',
 'NOP2_HUMAN': '7867',
 'PP1R8_HUMAN': '9296',
 'PRP18_HUMAN': '17351',
 'QKI_HUMAN': '21100',
 'TIA1_HUMAN': '11802',
 'UTP15_HUMAN': '25758',
 'WDR12_HUMAN': '14098',
 'WDR43_HUMAN': '28945',
 'WDR75_HUMAN': '25725'}

In [7]:
def create_uniprot_to_hgnc_gene_name_mapping(pandas_df):
    """Map each distinct ``Protein`` value of ``pandas_df`` to its HGNC gene name.

    Values are whatever ``uniprot_to_hgnc_gene_name`` returns for the entry.
    """
    return {
        mnemonic: uniprot_to_hgnc_gene_name(mnemonic)
        for mnemonic in pandas_df['Protein'].unique()
    }

uniprot_to_hgnc_gene_name_mapping = create_uniprot_to_hgnc_gene_name_mapping(pvalue_logfc_df)
uniprot_to_hgnc_gene_name_mapping

{'BRD2_HUMAN': 'BRD2',
 'BRD3_HUMAN': 'BRD3',
 'BRD4_HUMAN': 'BRD4',
 'CEBPZ_HUMAN': 'CEBPZ',
 'CRNL1_HUMAN': 'CRNKL1',
 'DAZP1_HUMAN': 'DAZAP1',
 'FUBP2_HUMAN': 'KHSRP',
 'FUBP3_HUMAN': 'FUBP3',
 'GTPB4_HUMAN': 'GTPBP4',
 'HEAT3_HUMAN': 'HEATR3',
 'HNRPD_HUMAN': 'HNRNPD',
 'KRR1_HUMAN': 'KRR1',
 'NOP2_HUMAN': 'NOP2',
 'PP1R8_HUMAN': 'PPP1R8',
 'PRP18_HUMAN': 'PRPF18',
 'QKI_HUMAN': 'QKI',
 'TIA1_HUMAN': 'TIA1',
 'UTP15_HUMAN': 'UTP15',
 'WDR12_HUMAN': 'WDR12',
 'WDR43_HUMAN': 'WDR43',
 'WDR75_HUMAN': 'WDR75'}

## STEP 4: QUERY INDRA

With all of the information from MSstats regarding low p-value and highly correlated proteins, we can search INDRA for any evidence related to the discovered proteins.

In [8]:
from indra.sources.indra_db_rest.api import get_statements_from_query
from indra.sources.indra_db_rest.query import HasAgent, HasType, HasEvidenceBound

EVIDENCE_BOUND_FILTER = 25 # Adjust this yourself

# Collect the gene names to query.  Entries without a gene name (falsy values,
# presumably proteins INDRA could not map -- confirm) are skipped, and repeated
# names are only queried once.
gene_names = []
for value in uniprot_to_hgnc_gene_name_mapping.values():
    if value and value not in gene_names:
        gene_names.append(value)
if not gene_names:
    # Without this check the code below would fail on `None & ...`
    raise ValueError("No gene names available to build the INDRA query")

# OR together one HasAgent constraint per gene name.  The accumulator is
# compared with `is None` so the loop does not depend on how INDRA query
# objects evaluate in a boolean context.
agentQuery = None
for value in gene_names:
    agent_constraint = HasAgent(value)
    agentQuery = agent_constraint if agentQuery is None else agentQuery | agent_constraint

# Require the configured minimum evidence bound in addition to the agent match
query = agentQuery & HasEvidenceBound([f'>= {EVIDENCE_BOUND_FILTER}'])
p = get_statements_from_query(query, sort_by = "belief")
p.statements[0]

INFO: [2024-04-18 18:49:02] indra_db_rest.query_processor - Retrieving statements that (have an agent where NAME=BRD2, have an agent where NAME=BRD3, have an agent where NAME=BRD4, have an agent where NAME=CEBPZ, have an agent where NAME=CRNKL1, have an agent where NAME=DAZAP1, have an agent where NAME=FUBP3, have an agent where NAME=GTPBP4, have an agent where NAME=HEATR3, have an agent where NAME=HNRNPD, have an agent where NAME=KHSRP, have an agent where NAME=KRR1, have an agent where NAME=NOP2, have an agent where NAME=PPP1R8, have an agent where NAME=PRPF18, have an agent where NAME=QKI, have an agent where NAME=TIA1, have an agent where NAME=UTP15, have an agent where NAME=WDR12, have an agent where NAME=WDR43, or have an agent where NAME=WDR75) and have >= 25 evidence.
INFO: [2024-04-18 18:49:02] indra_db_rest.request_logs - Running 0th request for statements
INFO: [2024-04-18 18:49:02] indra_db_rest.request_logs -   LIMIT: None
INFO: [2024-04-18 18:49:02] indra_db_rest.request_

Phosphorylation(None, DAZAP1())

In [9]:
from indra.assemblers.html import HtmlAssembler
# Assemble the statements retrieved by the query above, passing along the
# evidence and source counts reported by the query processor `p`.
ha = HtmlAssembler(p.statements,
                   title='INDRA subnetwork statements',
                   db_rest_url='https://db.indra.bio',
                   ev_counts=p.get_ev_counts(),
                   source_counts=p.get_source_counts())
# The assembled model is kept in html_str so it can be rendered with HTML(html_str)
html_str = ha.make_model()

INFO: [2024-04-18 18:49:08] indra.assemblers.html.assembler - Removing CHEBI from refs due to too many matches: {'CHEBI:95080', 'CHEBI:137113'}


In [10]:
from IPython.core.display import HTML
# Uncomment the line below to render the assembled statements inline in the notebook.
# HTML(html_str)

You can also query for drug information.  In this example notebook, we query dBET6 to see which of its interactions are already known to INDRA.

In [11]:
# Query for https://db.indra.bio/statements/from_agents?subject=dBET6&format=html
from indra.sources.indra_db_rest.api import get_statements_from_query
from indra.sources.indra_db_rest.query import HasAgent
# Retrieve the statements that have dBET6 as an agent, sorted by belief.
# NOTE: this rebinds query and p from the protein query cell above.
query = HasAgent("dBET6")
p = get_statements_from_query(query, sort_by = "belief")

# NOTE: ha and html_str from the previous HTML cell are overwritten here.
ha = HtmlAssembler(p.statements,
                   title='INDRA subnetwork statements',
                   db_rest_url='https://db.indra.bio',
                   ev_counts=p.get_ev_counts(),
                   source_counts=p.get_source_counts())
html_str = ha.make_model()
# Uncomment the line below to render the assembled statements inline in the notebook.
# HTML(html_str)

INFO: [2024-04-18 18:49:08] indra_db_rest.query_processor - Retrieving statements that have an agent where NAME=dBET6.
INFO: [2024-04-18 18:49:08] indra_db_rest.request_logs - Running 0th request for statements
INFO: [2024-04-18 18:49:08] indra_db_rest.request_logs -   LIMIT: None
INFO: [2024-04-18 18:49:08] indra_db_rest.request_logs -   OFFSET: 0


## STEP 5: INDRA SUBNETWORK RELATIONS

Using INDRA CoGEx, we can extract subnetwork relationships among the proteins from the MSstats output.

In [12]:
import requests

def query_indra_subnetwork(groundings):
    """Return a list of INDRA subnetwork relations for a list of groundings.

    Parameters
    ----------
    groundings : list
        Node identifiers, sent as the ``nodes`` field of the JSON request
        body.  This notebook passes ``('HGNC', hgnc_id)`` pairs.

    Returns
    -------
    The decoded JSON body of the service response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx or 5xx status code.
    """
    res = requests.post(
        'https://discovery.indra.bio/api/indra_subnetwork_relations',
        json={'nodes': groundings}
    )
    # Fail with the HTTP status instead of trying to decode an error response
    # as if it were the list of relations.
    res.raise_for_status()
    return res.json()

In [13]:
# uniprot_to_hgnc_id returns None for mnemonics it cannot resolve, so the
# mapping may contain None values; those are not valid groundings and are
# left out of the request.
groundings = [
    ('HGNC', hgnc_id)
    for hgnc_id in uniprot_to_hgnc_id_mapping.values()
    if hgnc_id is not None
]
subnetwork_relations = query_indra_subnetwork(groundings)
subnetwork_relations[0]

{'data': {'belief': 0.65,
  'evidence_count': 1,
  'has_database_evidence': False,
  'has_reader_evidence': True,
  'has_retracted_evidence': False,
  'medscan_only': False,
  'source_counts': '{"sparser": 1}',
  'sparser_only': True,
  'stmt_hash': 6100415255007272,
  'stmt_json': '{"type": "Complex", "members": [{"name": "RAD21", "db_refs": {"UP": "O60216", "TEXT": "RAD21", "HGNC": "9811", "EGID": "5885"}}, {"name": "BRD2", "db_refs": {"UP": "P25440", "TEXT": "BRD2", "HGNC": "1103", "EGID": "6046"}}, {"name": "BRD4", "db_refs": {"UP": "O60885", "TEXT": "BRD4", "HGNC": "13575", "EGID": "23476"}}], "belief": 0.65, "evidence": [{"source_api": "sparser", "pmid": "28107481", "text": "However, we were unable to demonstrate any direct physical interaction between BRD2 or BRD4 with cohesin subunit RAD21 ( xref ), suggesting the conformational control of KSHV latency involves additional factors.", "annotations": {"found_by": "INTERACT"}, "text_refs": {"PMID": "28107481", "TRID": 16352739, "PM

In [14]:
import json
from indra.statements import stmts_from_json

# Gather statistics for HTML presentation.  Every lookup table below is keyed
# by statement hash and built from the 'data' payload of each relation.
unique_stmts = {
    payload['stmt_hash']: json.loads(payload['stmt_json'])
    for payload in (relation['data'] for relation in subnetwork_relations)
}
ev_counts_by_hash = {
    payload['stmt_hash']: payload['evidence_count']
    for payload in (relation['data'] for relation in subnetwork_relations)
}
source_counts_by_hash = {
    payload['stmt_hash']: json.loads(payload['source_counts'])
    for payload in (relation['data'] for relation in subnetwork_relations)
}
# Turn the de-duplicated statement JSON back into INDRA Statement objects
stmt_json_list = list(unique_stmts.values())
stmts = stmts_from_json(stmt_json_list)

In [15]:
# Assemble the subnetwork statements, using the per-hash evidence and source
# counts gathered from the subnetwork relations.
# NOTE: ha and html_str from the earlier HTML cells are overwritten here.
ha = HtmlAssembler(stmts,
                   title='INDRA subnetwork statements',
                   db_rest_url='https://db.indra.bio',
                   ev_counts=ev_counts_by_hash,
                   source_counts=source_counts_by_hash)
html_str = ha.make_model()

In [16]:
from IPython.core.display import HTML
# Uncomment the line below to render the assembled subnetwork statements inline in the notebook.
# HTML(html_str)

## STEP 6: NETWORK VISUALIZATION

We can also visualize the subnetwork acquired from INDRA CoGEx using an INDRA built-in assembler.

In [17]:
from indra.assemblers.graph.assembler import GraphAssembler

# Build a graph model from the subnetwork statements and write it to
# graph.pdf, laid out with the 'dot' program.
ga = GraphAssembler(stmts=stmts)
ga.make_model()
ga.save_pdf(file_name='graph.pdf', prog='dot')

## STEP 7: TABULAR FORMAT

INDRA also has an assembler for displaying INDRA CoGEx results as a table.

In [18]:
from indra.assemblers.indranet.assembler import IndraNetAssembler

def add_evidence_column(stmt, ev_counts = ev_counts_by_hash):
    """Return the evidence count stored for ``stmt``.

    The statement hash is recomputed (``refresh=True``) and used as the key
    into ``ev_counts``; a hash that is not in ``ev_counts`` raises ``KeyError``.
    """
    return ev_counts[stmt.get_hash(refresh=True)]

indra_net_assembler = IndraNetAssembler(statements=stmts)
# Build the relations table with an extra evidence_count column and list the
# best-supported relations first.
relations_table = (
    indra_net_assembler
    .make_df(extra_columns=[('evidence_count', add_evidence_column)])
    .sort_values(by=['evidence_count'], ascending=False)
)
relations_table

Unnamed: 0,agA_name,agB_name,agA_ns,agA_id,agB_ns,agB_id,residue,position,stmt_type,evidence_count,stmt_hash,belief,source_counts,initial_sign
119,BRD4,BRD2,HGNC,13575,HGNC,1103,,,Complex,113,12484535149707124,0.999950,{'reach': 1},
118,BRD2,BRD4,HGNC,1103,HGNC,13575,,,Complex,113,12484535149707124,0.999950,{'reach': 1},
231,BRD4,BRD3,HGNC,13575,HGNC,1104,,,Complex,51,359233681958482,0.999975,{'sparser': 1},
232,BRD3,BRD4,HGNC,1104,HGNC,13575,,,Complex,51,359233681958482,0.999975,{'sparser': 1},
151,BRD2,BRD3,HGNC,1103,HGNC,1104,,,Complex,42,-16515703320827288,0.999950,{'sparser': 1},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,JMJD6,BRD2,HGNC,19355,HGNC,1103,,,Complex,1,34194495135899326,0.650000,{'sparser': 1},
158,JMJD6,BRD3,HGNC,19355,HGNC,1104,,,Complex,1,34194495135899326,0.650000,{'sparser': 1},
159,KRR1,NOP2,HGNC,5176,HGNC,7867,,,Complex,1,-24130039704357981,0.980000,{'biogrid': 1},
160,NOP2,KRR1,HGNC,7867,HGNC,5176,,,Complex,1,-24130039704357981,0.980000,{'biogrid': 1},


## ADDITIONAL: ADHOC VISUALIZATIONS WITH MSSTATS INFORMATION

In [19]:
LOG_FC_FILTER = 0.25
# Rebind pvalue_logfc_df to a new frame with log2FC as float instead of
# writing the column into the existing frame: that frame is a filtered
# selection, and in-place column assignment on it triggers pandas'
# SettingWithCopyWarning.
pvalue_logfc_df = pvalue_logfc_df.assign(log2FC=pvalue_logfc_df['log2FC'].astype(float))
# Keep the proteins whose log2FC exceeds the threshold
logfc_proteins = pvalue_logfc_df[pvalue_logfc_df['log2FC'] > LOG_FC_FILTER]
logfc_proteins

Unnamed: 0,Protein,Label,log2FC,SE,Tvalue,DF,pvalue,adj.pvalue,issue,MissingPercentage,ImputationPercentage
19890,BRD2_HUMAN,DMSO-DbET6,2.046185,0.114339,17.895836,260.0,0.0,0.0,,0.310067,0.0
19935,BRD3_HUMAN,DMSO-DbET6,3.333428,0.126571,26.336522,257.0,0.0,0.0,,0.252349,0.0
19980,BRD4_HUMAN,DMSO-DbET6,2.668935,0.101283,26.351317,257.0,0.0,0.0,,0.118121,0.0
41580,DAZP1_HUMAN,DMSO-DbET6,0.617508,0.099537,6.203818,260.0,2.15995e-09,2e-06,,0.286577,0.0
66060,FUBP2_HUMAN,DMSO-DbET6,0.291045,0.077226,3.768747,260.0,0.000203053,0.047824,,0.01745,0.0
66105,FUBP3_HUMAN,DMSO-DbET6,0.30041,0.069882,4.298798,260.0,2.430235e-05,0.008941,,0.095302,0.0
137745,PP1R8_HUMAN,DMSO-DbET6,0.358282,0.069056,5.188314,260.0,4.275951e-07,0.000235,,0.045638,0.0
140535,PRP18_HUMAN,DMSO-DbET6,0.485713,0.120799,4.020834,260.0,7.603588e-05,0.022122,,0.049664,0.0
145440,QKI_HUMAN,DMSO-DbET6,0.336985,0.05513,6.112567,260.0,3.570594e-09,3e-06,,0.0,0.0
196155,TIA1_HUMAN,DMSO-DbET6,0.476791,0.105361,4.525306,260.0,9.182867e-06,0.004129,,0.058389,0.0


In [21]:
CORRELATION_FILTER = 0.3
EVIDENCE_FILTER = 10

# Gene name -> UniProt mnemonic, built once instead of scanning the mapping
# for every node and edge.  setdefault keeps the first mnemonic seen for a
# gene name, which is the entry a list.index() lookup would have returned.
gene_name_to_uniprot = {}
for uniprot_mnemonic, gene_name in uniprot_to_hgnc_gene_name_mapping.items():
    gene_name_to_uniprot.setdefault(gene_name, uniprot_mnemonic)

# Gene names of the proteins in logfc_proteins (log2FC above LOG_FC_FILTER).
# This has to be a set of values: `x in some_series` tests the Series index
# labels, not the values, so a membership test on the Series never matches.
high_logfc_gene_names = {
    uniprot_to_hgnc_gene_name_mapping[protein_id]
    for protein_id in logfc_proteins['Protein']
}

# Node styling:
#   gray  - node is not one of the MSstats proteins
#   green - MSstats protein with log2FC above LOG_FC_FILTER
#   red   - any other MSstats protein
for node in ga.graph.nodes():
    node_properties = {
        'color': '#808080',
        'shape': 'Mrecord',
        'fontsize': 8
    }
    uniprot_id = gene_name_to_uniprot.get(node)
    if uniprot_id is None:
        ga.graph.add_node(node,
                          label=node,
                          **node_properties)
        continue
    if node in high_logfc_gene_names:
        node_properties['color'] = '#00FF00'
    else:
        node_properties['color'] = '#FF0000'
    logFC_value = round(pvalue_logfc_df[pvalue_logfc_df['Protein'] == uniprot_id]['log2FC'].iloc[0], 2)
    ga.graph.add_node(node,
                      label=f'{node}: {logFC_value} LogFC',
                      **node_properties)

# Edge styling (only for edges whose two ends are both MSstats proteins):
#   green - evidence >= EVIDENCE_FILTER and correlation > CORRELATION_FILTER
#   red   - otherwise
# All other edges keep the black default.
color = '#ff0000'
color_default = '#000000'
for edge in ga.graph.edges():
    params = {'color': color_default,
              'arrowhead': 'normal',
              'dir': 'forward'}
    uniprot_0 = gene_name_to_uniprot.get(edge[0])
    uniprot_1 = gene_name_to_uniprot.get(edge[1])
    if uniprot_0 is not None and uniprot_1 is not None:
        evidence_df = relations_table[
            (relations_table['agA_name'] == edge[0])
            & (relations_table['agB_name'] == edge[1])
        ]
        # An edge without a matching row in relations_table keeps the default
        # styling instead of failing on .iloc[0] of an empty selection.
        if not evidence_df.empty:
            correlation = round(corr_matrix[uniprot_0][uniprot_1], 2)
            # First matching row, i.e. the top one in relations_table's current order
            evidence = evidence_df['evidence_count'].iloc[0]
            if evidence >= EVIDENCE_FILTER and correlation > CORRELATION_FILTER:
                params['color'] = '#00ff00'
            else:
                params['color'] = color
            params['label'] = f'Correlation: {correlation}, Evidence: {evidence}'
    ga.graph.add_edge(edge[0], edge[1], **params)

ga.save_pdf(file_name='graph2.pdf', prog='dot')