# Export Adjacency Matrix for MRF

In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
from linker.models import Analysis
from linker.views import get_last_data
from linker.constants import *


        MATCH (n:Species) RETURN n.displayName AS name order by name        
        

            MATCH (tp:TopLevelPathway)-[:hasEvent*]->(p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent)
            WHERE
                tp.displayName = 'Metabolism' AND
                tp.speciesName IN {species_list} AND
                (p)-[:hasEvent]->(rle)
            RETURN DISTINCT
                p.speciesName AS species_name,            
                p.displayName AS pathway_name,
                p.stId AS pathway_id                       
            ORDER BY species_name, pathway_name
        


### Load the analysis from django

In [3]:
# Primary key of the Analysis record whose data this notebook exports.
analysis_id = 123
# Look the record up by primary key through the Django model manager.
analysis = Analysis.objects.get(pk=analysis_id)

In [4]:
# Build a dict of DataFrames, one per relation type listed in DataRelationType,
# from whatever get_last_data returns for this analysis. A relation type whose
# loading raises IndexError or KeyError is skipped, so it will simply be
# missing from `data`.
data = {}
for relation_type, _ in DataRelationType:
    try:
        latest = get_last_data(analysis, relation_type)
        data[relation_type] = pd.DataFrame(latest.json_data)
    except (IndexError, KeyError):
        continue

In [5]:
# Pull the individual tables out of `data` under descriptive names.
# A KeyError here means the corresponding relation type was skipped by the
# loading loop above (it silently ignores IndexError/KeyError).

# Entity tables.
gene_df = data[GENOMICS]
protein_df = data[PROTEOMICS]
compound_df = data[METABOLOMICS]
# Link tables between entity types; the functions below join on the
# *_pk columns (gene_pk, protein_pk, compound_pk, reaction_pk, pathway_pk).
gene_2_protein_df = data[GENES_TO_PROTEINS]
protein_2_reaction_df = data[PROTEINS_TO_REACTIONS]
compound_2_reaction_df = data[COMPOUNDS_TO_REACTIONS]
reaction_2_pathway_df = data[REACTIONS_TO_PATHWAYS]

### Some useful functions

In [6]:
def df_to_dict(df, key_col, value_col):
    """Group `df` by `key_col` and collect each group's `value_col` values.

    Args:
        df: DataFrame containing both columns.
        key_col: Name of the column whose distinct values become dict keys.
        value_col: Name of the column whose values are gathered per key.

    Returns:
        A dict mapping each distinct `key_col` value to the set of
        `value_col` values found in the rows sharing that key.
    """
    grouped = {}
    for key, rows in df.groupby(key_col):
        grouped[key] = set(rows[value_col].tolist())
    return grouped

In [7]:
def pathway_to_gene(gene_df, gene_2_protein_df, protein_2_reaction_df, reaction_2_pathway_df):
    """Map each pathway to the set of genes connected to it.

    Genes are inner-joined to proteins (on gene_pk), then to reactions
    (on protein_pk), then to pathways (on reaction_pk), so only genes with a
    complete gene -> protein -> reaction -> pathway chain survive.

    Returns:
        dict of pathway_pk -> set of gene_pk.
    """
    joined = (
        gene_df
        .merge(gene_2_protein_df, how='inner', on='gene_pk')
        .merge(protein_2_reaction_df, how='inner', on='protein_pk')
        .merge(reaction_2_pathway_df, how='inner', on='reaction_pk')
    )
    # '-' cells are turned into NaN so that dropna() discards those rows.
    pairs = joined[['gene_pk', 'pathway_pk']].replace('-', np.nan).dropna()
    return df_to_dict(pairs, 'pathway_pk', 'gene_pk')

In [8]:
def pathway_to_protein(protein_df, protein_2_reaction_df, reaction_2_pathway_df):
    """Map each pathway to the set of proteins connected to it.

    Proteins are inner-joined to reactions (on protein_pk) and then to
    pathways (on reaction_pk).

    Returns:
        dict of pathway_pk -> set of protein_pk.
    """
    joined = (
        protein_df
        .merge(protein_2_reaction_df, how='inner', on='protein_pk')
        .merge(reaction_2_pathway_df, how='inner', on='reaction_pk')
    )
    # '-' cells are turned into NaN so that dropna() discards those rows.
    pairs = joined[['protein_pk', 'pathway_pk']].replace('-', np.nan).dropna()
    return df_to_dict(pairs, 'pathway_pk', 'protein_pk')

In [9]:
def pathway_to_compound(compound_df, compound_2_reaction_df, reaction_2_pathway_df):
    """Map each pathway to the set of compounds connected to it.

    Compounds are inner-joined to reactions (on compound_pk) and then to
    pathways (on reaction_pk).

    Returns:
        dict of pathway_pk -> set of compound_pk.
    """
    joined = (
        compound_df
        .merge(compound_2_reaction_df, how='inner', on='compound_pk')
        .merge(reaction_2_pathway_df, how='inner', on='reaction_pk')
    )
    # '-' cells are turned into NaN so that dropna() discards those rows.
    pairs = joined[['compound_pk', 'pathway_pk']].replace('-', np.nan).dropna()
    return df_to_dict(pairs, 'pathway_pk', 'compound_pk')

In [10]:
def gene_to_reaction(gene_df, gene_2_protein_df, protein_2_reaction_df):
    """Map each gene to the set of reactions it is connected to.

    Genes are inner-joined to proteins (on gene_pk) and then to reactions
    (on protein_pk).

    Returns:
        dict of gene_pk -> set of reaction_pk.
    """
    joined = (
        gene_df
        .merge(gene_2_protein_df, how='inner', on='gene_pk')
        .merge(protein_2_reaction_df, how='inner', on='protein_pk')
    )
    # '-' cells are turned into NaN so that dropna() discards those rows.
    pairs = joined[['gene_pk', 'reaction_pk']].replace('-', np.nan).dropna()
    return df_to_dict(pairs, 'gene_pk', 'reaction_pk')

In [11]:
def compound_to_reaction(compound_df, compound_2_reaction_df, reaction_2_pathway_df):
    """Map each compound to the set of reactions it is connected to.

    Compounds are inner-joined to reactions on compound_pk.

    Note:
        `reaction_2_pathway_df` is accepted but never read; it is kept in the
        signature so existing callers keep working.

    Returns:
        dict of compound_pk -> set of reaction_pk.
    """
    joined = compound_df.merge(compound_2_reaction_df, how='inner', on='compound_pk')
    # '-' cells are turned into NaN so that dropna() discards those rows.
    pairs = joined[['compound_pk', 'reaction_pk']].replace('-', np.nan).dropna()
    return df_to_dict(pairs, 'compound_pk', 'reaction_pk')

In [12]:
# pathway_pk -> set of gene_pk, via the gene -> protein -> reaction -> pathway joins.
pathway_to_gene_dict = pathway_to_gene(gene_df, gene_2_protein_df, protein_2_reaction_df, reaction_2_pathway_df)

In [13]:
# NOTE: despite the "protein_2_pathway" name, this dict is keyed by pathway_pk
# and its values are sets of protein_pk (that is what pathway_to_protein returns).
protein_2_pathway_dict = pathway_to_protein(protein_df, protein_2_reaction_df, reaction_2_pathway_df)

In [14]:
# NOTE: despite the "compound_2_pathway" name, this dict is keyed by pathway_pk
# and its values are sets of compound_pk (that is what pathway_to_compound returns).
compound_2_pathway_dict = pathway_to_compound(compound_df, compound_2_reaction_df, reaction_2_pathway_df)

In [15]:
# gene_pk -> set of reaction_pk, via the gene -> protein -> reaction joins.
gene_to_reaction_dict = gene_to_reaction(gene_df, gene_2_protein_df, protein_2_reaction_df)

In [16]:
# compound_pk -> set of reaction_pk. reaction_2_pathway_df is passed only to
# satisfy the signature; compound_to_reaction does not read it.
compound_to_reaction_dict = compound_to_reaction(compound_df, compound_2_reaction_df, reaction_2_pathway_df)

### Generate adjacency matrices for each pathway

For each pathway $k$ we compute $D_k$, where $D_k$ is the number of entities in pathway $k$ found under the same reaction.

Since we don't have protein data, we assume that the presence of a gene implies the presence of the protein it produces.

In [17]:
# Every pathway that has at least one mapped gene or one mapped compound
# (both dicts are keyed by pathway_pk). Pathways known only through
# protein_2_pathway_dict are not included.
all_pathways = pathway_to_gene_dict.keys() | compound_2_pathway_dict.keys()

In [18]:
# Names of the columns in gene_df and compound_df that are compared against
# pval_threshold to select significant entities (presumably adjusted p-values
# for the infected-vs-uninfected comparison, going by the name — confirm
# against the data).
gene_comparison_col = 'padj_INFEC_vs_UN'
compound_comparison_col = 'padj_INFEC_vs_UN'
# An entity counts as significant when its value is strictly below this cutoff.
pval_threshold = 0.15

In [19]:
# Select the entities whose comparison-column value is strictly below the
# threshold. Rows with a missing value compare False and are left out.
gene_mask = gene_df[gene_comparison_col] < pval_threshold
compound_mask = compound_df[compound_comparison_col] < pval_threshold

significant_genes = set(gene_df.loc[gene_mask, 'gene_pk'].tolist())
significant_compounds = set(compound_df.loc[compound_mask, 'compound_pk'].tolist())

# Report how many genes and compounds passed the cutoff.
for selected in (significant_genes, significant_compounds):
    print(len(selected))

2499
6
