# Statistics per BEL File

This notebook calculates, for each BEL file, statistics on the types of entities (nodes) and relationships (edges) it contains.

In [1]:
import os
import sys
import time

import pybel
import pandas as pd

import covid19kg

### Notebook Provenance
This section records the time of execution and the versions of the software packages used.

In [2]:
# Python interpreter version used for this run
sys.version

'3.7.4 (v3.7.4:e09359112e, Jul  8 2019, 14:54:52) \n[Clang 6.0 (clang-600.0.57)]'

In [3]:
# Local time at which this cell was executed
time.asctime()

'Sun Apr 12 09:44:42 2020'

In [4]:
# Installed PyBEL version
pybel.get_version()

'0.14.6'

In [5]:
# Notebooks do not define ``__file__``; the original idiom passed the quoted
# string '__file__' to dirname(), which yields '' and therefore resolves to the
# current working directory. Spell that out directly.
HERE = os.path.abspath(os.curdir)

# Folder scanned for cached BEL graphs: two levels above HERE, then
# covid19kg/covid19kg (presumably the package folder -- confirm repo layout).
BEL_DIRECTORY = os.path.abspath(
    os.path.join(HERE, os.pardir, os.pardir, 'covid19kg', 'covid19kg')
)

def get_cached_bel_files(directory=None):
    """Return the names of all cached BEL files in a directory.

    :param directory: folder to scan; when omitted, ``BEL_DIRECTORY`` is used
    :type directory: str or None
    :return: file names (not full paths) ending in ``.bel.nodelink.json``, in
        the order produced by :func:`os.listdir` (callers that need a stable
        order should sort the result)
    :rtype: list[str]
    """
    # Resolve the default at call time so the global can be defined/changed later.
    if directory is None:
        directory = BEL_DIRECTORY

    return [
        filename
        for filename in os.listdir(directory)
        if filename.endswith(".bel.nodelink.json")
    ]

In [6]:
def get_bel_types(bel_path):
    """Summarise one cached BEL graph as a flat dictionary of counts.

    :param str bel_path: path to a cached BEL graph (node-link JSON file)
    :return: total node and edge counts under the keys ``nodes`` and ``edges``,
        followed by the per-function counts and the per-relation counts
        reported by PyBEL's summary helpers
    :rtype: dict
    """
    graph = pybel.from_nodelink_file(bel_path)

    # Counts keyed by BEL function type and by BEL edge (relation) type
    function_counts = pybel.struct.summary.count_functions(graph)
    relation_counts = pybel.struct.summary.edge_summary.count_relations(graph)

    # Later entries win on a key clash, matching successive dict.update() calls.
    return {
        'nodes': graph.number_of_nodes(),
        'edges': graph.number_of_edges(),
        **function_counts,
        **relation_counts,
    }

In [7]:
# Maps the raw statistic keys returned by get_bel_types() to the column headers
# of the statistics table. Insertion order is the column order of that table.
BEL_STATS_COLUMN_NAMES = {
    # Graph totals
    'nodes': '# Nodes',
    'edges': '# Edges',

    # Node counts by BEL function
    'Protein': '# Proteins',
    'Gene': '# Genes',
    'RNA': '# RNAs',
    'Complex': '# Complexes',
    'Abundance': '# Compounds',
    'BiologicalProcess': '# Biological Processes',
    'Composite': '# Composites',
    'Pathology': '# Pathologies',
    'Reaction': '# Reactions',

    # Edge counts by BEL relation
    'increases': '# Increase Relations',
    'decreases': '# Decrease Relations',
    'association': '# Association Relations',
    'regulates': '# Regulates Relations',
    'directlyIncreases': '# Directly Increase Relations',
    'directlyDecreases': '# Directly Decreases Relations',
    'isA': '# isA Relations',
    'partOf': '# partOf Relations',
    'hasComponent': '# Component Relations',
    'hasVariant': '# Variant Relations',
    'hasReactant': '# Reactants Relations',
    'hasProduct': '# Products Relations',
    'positiveCorrelation': '# Positive Correlation Relations',
    'negativeCorrelation': '# Negative Correlation Relations',
    'causesNoChange': '# CauseNoChange Relations',
    'prognosticBiomarkerFor': '# PrognosticBiomarkerFor Relations',
    'biomarkerFor': '# BiomarkerFor Relations',
}

In [8]:
def get_bel_stats(resource_folder):
    """Get all BEL node and edge type statistics.

    Builds one row per cached BEL file found in ``resource_folder``. Node and
    edge types that do not occur in a given file are reported as 0.

    :param str resource_folder: path to the folder holding the cached
        ``*.bel.nodelink.json`` files
    :return: table indexed by file name (with the ``.bel.nodelink.json`` suffix
        removed), rows sorted by file name, with one integer column per value
        of ``BEL_STATS_COLUMN_NAMES``; empty (but with all columns) if the
        folder contains no cached files
    :rtype: pandas.DataFrame
    :raises KeyError: if a graph contains a node or edge type that has no entry
        in ``BEL_STATS_COLUMN_NAMES``
    """
    suffix = '.bel.nodelink.json'
    column_names = list(BEL_STATS_COLUMN_NAMES.values())

    # Scan the folder given by the caller rather than a module-level default.
    filenames = sorted(
        filename
        for filename in os.listdir(resource_folder)
        if filename.endswith(suffix)
    )

    row_labels = []
    rows = []
    for filename in filenames:
        bel_statistics_dict = get_bel_types(os.path.join(resource_folder, filename))

        # Slice the suffix off. str.strip() treats its argument as a *set of
        # characters* and would also eat leading/trailing letters of the name
        # itself (e.g. 'jones.bel.nodelink.json' -> '').
        row_labels.append(filename[:-len(suffix)])

        # Direct indexing on purpose: an unmapped type should fail loudly so the
        # mapping gets extended, instead of being silently dropped.
        rows.append({
            BEL_STATS_COLUMN_NAMES[key]: value
            for key, value in bel_statistics_dict.items()
        })

    # Build the frame once: appending row by row is quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    stats_df = pd.DataFrame(rows, index=row_labels, columns=column_names)

    # Types missing from a file show up as NaN; report them as a count of 0.
    return stats_df.fillna(0).astype(int)

In [9]:
# Compute the per-file statistics table for the cached BEL graphs in BEL_DIRECTORY
covid19_kg_stats_df = get_bel_stats(BEL_DIRECTORY)

In [10]:
# Preview the first rows of the statistics table
covid19_kg_stats_df.head()

Unnamed: 0,# Nodes,# Edges,# Proteins,# Genes,# RNAs,# Complexes,# Compounds,# Biological Processes,# Composites,# Pathologies,...,# partOf Relations,# Component Relations,# Variant Relations,# Reactants Relations,# Products Relations,# Positive Correlation Relations,# Negative Correlation Relations,# CauseNoChange Relations,# PrognosticBiomarkerFor Relations,# BiomarkerFor Relations
15194747,9,13,5,0,0,3,1,0,0,0,...,6,0,1,0,0,0,0,0,0,0
15294014,19,30,15,0,0,1,0,3,0,0,...,2,0,4,0,0,0,6,0,0,0
15878679,11,32,9,0,0,0,1,0,0,1,...,0,0,5,0,0,16,0,0,0,0
15916886,11,15,5,0,0,0,1,4,0,1,...,0,0,1,0,0,0,0,0,0,0
16014971,14,14,6,0,6,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Export the full table as tab-separated values; the relative path is resolved
# against the current working directory.
covid19_kg_stats_df.to_csv("covid19_kg_stats.tsv", sep='\t')