# Olivia network builder

This notebook requires Olivia, so we make sure to have the installed Olivia dependencies

In [1]:
!pip install -r requirements.txt



In [1]:
import sys
sys.path.append('../../olivia')

from olivia.model import OliviaNetwork

## Graph builder

Auxiliary function to generate a graph from the structure of list of links stored in a CSV file

In [3]:
from time import time
import os
from typing import Any
import psutil
import pandas as pd
import networkx as nx


def add_chunk(
    df, G, dependent_field, dependency_field,
    filter_field=None,
    filter_value=None
):
    """ Utility method for build_dependency_network"""

    filtered = df[df[filter_field] == filter_value] if filter_field else df
    links = list(zip(filtered[dependency_field], filtered[dependent_field]))
    G.add_edges_from(links)
    return G

def build_dependency_network(
    input_file,
    output_file,
    chunk_size,
    dependent_field: str = 'Project Name',
    dependency_field: str = 'Dependency Name',
    filter_field = None,
    filter_value = None,
    verbose: bool = True
) -> None:

    """
    Builds a dependency network from a file with package dependencies information

    Reads from a CSV file and writes to a txt file with adjacency lists
    corresponding to network model. Compression methods are inferred from file
    extension (.gz and .bz2 are supported from NetworkX IO methods)

    Parameters
    ----------
    input_file : str
        Path to csv file with dependencies information
    output_file : str
        Path to write resulting network file
    chunk_size : int
        Amount of lines to be read at once from input_file in batch  processing.
    dependent_field : str
        DataFrame column Id for the dependent package
    dependency_field : str
        Dataframe column Id for the dependency package
    filter_field : str, optional
        If not None, only add records where filter_field equals filter_value
    filter_value : str, optional
        If not None, only add records where filter_field equals filter_value
    verbose: bool, optional
        If True, processing information is written to standard output.
    Returns
    -------
        None
    """

    # Print only if verbose
    vprint = print if verbose else lambda *a, **k: None
    process = psutil.Process(os.getpid())
    vprint("Using process ", process)
    t = time()
    try:
        vprint(f'Opening "{input_file}"... ', end='')
        # Obtain reader iterator
        reader = pd.read_csv(input_file, chunksize=chunk_size)
        vprint('OK')
        vprint('Initializing graph... ', end='')
        # New NetworkX directed Graph
        G = nx.DiGraph()
        vprint('OK')
        for i, chunk in enumerate(reader):
            # Add dependencies from chunk to G
            add_chunk(
                chunk, 
                G,
                dependent_field=dependent_field,
                dependency_field=dependency_field,
                filter_field=filter_field,
                filter_value=filter_value
            )
            vprint(f'{round(i*chunk_size/1e6,1)}M lines | {len(G)} nodes,{len(G.edges)} deps. ({int(time()-t)}s) {round(process.memory_info().rss/1e6,1)}Mb ')
        vprint('Done reading file')
        vprint(f'Saving network as "{output_file}"... ', end='')
        nx.write_adjlist(G, output_file)
        vprint('OK')
    except Exception as e:
        print('\n', e)

## Builds

**Bioconductor**

In [4]:
# Build Bioconductor network with scraping data
build_dependency_network(
    input_file='results/csv_datasets/bioconductor/bioconductor_adjlist_scraping.csv',
    output_file='results/network_models/bioconductor_adjlist_scraping.bz2',
    chunk_size=int(1e6),
    dependent_field="name",
    dependency_field="dependency"
)

# Build olivia model from network
on = OliviaNetwork()
on.build_model('results/network_models/bioconductor_adjlist_scraping.bz2')
on.save('results/olivia_prebuilts/bioconductor.olv')

Using process  psutil.Process(pid=189586, name='python', status='running', started='21:49:00')
Opening "results/csv_datasets/bioconductor/bioconductor_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 3509 nodes,28320 deps. (0s) 112.9Mb 
Done reading file
Saving network as "results/network_models/bioconductor_adjlist_scraping.bz2"... OK
Reading dependencies file...
Building Olivia Model
     Finding strongly connected components (SCCs)...
     Building condensation network...
     Adding structural meta-data...
     Done


**CRAN**

In [5]:
# Build CRAN network with scraping data
build_dependency_network(
    input_file='results/csv_datasets/cran/cran_adjlist_scraping.csv',
    output_file='results/network_models/cran_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

# Build olivia model from network
on = OliviaNetwork()
on.build_model('results/network_models/cran_adjlist_scraping.bz2')
on.save('results/olivia_prebuilts/cran.olv')

Using process  psutil.Process(pid=189586, name='python', status='running', started='21:49:00')
Opening "results/csv_datasets/cran/cran_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 18671 nodes,113273 deps. (0s) 185.7Mb 
Done reading file
Saving network as "results/network_models/cran_adjlist_scraping.bz2"... OK
Reading dependencies file...
Building Olivia Model
     Finding strongly connected components (SCCs)...
     Building condensation network...
     Adding structural meta-data...
     Done


**PyPi**

In [6]:
# Building PyPI network with scraping data
build_dependency_network(
    input_file='results/csv_datasets/pypi/pypi_adjlist_scraping.csv',
    output_file='results/network_models/pypi_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

# Build olivia model from network
on = OliviaNetwork()
on.build_model('results/network_models/pypi_adjlist_scraping.bz2')
on.save('results/olivia_prebuilts/pypi.olv')

Using process  psutil.Process(pid=189586, name='python', status='running', started='21:49:00')
Opening "results/csv_datasets/pypi/pypi_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 214470 nodes,933955 deps. (3s) 776.3Mb 
Done reading file
Saving network as "results/network_models/pypi_adjlist_scraping.bz2"... OK
Reading dependencies file...
Building Olivia Model
     Finding strongly connected components (SCCs)...
     Building condensation network...
     Adding structural meta-data...
     Done


**NPM**

In [7]:
# Building NPM network with scraping data
build_dependency_network(
    input_file='results/csv_datasets/npm/npm_adjlist_scraping.csv',
    output_file='results/network_models/npm_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

# Build olivia model from network
on = OliviaNetwork()
on.build_model('results/network_models/npm_adjlist_scraping.bz2')
on.save('results/olivia_prebuilts/npm.olv')

Using process  psutil.Process(pid=189586, name='python', status='running', started='21:49:00')
Opening "results/csv_datasets/npm/npm_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 245059 nodes,1000000 deps. (5s) 1729.9Mb 
1.0M lines | 482752 nodes,2000000 deps. (10s) 2272.4Mb 
2.0M lines | 698525 nodes,3000000 deps. (15s) 2756.3Mb 
3.0M lines | 888042 nodes,4000000 deps. (19s) 3295.0Mb 
4.0M lines | 1059780 nodes,4855094 deps. (25s) 3675.1Mb 
Done reading file
Saving network as "results/network_models/npm_adjlist_scraping.bz2"... OK
Reading dependencies file...
Building Olivia Model
     Finding strongly connected components (SCCs)...
     Building condensation network...
     Adding structural meta-data...
     Done
