# Network builder

#### Aux

Auxiliary function to generate a graph from the structure of list of links stored in a CSV file

In [2]:
from time import time
import os
from typing import Any
import psutil
import pandas as pd
import networkx as nx


def add_chunk(
    df, G, dependent_field, dependency_field,
    filter_field=None,
    filter_value=None
):
    """ Utility method for build_dependency_network"""

    filtered = df[df[filter_field] == filter_value] if filter_field else df
    links = list(zip(filtered[dependency_field], filtered[dependent_field]))
    G.add_edges_from(links)
    return G

def build_dependency_network(
    input_file,
    output_file,
    chunk_size,
    dependent_field: str = 'Project Name',
    dependency_field: str = 'Dependency Name',
    filter_field = None,
    filter_value = None,
    verbose: bool = True
) -> None:

    """
    Builds a dependency network from a file with package dependencies information

    Reads from a CSV file and writes to a txt file with adjacency lists
    corresponding to network model. Compression methods are inferred from file
    extension (.gz and .bz2 are supported from NetworkX IO methods)

    Parameters
    ----------
    input_file : str
        Path to csv file with dependencies information
    output_file : str
        Path to write resulting network file
    chunk_size : int
        Amount of lines to be read at once from input_file in batch  processing.
    dependent_field : str
        DataFrame column Id for the dependent package
    dependency_field : str
        Dataframe column Id for the dependency package
    filter_field : str, optional
        If not None, only add records where filter_field equals filter_value
    filter_value : str, optional
        If not None, only add records where filter_field equals filter_value
    verbose: bool, optional
        If True, processing information is written to standard output.
    Returns
    -------
        None
    """

    # Print only if verbose
    vprint = print if verbose else lambda *a, **k: None
    process = psutil.Process(os.getpid())
    vprint("Using process ", process)
    t = time()
    try:
        vprint(f'Opening "{input_file}"... ', end='')
        # Obtain reader iterator
        reader = pd.read_csv(input_file, chunksize=chunk_size)
        vprint('OK')
        vprint('Initializing graph... ', end='')
        # New NetworkX directed Graph
        G = nx.DiGraph()
        vprint('OK')
        for i, chunk in enumerate(reader):
            # Add dependencies from chunk to G
            add_chunk(
                chunk, 
                G,
                dependent_field=dependent_field,
                dependency_field=dependency_field,
                filter_field=filter_field,
                filter_value=filter_value
            )
            vprint(f'{round(i*chunk_size/1e6,1)}M lines | {len(G)} nodes,{len(G.edges)} deps. ({int(time()-t)}s) {round(process.memory_info().rss/1e6,1)}Mb ')
        vprint('Done reading file')
        vprint(f'Saving network as "{output_file}"... ', end='')
        nx.write_adjlist(G, output_file)
        vprint('OK')
    except Exception as e:
        print('\n', e)

## Builds

**Bioconductor**

In [4]:
# Build Bioconductor network with scraping data
build_dependency_network(
    input_file='../results/csv_datasets/bioconductor/bioconductor_adjlist_scraping.csv',
    output_file='../results/network_models/bioconductor_adjlist_scraping.bz2',
    chunk_size=int(1e6),
    dependent_field="name",
    dependency_field="dependency"
)

Using process  psutil.Process(pid=71182, name='python', status='running', started='16:36:36')
Opening "../results/csv_datasets/bioconductor/bioconductor_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 3509 nodes,28320 deps. (0s) 122.7Mb 
Done reading file
Saving network as "../results/network_models/bioconductor_adjlist_scraping.bz2"... OK


**CRAN**

In [5]:
# Build CRAN network with scraping data
build_dependency_network(
    input_file='../results/csv_datasets/cran/cran_adjlist_scraping.csv',
    output_file='../results/network_models/cran_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

Using process  psutil.Process(pid=71182, name='python', status='running', started='16:36:36')
Opening "../results/csv_datasets/cran/cran_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 18671 nodes,113273 deps. (0s) 168.8Mb 
Done reading file
Saving network as "../results/network_models/cran_adjlist_scraping.bz2"... OK


In [6]:
# build CRAN network with libraries.io data
build_dependency_network(
    input_file='../results/csv_datasets/cran/cran_adjlist_librariesio_filtered_(imports_depends).csv',
    output_file='../results/network_models/cran_adjlist_librariesio_filtered_(imports_depends).bz2',
    chunk_size=1e6,
    dependent_field="Project Name",
    dependency_field="Dependency Name"
)

Using process  psutil.Process(pid=71182, name='python', status='running', started='16:36:36')
Opening "../results/csv_datasets/cran/cran_adjlist_librariesio_filtered_(imports_depends).csv"... OK
Initializing graph... OK
0.0M lines | 15647 nodes,76207 deps. (0s) 159.4Mb 
Done reading file
Saving network as "../results/network_models/cran_adjlist_librariesio_filtered_(imports_depends).bz2"... OK


**PyPi**

In [7]:
# Building PyPI network with scraping data
build_dependency_network(
    input_file='../results/csv_datasets/pypi/pypi_adjlist_scraping.csv',
    output_file='../results/network_models/pypi_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

Using process  psutil.Process(pid=71182, name='python', status='running', started='16:36:36')
Opening "../results/csv_datasets/pypi/pypi_adjlist_scraping.csv"... OK
Initializing graph... OK
0.0M lines | 214470 nodes,933955 deps. (3s) 666.8Mb 
Done reading file
Saving network as "../results/network_models/pypi_adjlist_scraping.bz2"... OK


In [8]:
# build CRAN network with libraries.io data
build_dependency_network(
    input_file='../results/csv_datasets/pypi/pypi_adjlist_librariesio_filtered.csv',
    output_file='../results/network_models/pypi_adjlist_librariesio_filtered.bz2',
    chunk_size=1e6,
    dependent_field="Project Name",
    dependency_field="Dependency Name"
)

Using process  psutil.Process(pid=71182, name='python', status='running', started='16:36:36')
Opening "../results/csv_datasets/pypi/pypi_adjlist_librariesio_filtered.csv"... OK
Initializing graph... OK
0.0M lines | 49306 nodes,134575 deps. (0s) 253.1Mb 
Done reading file
Saving network as "../results/network_models/pypi_adjlist_librariesio_filtered.bz2"... OK


**NPM**

In [None]:
# Building NPM network with scraping data
build_dependency_network(
    input_file='../results/csv_datasets/npm/npm_adjlist_scraping.csv',
    output_file='../results/network_models/npm_adjlist_scraping.bz2',
    chunk_size=1e6,
    dependent_field="name",
    dependency_field="dependency"
)

In [7]:
# Building NPM network with scraping data
build_dependency_network(
    input_file='../results/csv_datasets/npm/npm_adjlist_librariesio_filtered.csv',
    output_file='../results/network_models/npm_adjlist_librariesio_filtered.bz2',
    chunk_size=1e6,
    dependent_field="Project Name",
    dependency_field="Dependency Name"
)

Using process  psutil.Process(pid=92719, name='python', status='running', started='18:18:06')
Opening "../results/csv_datasets/npm/npm_adjlist_librariesio_filtered.csv"... OK
Initializing graph... OK
0.0M lines | 140404 nodes,998885 deps. (2s) 593.1Mb 
1.0M lines | 252639 nodes,1998725 deps. (5s) 1033.4Mb 
2.0M lines | 353102 nodes,2998630 deps. (8s) 1467.3Mb 
3.0M lines | 445359 nodes,3998390 deps. (11s) 1870.1Mb 
4.0M lines | 537958 nodes,4998329 deps. (15s) 2261.2Mb 
5.0M lines | 669991 nodes,5998191 deps. (19s) 2689.3Mb 
6.0M lines | 747240 nodes,6998046 deps. (21s) 3198.0Mb 
7.0M lines | 820506 nodes,7997938 deps. (26s) 3539.0Mb 
8.0M lines | 896257 nodes,8986097 deps. (28s) 3892.5Mb 
9.0M lines | 971594 nodes,9985862 deps. (31s) 4274.2Mb 
10.0M lines | 1036116 nodes,10985745 deps. (36s) 4642.3Mb 
11.0M lines | 1064572 nodes,11405277 deps. (37s) 4793.5Mb 
Done reading file
Saving network as "../results/network_models/npm_adjlist_librariesio_filtered.bz2"... OK
