### Studying Vulnerabilities in Package Dependency Networks
Code for building dependency networks from [libraries.io](https://libraries.io) data dump.

Input file _dependencies-1.6.0-2020-01-12.csv_ not included in this distribution (20 GB, 1.5 GB zip-compressed). Available at https://libraries.io/data as part of _libraries-1.6.0-2020-01-12.tar.gz_

In [5]:
from time import time
import os
import psutil
import gc
import pandas as pd
import networkx as nx

def add_chunk(df, G,
              dependent_field,
              dependency_field,
              filter_field=None,
              filter_value=None):
    
    """ Utility method for build_dependency_network
    """
    if filter_field:
        filtered = df[df[filter_field]==filter_value]
    else:
        filtered = df      
    links = [(dependency, dependent) for (dependency, dependent) 
             in zip(filtered[dependency_field], filtered[dependent_field])]
    G.add_edges_from(links)  
    return G

def build_dependency_network(input_file, 
                             output_file, 
                             chunk_size, 
                             dependent_field,
                             dependency_field,
                             filter_field=None, 
                             filter_value=None,
                             verbose=True):
    """
    Builds a dependency network from a file with package dependencies information
    
    Reads from a CSV file and writes to a txt file with adjacency lists
    corresponding to network model. Compression methods are inferred from file
    extension (.gz and .bz2 are supported from NetworkX IO methods)
    
    Parameters
    ----------
    input_file : str
        Path to csv file with dependencies information
    output_file : str
        Path to write resulting network file
    chunk_size : int
        Amount of lines to be read at once from input_file in batch  processing.
    dependent_field : str
        DataFrame column Id for the dependent package
    dependency_field : str
        Dataframe column Id for the dependency package
    filter_field : str, optional
        If not None, only add records where filter_field equals filter_value
    filter_value : str, optional
        If not None, only add records where filter_field equals filter_value
    verbose: bool, optional
        If True, processing information is written to standard output.
    Returns
    -------
        None
    """
    
    # Print only if verbose
    vprint = print if verbose else lambda *a, **k: None
    process = psutil.Process(os.getpid())
    vprint("Using process ", process)
    t = time()
    try:   
        vprint(f'Opening "{input_file}"... ', end='') 
        # Obtain reader iterator
        reader = pd.read_csv(input_file, chunksize=chunk_size)
        vprint('OK')
        vprint('Initializing graph... ', end='')  
        # New NetworkX directed Graph
        G = nx.DiGraph()
        vprint('OK')
        for i,chunk in enumerate(reader):     
            # Add dependencies from chunk to G
            add_chunk(
                chunk, G,
                dependent_field=dependent_field,
                dependency_field=dependency_field,
                filter_field=filter_field, filter_value=filter_value
            )
            vprint(f'{round(i*chunk_size/1e6,1)}M lines | {len(G)} nodes,{len(G.edges)} deps. ({int(time()-t)}s) {round(process.memory_info().rss/1e6,1)}Mb ')
        vprint('Done reading file')
        vprint(f'Saving network as "{output_file}"... ',end='')
        nx.write_adjlist(G,output_file)
        vprint('OK')
    except Exception as e:
        print('\n',e)

In [7]:
CRAN_dataset = '../persistence/cran_dependencies.csv'

# Building CRAN dependency network
build_dependency_network(
    CRAN_dataset,                    # Input file
    'networks/cran_nrework.bz2',     # Output file
    1e6,                             # Chunk size                    
    dependent_field='dependency',    # Dependency field
    dependency_field='name'          # Dependent field
)

Using process  psutil.Process(pid=4492, name='python3', status='running', started='15:45:50')
Opening "../persistence/cran_dependencies.csv"... OK
Initializing graph... OK
0.0M lines | 18671 nodes,113273 deps. (0s) 154.7Mb 
Done reading file
Saving network as "networks/cran_nrework.bz2"... OK


In [10]:
bioconductor_dataset = '../persistence/bioconductor_dependencies.csv'

# Building bioconductor dependency network
build_dependency_network(
    bioconductor_dataset,                   # Input file
    'networks/bioconductor_nrework.bz2',    # Output file
    1e6,                                    # Chunk size
    dependent_field='dependency',           # Dependency field
    dependency_field='name'                 # Dependent field
)

Using process  psutil.Process(pid=4492, name='python3', status='running', started='15:45:50')
Opening "../persistence/bioconductor_dependencies.csv"... OK
Initializing graph... OK
0.0M lines | 3444 nodes,27627 deps. (0s) 401.9Mb 
Done reading file
Saving network as "networks/bioconductor_nrework.bz2"... OK


In [9]:
pypi_dataset = '../persistence/pypi_dependencies.csv'

# Building PyPI dependency network
build_dependency_network(
    pypi_dataset,                   # Input file
    'networks/pypi_nrework.bz2',    # Output file
    1e6,                            # Chunk size
    dependent_field='dependency',   # Dependency field
    dependency_field='name'         # Dependent field
)

Using process  psutil.Process(pid=4492, name='python3', status='running', started='15:45:50')
Opening "../persistence/pypi_dependencies.csv"... OK
Initializing graph... OK
0.0M lines | 209356 nodes,928400 deps. (3s) 435.3Mb 
1.0M lines | 220292 nodes,987442 deps. (3s) 391.6Mb 
Done reading file
Saving network as "networks/pypi_nrework.bz2"... OK
