### Studying Vulnerabilities in Package Dependency Networks
Code for building dependency networks from [libraries.io](https://libraries.io) data dump.

The input file _dependencies-1.6.0-2020-01-12.csv_ is not included in this distribution (20 GB uncompressed, 1.5 GB zip-compressed). It is available at https://libraries.io/data as part of _libraries-1.6.0-2020-01-12.tar.gz_.

In [37]:
from time import time
import os
import psutil
import gc
import pandas as pd
import networkx as nx

def add_chunk(df, G,
              dependent_field,
              dependency_field,
              filter_field=None,
              filter_value=None):
    """Add the dependency links found in one DataFrame chunk to graph G.

    Utility method for build_dependency_network.

    Parameters
    ----------
    df : DataFrame
        Chunk of dependency records.
    G : graph
        Graph object exposing ``add_edges_from``; it is modified in place.
    dependent_field : str
        Column holding the dependent package.
    dependency_field : str
        Column holding the dependency package.
    filter_field : str, optional
        If truthy, only rows where this column equals filter_value are used.
    filter_value : optional
        Value compared against the filter_field column.

    Returns
    -------
        The same graph object G that was passed in.
    """
    # Restrict to the requested rows only when a filter column was given
    rows = df[df[filter_field] == filter_value] if filter_field else df
    # Each edge is (dependency, dependent)
    edges = list(zip(rows[dependency_field], rows[dependent_field]))
    G.add_edges_from(edges)
    return G

def build_dependency_network(input_file, 
                             output_file, 
                             chunk_size, 
                             dependent_field='Project Name',
                             dependency_field='Dependency Name',
                             filter_field=None, 
                             filter_value=None,
                             verbose=True):
    """
    Builds a dependency network from a file with package dependencies information
    
    Reads from a CSV file in chunks and writes to a txt file with adjacency
    lists corresponding to the network model. Edges are added as
    (dependency, dependent) pairs to a directed graph. Compression methods are
    inferred from file extension (.gz and .bz2 are supported from NetworkX IO
    methods)
    
    Parameters
    ----------
    input_file : str
        Path to csv file with dependencies information
    output_file : str
        Path to write resulting network file
    chunk_size : int
        Amount of lines to be read at once from input_file in batch processing.
    dependent_field : str
        DataFrame column Id for the dependent package
    dependency_field : str
        Dataframe column Id for the dependency package
    filter_field : str, optional
        If not None, only add records where filter_field equals filter_value
    filter_value : str, optional
        If not None, only add records where filter_field equals filter_value
    verbose: bool, optional
        If True, processing information is written to standard output.
    Returns
    -------
        None

    Notes
    -----
    Any exception raised while reading, building or saving is caught and
    printed to standard output (regardless of ``verbose``); it is not
    re-raised, so the output file may be missing after a failure.
    """
    
    # Print only if verbose
    vprint = print if verbose else lambda *a, **k: None
    # Handle on the current process, used to report resident memory usage
    process = psutil.Process(os.getpid())
    vprint("Using process ", process)
    t = time()
    try:   
        vprint(f'Opening "{input_file}"... ', end='') 
        # Obtain reader iterator
        reader = pd.read_csv(input_file, chunksize=chunk_size)
        vprint('OK')
        vprint('Initializing graph... ', end='')  
        # New NetworkX directed Graph
        G = nx.DiGraph()
        vprint('OK')
        for i, chunk in enumerate(reader):
            # Add dependencies from chunk to G. The column names must be
            # forwarded explicitly: add_chunk has no defaults for them.
            add_chunk(chunk, G,
                      dependent_field,
                      dependency_field,
                      filter_field=filter_field,
                      filter_value=filter_value)
            # i*chunk_size is the line offset at which this chunk started
            vprint(f'{round(i*chunk_size/1e6,1)}M lines | {len(G)} nodes,{len(G.edges)} deps. ({int(time()-t)}s) {round(process.memory_info().rss/1e6,1)}Mb ')
        vprint('Done reading file')
        vprint(f'Saving network as "{output_file}"... ',end='')
        nx.write_adjlist(G,output_file)
        vprint('OK')
    except Exception as e:
        # Best-effort: report the failure instead of propagating it
        print('\n',e)

In [20]:
# Building PyPI dependency network
# Only rows whose 'Platform' column equals 'Pypi' are added to the graph.
build_dependency_network('dependencies-1.6.0-2020-01-12.zip',
                         'pypi-dependencies-net-2020-01-12.bz2',
                         int(1e6),  # chunk_size is documented as an int; 1e6 alone is a float
                         filter_field='Platform',
                         filter_value='Pypi')

PID  psutil.Process(pid=3832, name='python.exe', started='10:36:50')
Opening "dependencies-1.6.0-2020-01-12.zip"... OK
Initializing graph... OK
0.0M lines| 5001 nodes,10661 deps. (2s) 270.6Mb 
1.0M lines| 6255 nodes,14617 deps. (5s) 264.2Mb 
2.0M lines| 6255 nodes,14617 deps. (7s) 258.1Mb 
3.0M lines| 6276 nodes,14677 deps. (10s) 258.2Mb 
4.0M lines| 6276 nodes,14677 deps. (12s) 258.1Mb 
5.0M lines| 6276 nodes,14677 deps. (14s) 258.2Mb 
6.0M lines| 6276 nodes,14677 deps. (16s) 258.2Mb 
7.0M lines| 6279 nodes,14699 deps. (18s) 258.2Mb 
8.0M lines| 6279 nodes,14699 deps. (21s) 258.2Mb 
9.0M lines| 6279 nodes,14699 deps. (23s) 258.2Mb 
10.0M lines| 6279 nodes,14699 deps. (25s) 258.2Mb 
11.0M lines| 6279 nodes,14699 deps. (27s) 258.2Mb 
12.0M lines| 6279 nodes,14699 deps. (29s) 258.2Mb 
13.0M lines| 6279 nodes,14699 deps. (31s) 258.2Mb 
14.0M lines| 6279 nodes,14699 deps. (34s) 258.2Mb 
15.0M lines| 6286 nodes,14719 deps. (36s) 258.2Mb 
16.0M lines| 6287 nodes,14722 deps. (39s) 258.2Mb 
17

153.0M lines| 29556 nodes,92985 deps. (394s) 298.5Mb 
154.0M lines| 29890 nodes,93990 deps. (397s) 294.4Mb 
155.0M lines| 30345 nodes,95439 deps. (399s) 295.8Mb 
156.0M lines| 30727 nodes,96588 deps. (402s) 296.0Mb 
157.0M lines| 31371 nodes,98936 deps. (405s) 298.5Mb 
158.0M lines| 31760 nodes,100086 deps. (407s) 298.0Mb 
159.0M lines| 32156 nodes,101374 deps. (410s) 298.3Mb 
160.0M lines| 32535 nodes,102478 deps. (412s) 300.5Mb 
161.0M lines| 32691 nodes,102953 deps. (415s) 302.4Mb 
162.0M lines| 33168 nodes,104458 deps. (418s) 302.3Mb 
163.0M lines| 33615 nodes,106052 deps. (420s) 303.0Mb 
164.0M lines| 34208 nodes,107535 deps. (423s) 302.3Mb 
165.0M lines| 34717 nodes,108986 deps. (426s) 303.1Mb 
166.0M lines| 35106 nodes,110046 deps. (428s) 304.3Mb 
167.0M lines| 35720 nodes,111849 deps. (431s) 305.9Mb 
168.0M lines| 36040 nodes,112753 deps. (434s) 306.0Mb 
169.0M lines| 36428 nodes,113810 deps. (437s) 307.0Mb 
170.0M lines| 36789 nodes,114986 deps. (439s) 308.5Mb 
171.0M lines| 3

In [38]:
# Building npm dependency network
# Only rows whose 'Platform' column equals 'NPM' are added to the graph;
# the CSV is read one million lines at a time.
build_dependency_network(
    input_file='dependencies-1.6.0-2020-01-12.zip',
    output_file='npm-dependencies-net-2020-01-12.bz2',
    chunk_size=1_000_000,
    filter_field='Platform',
    filter_value='NPM',
)

Using process  psutil.Process(pid=3832, name='python.exe', started='10:36:50')
Opening "dependencies-1.6.0-2020-01-12.zip"... OK
Initializing graph... OK
0.0M lines | 304 nodes,380 deps. (2s) 558.4Mb 
1.0M lines | 304 nodes,380 deps. (4s) 549.8Mb 
2.0M lines | 10312 nodes,34856 deps. (8s) 572.7Mb 
3.0M lines | 27026 nodes,109620 deps. (12s) 610.2Mb 
4.0M lines | 41921 nodes,201253 deps. (16s) 644.6Mb 
5.0M lines | 55510 nodes,288716 deps. (20s) 688.7Mb 
6.0M lines | 67226 nodes,362449 deps. (25s) 719.7Mb 
7.0M lines | 68442 nodes,370846 deps. (27s) 718.8Mb 
8.0M lines | 68442 nodes,370846 deps. (29s) 713.6Mb 
9.0M lines | 69543 nodes,377452 deps. (32s) 719.1Mb 
10.0M lines | 78704 nodes,436549 deps. (36s) 744.0Mb 
11.0M lines | 90358 nodes,518686 deps. (40s) 790.9Mb 
12.0M lines | 100104 nodes,591042 deps. (44s) 820.0Mb 
13.0M lines | 109024 nodes,667876 deps. (49s) 848.8Mb 
14.0M lines | 117488 nodes,739409 deps. (53s) 877.7Mb 
15.0M lines | 124429 nodes,795924 deps. (57s) 900.0Mb 
16

140.0M lines | 755798 nodes,8558775 deps. (619s) 4124.2Mb 
141.0M lines | 757161 nodes,8578371 deps. (624s) 4132.8Mb 
142.0M lines | 761669 nodes,8642537 deps. (628s) 4156.9Mb 
143.0M lines | 764822 nodes,8688972 deps. (633s) 4176.8Mb 
144.0M lines | 769158 nodes,8748810 deps. (637s) 4202.4Mb 
145.0M lines | 774059 nodes,8821817 deps. (642s) 4234.0Mb 
146.0M lines | 779318 nodes,8899026 deps. (647s) 4268.0Mb 
147.0M lines | 786217 nodes,8993064 deps. (652s) 4266.2Mb 
148.0M lines | 787656 nodes,9017204 deps. (657s) 4274.1Mb 
149.0M lines | 792902 nodes,9092023 deps. (661s) 4309.5Mb 
150.0M lines | 798729 nodes,9172280 deps. (666s) 4317.2Mb 
151.0M lines | 806424 nodes,9280898 deps. (671s) 4354.2Mb 
152.0M lines | 813118 nodes,9376856 deps. (676s) 4356.9Mb 
153.0M lines | 819147 nodes,9466294 deps. (680s) 4399.8Mb 
154.0M lines | 825085 nodes,9551759 deps. (685s) 4421.9Mb 
155.0M lines | 829278 nodes,9610491 deps. (690s) 4445.3Mb 
156.0M lines | 834252 nodes,9680963 deps. (695s) 4471.8M