<h1 style="font-size: 80px; color: blue"> 
Olivia Finder - Data manipulation
</h1>


# <span style="color: red">**0 - Previous requirements**</span>

## Setup venv and install requirements

In [22]:
# Uncomment the next line to install the packages listed in requirements.txt.
# %pip install -r requirements.txt

If you use a virtual environment, make sure it is selected as the Jupyter kernel for this notebook.


## Setup library path

Run the following cell so that the **olivia-finder** library is on the Python import path (`sys.path`)

In [2]:
# Make the olivia_finder package importable from this notebook by adding
# its (relative) location to the module search path.
import sys

# Guarded so that re-running this cell does not pile up duplicate
# entries in sys.path.
if '../../olivia_finder/' not in sys.path:
    sys.path.append('../../olivia_finder/')

## Setup configuration

The configuration must be initialized before the library is used. The most convenient way to do this, and one that applies globally, is through an environment variable

In [3]:
# Point the library at its configuration file through the
# OLIVIA_FINDER_CONFIG_FILE_PATH environment variable.
import os

os.environ.update(
    OLIVIA_FINDER_CONFIG_FILE_PATH="../../olivia_finder/olivia_finder/config.ini"
)

# Initialize a package manager object

In [4]:
from olivia_finder.package_manager import PackageManager
from olivia_finder.data_source.csv_ds import CSVDataSource
from olivia_finder.data_source.librariesio_ds import LibrariesioDataSource
from olivia_finder.data_source.repository_scrapers.github import GithubScraper
from olivia_finder.data_source.repository_scrapers.bioconductor import BioconductorScraper
from olivia_finder.data_source.repository_scrapers.cran import CranScraper
from olivia_finder.data_source.repository_scrapers.pypi import PypiScraper
from olivia_finder.data_source.repository_scrapers.npm import NpmScraper

Scraping-based

In [26]:
# Package manager backed by a single data source: the Bioconductor scraper.
bioconductor_pm_scraper = PackageManager(data_sources=[BioconductorScraper()])

In [30]:
# Show a 20-item slice (indices 300-319) of the names returned by
# fetch_package_names(), rather than dumping the whole list.
bioconductor_pm_scraper.fetch_package_names()[300:320]

['Dino',
 'Director',
 'DirichletMultinomial',
 'DiscoRhythm',
 'DominoEffect',
 'Doscheda',
 'DriverNet',
 'DropletUtils',
 'DrugVsDisease',
 'Dune',
 'DynDoc',
 'EBImage',
 'EBSEA',
 'EBSeq',
 'EBSeqHMM',
 'EBarrays',
 'EBcoexpress',
 'EDASeq',
 'EDIRquery',
 'EGAD']

In [None]:
# Count the packages held by the manager before and after initialize(),
# then report both counts.
initial_len = len(bioconductor_pm_scraper.get_packages())
bioconductor_pm_scraper.initialize()
final_len = len(bioconductor_pm_scraper.get_packages())
print(
    f"Initial length: {initial_len}",
    f"Final length: {final_len}",
    sep="\n",
)


Initial length: 0
Final length: 2230


In [32]:
# Look up the "a4" package with get_package() and display it as a plain
# dict (keys in the recorded output: name, version, url, dependencies).
bioconductor_pm_scraper.get_package("a4").to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

CSV file-based

In [5]:
# Package manager backed by a single CSV data source. The *_field arguments
# name the CSV columns that hold the dependent package, its dependency and
# the dependent's version. The path is passed as file_path= to match the
# CSVDataSource call in the multiple-data-source example of this notebook.
cran_pm_csv = PackageManager(
    data_sources=[                  # List of data sources
        CSVDataSource(
            file_path="results/csv_datasets/cran/cran_adjlist_librariesio.csv",
            dependent_field="Project Name",
            dependency_field="Dependency Name",
            dependent_version_field="Version Number",
        )
    ]
)

In [7]:
# Load the packages into the manager, showing a progress bar.
# NOTE: in the recorded run this took about 5.5 minutes for 15522 packages.
cran_pm_csv.initialize(show_progress=True)

Loading packages:   0%|[32m          [0m| 0/15522 [00:00<?, ?packages/s]

Loading packages: 100%|[32m██████████[0m| 15522/15522 [05:29<00:00, 47.09packages/s]


In [None]:
# Look up the "A3" package with get_package() and display it as a plain dict.
# In the recorded output 'url' is None for this CSV-backed manager.
cran_pm_csv.get_package("A3").to_dict()

{'name': 'A3',
 'version': '1.0.0',
 'url': None,
 'dependencies': [{'name': 'R',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'randomForest', 'version': None, 'url': None, 'dependencies': []}]}

Libraries.io API-based

In [10]:
# Package manager backed only by the Libraries.io data source,
# configured for the "maven" platform.
maven_pm_libio = PackageManager(
    data_sources=[LibrariesioDataSource(platform="maven")],
)

In [15]:
# Fetch a single package with fetch_package() and display it as a plain dict.
# The recorded DEBUG log shows the request going through librariesio_ds.py.
maven_pm_libio.fetch_package("org.apache.commons:commons-lang3").to_dict()

[32;20m2023-05-26 00:24:05,682 [olivia_finder.librariesio(DEBUG)] -> librariesio_ds.py:104[0m
Obtaining data of org.apache.commons:commons-lang3
[32;20m2023-05-26 00:24:23,165 [olivia_finder.librariesio(DEBUG)] -> librariesio_ds.py:133[0m
Package org.apache.commons:commons-lang3 found


{'name': 'org.apache.commons:commons-lang3',
 'version': '3.9',
 'url': 'https://repo1.maven.org/maven2/org/apache/commons/commons-lang3',
 'dependencies': [{'name': 'org.openjdk.jmh:jmh-generator-annprocess',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.openjdk.jmh:jmh-core',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.easymock:easymock',
   'version': '5.1.0',
   'url': None,
   'dependencies': []},
  {'name': 'org.hamcrest:hamcrest',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'org.junit-pioneer:junit-pioneer',
   'version': '2.0.1',
   'url': None,
   'dependencies': []},
  {'name': 'org.junit.jupiter:junit-jupiter',
   'version': '5.9.3',
   'url': None,
   'dependencies': []}]}

Multiple data sources-based

In [17]:
# Package manager combining four data sources, listed in this order:
# Bioconductor scraper, a CSV adjacency list, CRAN scraper, Libraries.io (cran).
# NOTE(review): presumably the sources are consulted in list order - confirm.
bioconductor_pm_multiple = PackageManager(data_sources=[
    BioconductorScraper(),
    CSVDataSource(
        file_path="results/csv_datasets/cran/cran_adjlist_scraping.csv",
        dependent_field="name", dependency_field="dependency",
    ),
    CranScraper(),
    LibrariesioDataSource(platform="cran"),
])

In [18]:
# Fetch "a4" through the multi-source manager and display it as a plain dict.
# The recorded result carries the Bioconductor URL for the package.
bioconductor_pm_multiple.fetch_package("a4").to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

In [None]:

# Build the dependency network of "a4" with deep_level=3 and generate=True,
# then display the nodes of the resulting graph.
# NOTE: the recorded run logs 404 responses for names that have no
# Bioconductor page (e.g. methods, graphics, glmnet); those names still
# appear among the nodes.
a4_G = bioconductor_pm_multiple.get_dependency_network(
    "a4",
    deep_level=3,
    generate=True,
)
a4_G.nodes()

Worker 0: Error doing request job: <Response [404]>
Request for methods: https://www.bioconductor.org/packages/release/bioc/html/methods.html failed: response is None
Worker 0: Error doing request job: <Response [404]>
Request for graphics: https://www.bioconductor.org/packages/release/bioc/html/graphics.html failed: response is None
Worker 0: Error doing request job: <Response [404]>
Request for grid: https://www.bioconductor.org/packages/release/bioc/html/grid.html failed: response is None
Worker 0: Error doing request job: <Response [404]>
Request for mpm: https://www.bioconductor.org/packages/release/bioc/html/mpm.html failed: response is None
Worker 0: Error doing request job: <Response [404]>
Request for glmnet: https://www.bioconductor.org/packages/release/bioc/html/glmnet.html failed: response is None
Worker 0: Error doing request job: <Response [404]>
Request for gplots: https://www.bioconductor.org/packages/release/bioc/html/gplots.html failed: response is None
Worker 0: Erro

NodeView(('a4', 'a4Base', 'a4Preproc', 'a4Classif', 'a4Core', 'a4Reporting', 'methods', 'graphics', 'grid', 'Biobase', 'annaffy', 'mpm', 'genefilter', 'limma', 'multtest', 'glmnet', 'gplots', 'BiocGenerics', 'stats', 'R', 'utils', 'BiocManager', 'GO.db', 'AnnotationDbi', 'DBI', 'MatrixGenerics', 'annotate', 'survival', 'grDevices', 'MASS', 'stats4', 'ROCR', 'pamr', 'varSelRF', 'xtable'))