<h1 style="font-size: 80px; color: blue"> 
Olivia Finder - Data manipulation
</h1>


# <span style="color: red">**0 - Previous requirements**</span>

## Setup venv and install requirements

In [None]:
# Install the notebook's dependencies; %pip (rather than !pip) targets the active kernel's environment
%pip install -r requirements.txt

If you use a virtual environment, make sure it is the one selected as the Jupyter kernel for this notebook.


## Setup library path

Run this cell so that the **olivia-finder** library is on the Python import path (`sys.path`).

In [2]:
# Make the olivia_finder package importable from this notebook.
import sys

# Location of the package, relative to the notebook's working directory.
OLIVIA_FINDER_PATH = '../../olivia_finder/'

# Guard the append so that re-running this cell does not stack duplicate
# entries in sys.path.
if OLIVIA_FINDER_PATH not in sys.path:
    sys.path.append(OLIVIA_FINDER_PATH)

## Setup configuration

The configuration must be initialized before the library is used. The most convenient way to do this globally is through an environment variable.

In [3]:
import os

# Register the path of the olivia_finder configuration file under the
# OLIVIA_FINDER_CONFIG_FILE_PATH environment variable.
os.environ.update(
    {'OLIVIA_FINDER_CONFIG_FILE_PATH': "../../olivia_finder/config.ini"}
)

# **Package manager object**

In [4]:
from olivia_finder.package_manager import PackageManager
from olivia_finder.data_source.csv_ds import CSVDataSource
from olivia_finder.data_source.librariesio_ds import LibrariesioDataSource
from olivia_finder.data_source.repository_scrapers.github import GithubScraper
from olivia_finder.data_source.repository_scrapers.bioconductor import BioconductorScraper
from olivia_finder.data_source.repository_scrapers.cran import CranScraper
from olivia_finder.data_source.repository_scrapers.pypi import PypiScraper
from olivia_finder.data_source.repository_scrapers.npm import NpmScraper

## 1 - Scraping based

In [5]:
# Package manager whose only data source is the Bioconductor scraper
bioconductor_pm_scraper = PackageManager(data_sources=[BioconductorScraper()])

Obtain the list of available package names directly from the scraping data source

In [6]:
# Fetch the package names from the data source, then preview entries 300-319
bioconductor_package_names = bioconductor_pm_scraper.fetch_package_names()
bioconductor_package_names[300:320]

['Dino',
 'Director',
 'DirichletMultinomial',
 'DiscoRhythm',
 'DominoEffect',
 'Doscheda',
 'DriverNet',
 'DropletUtils',
 'DrugVsDisease',
 'Dune',
 'DynDoc',
 'EBImage',
 'EBSEA',
 'EBSeq',
 'EBSeqHMM',
 'EBarrays',
 'EBcoexpress',
 'EDASeq',
 'EDIRquery',
 'EGAD']

Perform the scraping process, loading the package data into the PackageManager structure

In [7]:
# Number of packages held by the manager before initialization
initial_len = len(bioconductor_pm_scraper.get_packages())

# Run the loading process (a progress bar is displayed while it works)
bioconductor_pm_scraper.initialize(show_progress=True)

# Number of packages held by the manager once initialization has finished
final_len = len(bioconductor_pm_scraper.get_packages())

# Report both counts, one per line
print(
    f"Initial length: {initial_len}",
    f"Final length: {final_len}",
    sep="\n",
)


Loading packages: 100%|[32m██████████[0m| 2230/2230 [03:55<00:00,  9.47packages/s]

Initial length: 0
Final length: 2230





Export the package data to a dict

In [8]:
# Look up the "a4" package in the manager and display its dict representation
a4_package = bioconductor_pm_scraper.get_package("a4")
a4_package.to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

## 2 - CSV file based

In [9]:
# Package manager whose only data source is a CSV file.
# The CSV path is passed as `file_path=` so this call matches the other
# CSVDataSource construction in this notebook.
# NOTE(review): the *_field arguments presumably name columns of the CSV
# file — confirm against CSVDataSource.
cran_pm_csv = PackageManager(
    data_sources=[                  # List of data sources
        CSVDataSource(
            file_path="aux_data/cran_adjlist_test.csv",
            dependent_field="Project Name",
            dependency_field="Dependency Name",
            dependent_version_field="Version Number",
        )
    ]
)

In [10]:
# Load the packages from the CSV data source into the manager, displaying a progress bar
cran_pm_csv.initialize(show_progress=True)

Loading packages:   0%|[32m          [0m| 0/275 [00:00<?, ?packages/s]

Loading packages: 100%|[32m██████████[0m| 275/275 [00:00<00:00, 418.67packages/s]


In [13]:
# Look up the "nmfem" package in the manager and display its dict representation
nmfem_package = cran_pm_csv.get_package("nmfem")
nmfem_package.to_dict()

{'name': 'nmfem',
 'version': '1.0.4',
 'url': None,
 'dependencies': [{'name': 'rmarkdown',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'testthat', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'knitr', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'tidyr', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'mixtools', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'd3heatmap', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'dplyr', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'plyr', 'version': None, 'url': None, 'dependencies': []},
  {'name': 'R', 'version': None, 'url': None, 'dependencies': []}]}

## 3 - Libraries.io API based

In [14]:
# Package manager whose only data source is Libraries.io, set to the "maven" platform
maven_libio_source = LibrariesioDataSource(platform="maven")
maven_pm_libio = PackageManager(data_sources=[maven_libio_source])

In [15]:
# Fetch a single Maven package through the manager and display its dict representation
commons_lang3_package = maven_pm_libio.fetch_package("org.apache.commons:commons-lang3")
commons_lang3_package.to_dict()

{'name': 'org.apache.commons:commons-lang3',
 'version': '3.9',
 'url': 'https://repo1.maven.org/maven2/org/apache/commons/commons-lang3',
 'dependencies': [{'name': 'org.openjdk.jmh:jmh-generator-annprocess',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.openjdk.jmh:jmh-core',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.easymock:easymock',
   'version': '5.1.0',
   'url': None,
   'dependencies': []},
  {'name': 'org.hamcrest:hamcrest',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'org.junit-pioneer:junit-pioneer',
   'version': '2.0.1',
   'url': None,
   'dependencies': []},
  {'name': 'org.junit.jupiter:junit-jupiter',
   'version': '5.9.3',
   'url': None,
   'dependencies': []}]}

## 4 - GitHub network based

In [16]:
# Package manager whose only data source is the GitHub scraper
github_pm = PackageManager(data_sources=[GithubScraper()])

# Fetch the "dab0012/olivia-finder" repository as a package and display it as a dict
olivia_finder_repo = github_pm.fetch_package("dab0012/olivia-finder")
olivia_finder_repo.to_dict()

{'name': 'dab0012/olivia-finder',
 'version': '',
 'url': 'https://github.com/dab0012/olivia-finder',
 'dependencies': [{'name': 'inveniosoftware-contrib/intbitset',
   'version': '2.4.0',
   'url': None,
   'dependencies': []},
  {'name': 'matplotlib/matplotlib',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'networkx/networkx',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'numpy/numpy',
   'version': '1.18.5',
   'url': None,
   'dependencies': []},
  {'name': 'pandas-dev/pandas',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'getanewsletter/BeautifulSoup4',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'pybraries/pybraries',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'psf/requests', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'scipy/scipy', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'SeleniumHQ/selenium',
   'version': '',
   'url': 

## 5 - Multiple datasource based

In [17]:
# Build each data source separately, in the same order in which they are
# handed to the package manager below.
# NOTE(review): presumably the manager consults the sources in list order —
# confirm against PackageManager.
bioconductor_source = BioconductorScraper()
cran_csv_source = CSVDataSource(
    file_path="aux_data/cran_adjlist_test.csv",
    dependent_field="Project Name",
    dependency_field="Dependency Name",
    dependent_version_field="Version Number",
)
cran_scraper_source = CranScraper()
cran_libio_source = LibrariesioDataSource(platform="cran")

# Package manager combining all four data sources
bioconductor_pm_multiple = PackageManager(
    data_sources=[
        bioconductor_source,
        cran_csv_source,
        cran_scraper_source,
        cran_libio_source,
    ]
)

In [18]:
# Fetch "a4" through the multi-source manager and display its dict representation
a4_from_multiple = bioconductor_pm_multiple.fetch_package("a4")
a4_from_multiple.to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

Obtain the transitive dependencies of the package *a4*

In [22]:

# Build the transitive dependency graph for "a4" and list its nodes.
# NOTE(review): deep_level=3 presumably caps how many dependency levels are
# followed, and generate=True presumably forces the graph to be built now —
# confirm both against PackageManager.get_transitive_network_graph.
a4_G = bioconductor_pm_multiple.get_transitive_network_graph(
    "a4",
    deep_level=3,
    generate=True,
)
a4_G.nodes()

NodeView(('a4Base', 'a4', 'a4Preproc', 'a4Classif', 'a4Core', 'a4Reporting', 'methods', 'graphics', 'grid', 'Biobase', 'annaffy', 'mpm', 'genefilter', 'limma', 'multtest', 'glmnet', 'gplots', 'BiocGenerics', 'stats', 'R', 'utils', 'BiocManager', 'GO.db', 'AnnotationDbi', 'DBI', 'MatrixGenerics', 'annotate', 'survival', 'grDevices', 'MASS', 'stats4', 'ROCR', 'pamr', 'varSelRF', 'xtable'))