<h1 style="font-size: 80px; color: blue"> 
Olivia Finder - Data manipulation
</h1>


# <span style="color: red">**0 - Previous requirements**</span>

## Setup venv and install requirements

In [1]:
# Install the dependencies listed in requirements.txt; %pip (rather than !pip) targets the running kernel's environment
%pip install -r requirements.txt

Collecting pandas
  Downloading pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 41.2 MB/s eta 0:00:00
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting requests
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.6/62.6 KB 15.6 MB/s eta 0:00:00
Collecting BeautifulSoup4
  Using cached beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
Collecting selenium
  Using cached selenium-4.9.1-py3-none-any.whl (6.6 MB)
Collecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting matplotlib
  Downloading matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.2/9.2 MB 46.3 MB/s eta 0:00:00

If you use a virtual environment, make sure it is selected as the Jupyter kernel for this notebook.


## Setup library path

Run this cell so that the **olivia-finder** library is on the Python import path (`sys.path`).

In [1]:
# Make the olivia_finder package importable from this notebook.
# The path is relative, so it is resolved against the kernel's current
# working directory when imports are performed.
import sys

OLIVIA_FINDER_PATH = '../../olivia_finder/'

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if OLIVIA_FINDER_PATH not in sys.path:
    sys.path.append(OLIVIA_FINDER_PATH)

## Setup configuration

The library's configuration must be initialized. The most convenient way to do this globally is through an environment variable.

In [2]:
# Point olivia_finder at its configuration file through the
# OLIVIA_FINDER_CONFIG_FILE_PATH environment variable.
# NOTE(review): presumably this has to be set before olivia_finder is imported — confirm.
import os

os.environ.update({'OLIVIA_FINDER_CONFIG_FILE_PATH': "../../olivia_finder/config.ini"})

# **Package manager object**

In [3]:
from olivia_finder.package_manager import PackageManager
from olivia_finder.data_source.csv_ds import CSVDataSource
from olivia_finder.data_source.librariesio_ds import LibrariesioDataSource
from olivia_finder.data_source.repository_scrapers.github import GithubScraper
from olivia_finder.data_source.repository_scrapers.bioconductor import BioconductorScraper
from olivia_finder.data_source.repository_scrapers.cran import CranScraper
from olivia_finder.data_source.repository_scrapers.pypi import PypiScraper
from olivia_finder.data_source.repository_scrapers.npm import NpmScraper

## 1 - Scraping based

In [4]:
# Package manager whose single data source is the Bioconductor scraper
bioconductor_sources = [BioconductorScraper()]
bioconductor_pm_scraper = PackageManager(data_sources=bioconductor_sources)

Obtain the list of available package names directly from the scraping data source

In [5]:
# Ask the data source for the package names, then show a 20-name
# window (positions 300-319) instead of dumping the whole list
bioconductor_names = bioconductor_pm_scraper.fetch_package_names()
bioconductor_names[300:320]

['Dino',
 'Director',
 'DirichletMultinomial',
 'DiscoRhythm',
 'DominoEffect',
 'Doscheda',
 'DriverNet',
 'DropletUtils',
 'DrugVsDisease',
 'Dune',
 'DynDoc',
 'EBImage',
 'EBSEA',
 'EBSeq',
 'EBSeqHMM',
 'EBarrays',
 'EBcoexpress',
 'EDASeq',
 'EDIRquery',
 'EGAD']

Run the scraping process, loading the package data into the `PackageManager` structure

In [8]:
# Package count before anything has been loaded
initial_len = len(bioconductor_pm_scraper.get_packages())

# Populate the package manager from its data source, with a progress bar
bioconductor_pm_scraper.initialize(show_progress=True)

# Package count once loading has finished
final_len = len(bioconductor_pm_scraper.get_packages())

# Report both counts, one per line
print(f"Initial length: {initial_len}", f"Final length: {final_len}", sep="\n")


Loading packages: 100%|██████████| 2230/2230 [06:04<00:00,  6.11packages/s]

Initial length: 0
Final length: 2230





Export the package data as a dictionary

In [9]:
# Look up the "a4" package in the package manager and display it as a plain dict
a4_package = bioconductor_pm_scraper.get_package("a4")
a4_package.to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

## 2 - CSV file based

In [10]:
# CSV-backed data source for CRAN. The keyword arguments name the CSV
# columns to read (presumably: dependent package, its dependency, and the
# dependent's version — confirm against CSVDataSource).
# NOTE(review): the file path is relative to the kernel's working directory.
cran_csv_sources = [
    CSVDataSource(
        "results/csv_datasets/cran/cran_adjlist_librariesio.csv",
        dependent_field="Project Name",
        dependency_field="Dependency Name",
        dependent_version_field="Version Number",
    ),
]

# Package manager that reads exclusively from that CSV file
cran_pm_csv = PackageManager(data_sources=cran_csv_sources)

In [11]:
# Load the packages from the CSV data source into the package manager, showing a progress bar
cran_pm_csv.initialize(show_progress=True)

Loading packages: 100%|██████████| 15522/15522 [08:39<00:00, 29.85packages/s]


In [12]:
# Look up the "A3" package in the CSV-backed package manager and display it as a plain dict
a3_package = cran_pm_csv.get_package("A3")
a3_package.to_dict()

{'name': 'A3',
 'version': '1.0.0',
 'url': None,
 'dependencies': [{'name': 'R',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'randomForest', 'version': None, 'url': None, 'dependencies': []}]}

## 3 - Libraries.io API based

In [13]:
# Package manager for Maven whose single data source is Libraries.io
maven_sources = [LibrariesioDataSource(platform="maven")]
maven_pm_libio = PackageManager(data_sources=maven_sources)

In [14]:
# Fetch a single Maven artifact on demand (initialize() was never called
# on this package manager) and display it as a plain dict
commons_lang3 = maven_pm_libio.fetch_package("org.apache.commons:commons-lang3")
commons_lang3.to_dict()

[32;20m2023-06-04 18:46:16,346 [olivia_finder.librariesio(DEBUG)] -> librariesio_ds.py:104[0m
Obtaining data of org.apache.commons:commons-lang3
[32;20m2023-06-04 18:46:17,731 [olivia_finder.librariesio(DEBUG)] -> librariesio_ds.py:133[0m
Package org.apache.commons:commons-lang3 found


{'name': 'org.apache.commons:commons-lang3',
 'version': '3.9',
 'url': 'https://repo1.maven.org/maven2/org/apache/commons/commons-lang3',
 'dependencies': [{'name': 'org.openjdk.jmh:jmh-generator-annprocess',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.openjdk.jmh:jmh-core',
   'version': '1.25.2',
   'url': None,
   'dependencies': []},
  {'name': 'org.easymock:easymock',
   'version': '5.1.0',
   'url': None,
   'dependencies': []},
  {'name': 'org.hamcrest:hamcrest',
   'version': None,
   'url': None,
   'dependencies': []},
  {'name': 'org.junit-pioneer:junit-pioneer',
   'version': '2.0.1',
   'url': None,
   'dependencies': []},
  {'name': 'org.junit.jupiter:junit-jupiter',
   'version': '5.9.3',
   'url': None,
   'dependencies': []}]}

## 4 - Github network based

In [4]:
# Package manager whose single data source is the GitHub scraper
github_sources = [GithubScraper()]
github_pm = PackageManager(data_sources=github_sources)

# Fetch one repository, identified as "owner/repo", and display it as a plain dict
olivia_repo = github_pm.fetch_package("dab0012/olivia-finder")
olivia_repo.to_dict()

{'name': 'dab0012/olivia-finder',
 'version': '',
 'url': 'https://github.com/dab0012/olivia-finder',
 'dependencies': [{'name': 'intbitset',
   'version': '2.4.0',
   'url': None,
   'dependencies': []},
  {'name': 'matplotlib', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'networkx', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'numpy', 'version': '1.18.5', 'url': None, 'dependencies': []},
  {'name': 'pandas', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'beautifulsoup4', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'pybraries', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'requests', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'selenium', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'tqdm', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'typing-extensions',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'singleton-decorator',
   'version': '',

## 5 - Multiple datasource based

In [8]:
# Data sources for the combined package manager, built in the order listed.
# NOTE(review): presumably PackageManager consults them in list order — confirm.
# NOTE(review): the CSV path is relative to the kernel's working directory.
multiple_sources = [
    BioconductorScraper(),
    CSVDataSource(
        file_path="../results/csv_datasets/cran/cran_adjlist_scraping.csv",
        dependent_field="name",
        dependency_field="dependency",
    ),
    CranScraper(),
    LibrariesioDataSource(platform="cran"),
]

# Package manager combining scraper, CSV and Libraries.io data sources
bioconductor_pm_multiple = PackageManager(data_sources=multiple_sources)

In [9]:
# Fetch "a4" through the multi-source package manager and display it as a plain dict
a4_multi = bioconductor_pm_multiple.fetch_package("a4")
a4_multi.to_dict()

{'name': 'a4',
 'version': '1.48.0',
 'url': 'https://www.bioconductor.org/packages/release/bioc/html/a4.html',
 'dependencies': [{'name': 'a4Base',
   'version': '',
   'url': None,
   'dependencies': []},
  {'name': 'a4Preproc', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Classif', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Core', 'version': '', 'url': None, 'dependencies': []},
  {'name': 'a4Reporting', 'version': '', 'url': None, 'dependencies': []}]}

Obtain the transitive dependencies of the package *a4*

In [10]:

# Build the dependency network rooted at "a4".
# NOTE(review): deep_level=3 presumably caps the traversal depth and
# generate=True presumably fetches missing packages on the fly — confirm
# against PackageManager.get_dependency_network.
a4_G = bioconductor_pm_multiple.get_dependency_network(
    "a4",
    deep_level=3,
    generate=True,
)

# Display the nodes (package names) of the resulting graph
a4_G.nodes()

NodeView(('a4', 'a4Base', 'a4Preproc', 'a4Classif', 'a4Core', 'a4Reporting', 'methods', 'graphics', 'grid', 'Biobase', 'annaffy', 'mpm', 'genefilter', 'limma', 'multtest', 'glmnet', 'gplots', 'BiocGenerics', 'stats', 'R', 'utils', 'BiocManager', 'GO.db', 'AnnotationDbi', 'DBI', 'MatrixGenerics', 'annotate', 'survival', 'grDevices', 'MASS', 'stats4', 'ROCR', 'pamr', 'varSelRF', 'xtable'))