# Evaluate Python Libraries <a class="tocSkip">

**In this notebook:**

- Install all libraries from the best-of list in separate virtual environments and track install metrics
- Help select and evaluate libraries for installation in ml-workspace
- Check workspace installation

_Please use a Python 3 kernel for the notebook_

## Dependencies

### Install Dependencies

In [None]:
!pip install -q qgrid
!jupyter nbextension enable --py --sys-prefix qgrid
!pip install -q pur
!pip install -q best-of

In [None]:
# Clone the best-of-ml-python repository into the current working directory.
# NOTE(review): the evaluation cell below reads from /workspace/best-of-ml-python,
# so this presumably has to be executed from /workspace - confirm.
!git clone https://github.com/ml-tooling/best-of-ml-python

### Import Dependencies

In [None]:
# System libraries
import logging, os, sys
import re, shutil
import subprocess
import time
import glob, os
import json
from collections import Counter

# Enable logging
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO, stream=sys.stdout)

#logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.DEBUG, stream=sys.stdout)
log = logging.getLogger()

# Re-import packages if they change
%load_ext autoreload
%autoreload 2

# Intialize tqdm to always use the notebook progress bar
from tqdm.autonotebook import tqdm as tqdm_notebook
import tqdm
tqdm.tqdm = tqdm_notebook

# Third-party libraries
import numpy as np
import pandas as pd

log = logging.getLogger()

# TODO: Need to set github and libraries.io API key 
os.environ["LIBRARIES_API_KEY"] = "<API_KEY>"
os.environ["GITHUB_API_KEY"] = "<API_KEY>"

## Utilities

In [None]:
def simplify(text) -> str:
    """Turn arbitrary text into a lowercase, dash-separated identifier.

    Runs of whitespace are collapsed into a single space first. Afterwards every
    character that is not an ASCII letter, a digit or `-` is replaced by `-`.
    """
    collapsed = " ".join(str(text).split())
    dashed = re.sub(r"[^a-zA-Z0-9-]", "-", collapsed.strip())
    return dashed.lower()

def call(command, **kwargs):
    """Run `command` through the shell and return its exit code.

    The command is logged on debug level before it is executed. All keyword
    arguments are forwarded to `subprocess.call`.
    """
    debug_message = "Executing: " + command
    log.debug(debug_message)
    exit_code = subprocess.call(command, shell=True, **kwargs)
    return exit_code

def get_folder_size(folder_path: str):
    """Return the disk usage of `folder_path` as the integer reported by `du -s`."""
    du_output = subprocess.check_output(['du', '-s', folder_path])
    # The first whitespace-separated field of the du output is the size
    size_field = du_output.split()[0]
    return int(size_field.decode('utf-8'))

def get_package_paths(venv_path: str, python_version: str = None) -> list:
    """List the `*.dist-info` folders of the packages installed in a virtual environment.

    Every installed distribution has one `dist-info` folder in site-packages, so the
    length of the returned list corresponds to the number of installed packages.
    The folders of `pip` and `setuptools` are excluded.

    Args:
        venv_path: Root folder of the virtual environment.
        python_version: `<major>.<minor>` version used in the site-packages path
            (e.g. `3.8`). Defaults to the version of the running interpreter.

    Returns:
        List of paths to the `dist-info` folders. Empty if no site-packages
        folder could be found.
    """
    if python_version is None:
        python_version = "%d.%d" % sys.version_info[:2]

    venv_root = venv_path.rstrip("/")
    site_packages_path = None
    # Prefer lib64 (as before), but fall back to lib for venvs without a lib64 folder
    for lib_folder in ("lib64", "lib"):
        candidate = venv_root + "/" + lib_folder + "/python" + python_version + "/site-packages"
        if os.path.isdir(candidate):
            site_packages_path = candidate
            break

    if site_packages_path is None:
        return []

    package_paths = []
    # Escape the folder so that special characters in the path are not treated as a glob pattern
    for file_path in glob.iglob(glob.escape(site_packages_path) + "/*"):
        if not os.path.isdir(file_path):
            continue

        file_name = os.path.basename(file_path)
        if "dist-info" not in file_name:
            # Only count dist-info folders (one per installed package)
            continue

        if file_name.startswith("pip") or file_name.startswith("setuptools"):
            # Installation tooling that is part of every venv
            continue

        package_paths.append(file_path)
    return package_paths

def evaluate_libs(df_projects: pd.DataFrame, eval_root_path: str = "./eval-libs") -> tuple:
    """Install every project in its own virtual environment and record install metrics.

    For each row with a `pypi_id`, a virtual environment is created below
    `<eval_root_path>/venv`, the package is installed via pip, and the environment
    is removed again. The log files `install_logs.txt`, `package_sizes.txt` and
    `pipdeptree.txt` are written to `<eval_root_path>/logs/<simplified pypi_id>`.

    Args:
        df_projects: Projects to evaluate. Requires a `pypi_id` column. An optional
            `pypi_version_spec` column (e.g. `==1.2.3`) selects the version to install.
        eval_root_path: Folder used for the temporary environments and the log files.

    Returns:
        Tuple `(df_projects_evaluated, requirements_counter)`:
        a copy of `df_projects` extended by the columns `install_successful`,
        `install_time` (seconds), `install_total_size` (kilobytes, as reported by `du -s`)
        and `install_requirements` (number of installed packages), and a `Counter` with
        the number of environments in which each package was listed by `pip list --local`.
    """
    # Local import: only required here to quote values that are passed to the shell
    import shlex

    os.makedirs(eval_root_path, exist_ok=True)

    venv_root_path = os.path.join(eval_root_path, "venv")
    os.makedirs(venv_root_path, exist_ok=True)

    logs_root_path = os.path.join(eval_root_path, "logs")
    os.makedirs(logs_root_path, exist_ok=True)

    requirements_counter = Counter()

    df_projects_evaluated = df_projects.copy()
    for i, row in tqdm.tqdm(df_projects_evaluated.iterrows(), total=df_projects_evaluated.shape[0]):
        pypi_id = row["pypi_id"]
        if not isinstance(pypi_id, str) or not pypi_id.strip():
            # No usable pypi id (empty or missing value)
            continue

        print("Evaluating " + pypi_id)

        package_install_dir = os.path.join(venv_root_path, simplify(pypi_id))
        package_log_dir = os.path.join(logs_root_path, simplify(pypi_id))
        os.makedirs(package_log_dir, exist_ok=True)

        # Create virtual environment
        create_venv_cmd = (
            shlex.quote(sys.executable)
            + " -m venv --system-site-packages --symlinks "
            + shlex.quote(package_install_dir)
        )
        # Any non-zero exit code is a failure (negative codes indicate termination by a signal)
        if call(create_venv_cmd) != 0:
            log.warning("Failed to create venv for: " + pypi_id)
            shutil.rmtree(package_install_dir, ignore_errors=True)
            continue

        initial_folder_size = get_folder_size(package_install_dir)
        start = time.time()

        source_venv_cmd = ". " + shlex.quote(os.path.join(package_install_dir, "bin/activate"))

        requirement = pypi_id.strip()
        version_spec = row.get("pypi_version_spec")
        # Missing values are NaN in the dataframe and must not be appended as "nan"
        if pd.notna(version_spec) and str(version_spec).strip():
            # if version spec is provided, try this specific version
            requirement += str(version_spec).strip()
        # Quote the requirement: characters such as > or < would otherwise be
        # interpreted by the shell as redirections
        pip_install_cmd = "pip install --no-use-pep517 " + shlex.quote(requirement)

        # Source and run installation
        with open(os.path.join(package_log_dir, "install_logs.txt"), "w") as install_log_file:
            install_exit_code = call(
                source_venv_cmd + " && " + pip_install_cmd,
                stderr=subprocess.STDOUT,
                stdout=install_log_file,
            )
        install_time = time.time() - start

        if install_exit_code != 0:
            log.warning("Failed to install package: " + pypi_id)
            df_projects_evaluated.at[i, 'install_successful'] = False
            shutil.rmtree(package_install_dir)
            continue

        df_projects_evaluated.at[i, 'install_successful'] = True

        # seconds
        df_projects_evaluated.at[i, 'install_time'] = install_time
        # kilobytes
        df_projects_evaluated.at[i, 'install_total_size'] = get_folder_size(package_install_dir) - initial_folder_size

        # number of installed requirements
        requirements_count = len(get_package_paths(package_install_dir))
        try:
            pip_list_output = subprocess.check_output(
                source_venv_cmd + " && pip list --local --format=json", shell=True
            ).decode('utf-8')
            package_count = 0
            for package in json.loads(pip_list_output):
                if package["name"] not in ["pip", "setuptools"]:
                    requirements_counter[package["name"].lower().strip()] += 1
                    package_count += 1
            if package_count != requirements_count:
                log.warning("Pip list count (" + str(package_count) + ") is different to folder count (" + str(requirements_count) + ").")
                if package_count > requirements_count:
                    # if package count is higher, use package count
                    requirements_count = package_count
        except Exception as ex:
            # Best effort: keep the folder based count if pip list cannot be evaluated
            log.warning("Failed to parse pip-list.", exc_info=ex)

        df_projects_evaluated.at[i, 'install_requirements'] = requirements_count

        # Create logs
        package_size_command = "pip list --local | sed '/Package/d' | sed '/----/d' | sed -r 's/\\S+//2' | xargs pip show | grep -E 'Location:|Name:' | cut -d ' ' -f 2 | paste -d ' ' - - | awk '{print $2 \"/\" $(find $2 -maxdepth 1 -iname $1)}' | xargs du -sh  | sort -rh"
        with open(os.path.join(package_log_dir, "package_sizes.txt"), "w") as package_sizes_file:
            call(source_venv_cmd + " && " + package_size_command, stderr=subprocess.STDOUT, stdout=package_sizes_file)

        with open(os.path.join(package_log_dir, "pipdeptree.txt"), "w") as pipdeptree_file:
            call(source_venv_cmd + " && pip install -q pipdeptree && pipdeptree --local-only", stderr=subprocess.STDOUT, stdout=pipdeptree_file)

        # Remove virtual environment
        shutil.rmtree(package_install_dir)
    shutil.rmtree(venv_root_path)
    return df_projects_evaluated, requirements_counter

## Evaluate Best-of Lists 

In [None]:
EVAL_ROOT_PATH = "/workspace/package-evals"
BEST_OF_LISTS = ["/workspace/best-of-ml-python/history/2020-11-30_projects.csv"]

# Merge all best-of lists into a single collection keyed by the simplified pypi id.
# If a project appears in more than one list, the first occurrence is kept.
pypi_projects = {}
for project_csv_path in BEST_OF_LISTS:
    df_projects = pd.read_csv(project_csv_path, index_col=0, keep_default_na=False)
    for i, row in df_projects.iterrows():
        if "pypi_id" not in row or not row["pypi_id"]:
            # Projects without a pypi id cannot be installed via pip
            continue
        pypi_projects.setdefault(simplify(row["pypi_id"]), row.to_dict())

df_projects = pd.DataFrame(pypi_projects.values())
df_projects.head()

In [None]:
# Evaluate: install every project of the combined best-of list in its own
# virtual environment and collect the install metrics
df_projects_evaluated, req_counter = evaluate_libs(df_projects, EVAL_ROOT_PATH)

### Evaluation Results

In [None]:
import qgrid
# Interactive grid with the install metrics recorded by evaluate_libs
qgrid.show_grid(df_projects_evaluated[['pypi_id', "projectrank", "license", "install_successful", "install_total_size", "install_requirements", "install_time" ]])

### Most Common Dependencies

In [None]:
# The 100 packages that were installed most often across the evaluated projects
req_counter.most_common(100)

## Update Requirement Versions
Creates a new requirements file with updated versions; the original requirements file is not modified.

In [None]:
# Requirements file to read and the location the updated copy is written to
REQUIREMENTS_PATH = '/resources/libraries/requirements-full.txt'
UPDATED_REQUIREMENTS_PATH = "./requirements-full-updated.txt"

# pur (pip-update-requirements) writes the updated requirements to output_file
from pur import update_requirements
update_requirements(input_file=REQUIREMENTS_PATH, 
                    output_file=UPDATED_REQUIREMENTS_PATH, 
                    echo=True,
                    force=True)

# TODO export update messages to log file:
# https://github.com/alanhamlett/pip-update-requirements

## Gather Requirement Information

In [None]:
import requirements
from best_of.integrations.pypi_integration import PypiIntegration
from best_of.integrations import github_integration
from addict import Dict

pypi_integration = PypiIntegration()

# Index the evaluated best-of projects by their simplified pypi id
pypi_projects = {}
for i, row in df_projects_evaluated.iterrows():
    if "pypi_id" in row and row["pypi_id"]:
        pypi_projects[simplify(row["pypi_id"])] = row.to_dict()

requirements_projects = []  # all requirements of the requirements file
untracked_projects = []  # requirements that are not part of the best-of list
projects = set()  # simplified names seen so far (used to detect duplicates)

with open(UPDATED_REQUIREMENTS_PATH, 'r') as myfile:
    for req in tqdm.tqdm(requirements.parse(myfile.read())):
        if not req.name:
            # Requirements without a package name cannot be matched to a pypi project
            log.warning("Skipping requirement without a name: " + str(req.line))
            continue

        version_spec = None
        if req.specs:
            # Multiple specifiers need to be separated by commas, e.g. ">=1.0,<2.0"
            version_spec = ",".join(str(operator) + str(version) for operator, version in req.specs)

        simplified_name = simplify(req.name)
        if simplified_name in projects:
            print("Project " + req.name + " is duplicated.")

        projects.add(simplified_name)

        if simplified_name in pypi_projects:
            # Project is already in the project list
            project_info = pypi_projects[simplified_name]
            # add pypi_spec
            if version_spec:
                project_info["pypi_version_spec"] = version_spec
            requirements_projects.append(project_info)
            # remove project from dict -> we will only have projects that werent added
            del pypi_projects[simplified_name]
            continue

        # Project is not on the best-of list: collect its metadata from PyPI and GitHub
        project = Dict()
        project.pypi_id = req.name
        if version_spec:
            project.pypi_version_spec = version_spec

        pypi_integration.update_project_info(project)
        github_integration.update_via_github(project)

        requirements_projects.append(project.to_dict())
        untracked_projects.append(project.to_dict())

df_requirements = pd.DataFrame(requirements_projects)
df_untracked_projects = pd.DataFrame(untracked_projects)
# Best-of projects that are not part of the requirements file
df_potential_projects = pd.DataFrame(pypi_projects.values())

## Untracked Requirements
Requirements that are listed in the requirements file but are not part of the best-of list.

In [None]:
import qgrid
# Interactive grid of the requirements that were not found in the best-of list
qgrid.show_grid(df_untracked_projects[['pypi_id', 'github_id', "projectrank", "license"]])

## Evaluate Requirement Installs
This also uses the specific version used in the requirements file.

In [None]:
# Separate root folder for the evaluation of the requirements file
EVAL_REQUIREMENTS_ROOT_PATH = "/workspace/requirements-evals"
# Note: this overwrites the req_counter of the best-of list evaluation
df_requirements_evaluated, req_counter = evaluate_libs(df_requirements, EVAL_REQUIREMENTS_ROOT_PATH)

### Evaluation Results

In [None]:
import qgrid
# Interactive grid with the install metrics of all requirements
qgrid.show_grid(df_requirements_evaluated[['name', 'pypi_id', "projectrank", "license", "install_successful", "install_total_size", "install_requirements", "install_time" ]])

### Most Common Dependencies

In [None]:
# The 25 packages that were installed most often across the evaluated requirements
req_counter.most_common(25)

## Potential Requirement Additions

In [None]:
import qgrid
# Interactive grid of the best-of projects that are not part of the requirements file
qgrid.show_grid(df_potential_projects[['pypi_id', 'github_id', "projectrank", "license", "install_successful", "install_total_size", "install_requirements", "install_time" ]])

## Install Requirements File 

In [None]:
# Install the updated requirements file; the output of pip is redirected to ./requirement-installs.txt
!pip install --upgrade -r $UPDATED_REQUIREMENTS_PATH > "./requirement-installs.txt"

## Check Library Sizes

In [None]:
# Resolve the install location of every package from `pip list` via `pip show`
# and print the disk usage per package (du -sh), sorted from largest to smallest
!pip list | sed '/Package/d' | sed '/----/d' | sed -r 's/\S+//2' | xargs pip show | grep -E 'Location:|Name:' | cut -d ' ' -f 2 | paste -d ' ' - - | awk '{print $2 "/" $(find $2 -maxdepth 1 -iname $1)}' | xargs du -sh  | sort -rh

## Check Library Conflicts

In [None]:
# Verify that the installed packages have compatible dependencies
!pip check

## Python Vulnerability Scan

In [None]:
!pip install -q safety
!safety check

## Generate Requirements Markdown