# Compare workspace and cluster packages

This notebook will help check and compare package versions on the workspace and cluster compute environments.
* Useful when mysterious stack traces appear with errors like "unexpected keyword argument" coming from deep inside some package code. These are often caused by a dependency whose version differs between the workspace and the cluster.
* Useful to identify flat-out missing packages in the cluster environment. These errors are usually a bit more obvious in the stack trace, something like "module not found" despite being able to import the package just fine in the workspace.
* When doing extended troubleshooting with multiple environment revisions, it can be very useful to save this output to a file with a good naming scheme (including environment revision numbers or dates) and some commentary about the current attempt.

Note that Ray itself must be installed correctly and be able to connect to the cluster for this notebook to function as intended.
Some package differences between workspace and cluster environments are expected, as most packages are not relevant to the Ray cluster.

In [None]:
import ray
import os
import pandas as pd

# Connect to the Ray cluster only once; re-running this cell is then a no-op.
if not ray.is_initialized():
    # Host and port of the Ray head service are read from the environment;
    # a missing variable raises KeyError.
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [None]:
def dump_packages():
    """Return the packages installed in the current Python environment.

    Returns:
        dict: Maps each distribution name (as declared in its metadata) to
        its version string.
    """
    try:
        # Standard library since Python 3.8; preferred over the deprecated
        # ``pkg_resources`` API, which is not guaranteed to be installed.
        from importlib import metadata as importlib_metadata
    except ImportError:
        # Older interpreters: fall back to the setuptools-provided API.
        import pkg_resources
        return {dist.project_name: dist.version for dist in pkg_resources.working_set}

    pkgs = {}
    for dist in importlib_metadata.distributions():
        meta = dist.metadata
        name = meta.get('Name') if meta is not None else None
        if name is None:
            # Metadata without a name cannot be matched against the other
            # environment, so leave it out of the report.
            continue
        # When the same distribution is found more than once, keep the first
        # one discovered.
        pkgs.setdefault(name, meta.get('Version'))
    return pkgs

@ray.remote
def dump_packages_ray():
    """Ray remote task that returns the result of ``dump_packages()``.

    Called through ``.remote()``, so the package listing is produced by the
    process Ray runs the task in rather than by the calling process.
    """
    remote_pkgs = dump_packages()
    return remote_pkgs

In [None]:
def package_comparison_report(save_file_path=None, save_file_message='no comments'):
    """Compare installed package versions between this process and the Ray cluster.

    Args:
        save_file_path: Optional path of a CSV file to write the report to.
            Nothing is written when this is ``None`` (the default).
        save_file_message: Commentary written at the top of the file. Every
            line of the message is prefixed with ``#`` so the file can be read
            back with ``pd.read_csv(..., comment='#')``.

    Returns:
        pandas.DataFrame: Indexed by package name (sorted), with columns
        ``local_version``, ``cluster_version`` and a categorical ``status``
        that is one of ``'mismatch'``, ``'matching'``, ``'cluster missing'``
        or ``'local missing'``.
    """
    local_pkgs = dump_packages()
    cluster_pkgs = ray.get(dump_packages_ray.remote())

    local_df = pd.DataFrame.from_dict(local_pkgs, orient='index', columns=['local_version'])
    cluster_df = pd.DataFrame.from_dict(cluster_pkgs, orient='index', columns=['cluster_version'])
    # Outer join keeps packages that exist on only one side (version is NaN there).
    all_pkgs = local_df.join(cluster_df, how='outer')

    # Start every row as 'mismatch' and then refine it below.
    all_pkgs['status'] = pd.Categorical(
        ['mismatch'] * len(all_pkgs),
        categories=[
            'mismatch',
            'matching',
            'cluster missing',
            'local missing',
        ],
    )
    # Assign through .loc rather than chained indexing: chained assignment
    # (df['status'][mask] = ...) is not guaranteed to modify the frame and has
    # no effect under pandas Copy-on-Write.
    all_pkgs.loc[all_pkgs['local_version'].isna(), 'status'] = 'local missing'
    all_pkgs.loc[all_pkgs['cluster_version'].isna(), 'status'] = 'cluster missing'
    all_pkgs.loc[all_pkgs['local_version'] == all_pkgs['cluster_version'], 'status'] = 'matching'
    all_pkgs = all_pkgs.sort_index()

    if save_file_path is not None:
        # Prefix every line of the message so a multi-line comment cannot be
        # mistaken for CSV rows when the file is read back.
        comment_lines = save_file_message.splitlines() or ['']
        # newline='' lets the CSV writer control line endings itself.
        with open(save_file_path, 'w', newline='', encoding='utf-8') as f:
            f.writelines('#' + line + '\n' for line in comment_lines)
            all_pkgs.to_csv(f, header=True)
    return all_pkgs

In [None]:
# Run the comparison and save the report, together with a comment line, to a CSV file
pkg_dump_file = '/mnt/troubleshooting/check_packages.csv'
pkg_dump_msg = 'Add comments here'
pkgs = package_comparison_report(
    save_file_path=pkg_dump_file,
    save_file_message=pkg_dump_msg,
)

In [None]:
# The saved csv can be inspected any way you like (directly, with pandas, with shell commands, ...).
# Here it is reloaded with pandas, skipping the leading '#' comment, and filtered to mismatched versions.
pkgs = pd.read_csv(pkg_dump_file, index_col=0, comment='#')
pkgs.loc[pkgs['status'] == 'mismatch']

In [None]:
# Shell out to grep to show only the lines of the saved report that contain "ray"
!grep ray $pkg_dump_file