A notebook to compare the results of retrieving MotifDB from ms2lda with those loaded from a local dump

In [1]:
import sys, os

# Put the parent directory on the import path, presumably so the local
# pySubstructures checkout can be imported without installing it. This assumes
# the kernel's working directory is this notebook's folder (TODO confirm).
sys.path.append('..')

Retrieving data from ms2lda

In [2]:
class Args:
    """Minimal attribute bag: each keyword argument becomes an instance attribute.

    This notebook uses it to build the ``args`` object that is passed to
    ``acquire_motifsets``.
    """

    def __init__(self, **kwargs):
        # Copy every keyword straight into the instance namespace.
        vars(self).update(kwargs)

# Preferences object handed to `acquire_motifsets`: the GNPS flag is "yes",
# every other built-in motif-set flag is "no", and `user_motif_sets` is None.
args = Args(**{
    "gnps_motif_include": "yes",
    "massbank_motif_include": "no",
    "urine_motif_include": "no",
    "euphorbia_motif_include": "no",
    "rhamnaceae_motif_include": "no",
    "strep_salin_motif_include": "no",
    "photorhabdus_motif_include": "no",
    "user_motif_sets": None,
})

In [3]:
from pySubstructures.motifdb.main import acquire_motifsets

In [4]:
# Retrieve the motif sets selected in `args`; the call yields three collections.
motifdb_spectra, motifdb_metadata, motifdb_features = acquire_motifsets(args)
# Display the size of each collection as a quick sanity check.
tuple(map(len, (motifdb_spectra, motifdb_metadata, motifdb_features)))

(77, 77, 3644)

Second method: loading the motif sets from the local dump

In [5]:
from importlib import resources as impresources
from pySubstructures import resources

# Resolve the `pySubstructures.resources` package to a traversable location via
# importlib.resources, so the dump directory need not be hard-coded here.
RESOURCE_DIR = impresources.files(resources)
# Bare expression: shows the resolved location as the cell output.
RESOURCE_DIR

MultiplexedPath('/Users/joewandy/Work/git/pySubstructures/pySubstructures/resources')

In [6]:
from pySubstructures.motifdb.constants import GNPS_LIBRARY_DERIVED_MASS2MOTIFS
from pySubstructures.motifdb.main import load_db

In [7]:
# Absolute path of the MOTIFDB folder inside the package resources.
db_path = os.path.abspath(RESOURCE_DIR.joinpath('MOTIFDB'))
# Motif-set names to load from the dump; only the GNPS-derived set is listed.
db_list = [GNPS_LIBRARY_DERIVED_MASS2MOTIFS]
db_path, db_list

('/Users/joewandy/Work/git/pySubstructures/pySubstructures/resources/MOTIFDB',
 ['GNPS library derived Mass2Motifs'])

In [8]:
# Load the three collections from the local dump: `load_db` receives the
# motif-set names and the MOTIFDB directory, and reports the files it finds.
loaded_spectra, loaded_metadata, loaded_features = load_db(db_list, db_path)

Looking in /Users/joewandy/Work/git/pySubstructures/pySubstructures/resources/MOTIFDB/GNPS library derived Mass2Motifs/*.m2m
	 Found 77
Found total of 77 motif files


In [9]:
# Sizes of the three locally loaded collections, to eyeball against the online ones.
tuple(map(len, (loaded_spectra, loaded_metadata, loaded_features)))

(77, 77, 3644)

Compare to make sure they're the same

In [10]:
# Cheap first check: each online collection must be the same size as its
# locally loaded counterpart before the contents are compared.
for online, local, failure_message in (
    (motifdb_spectra, loaded_spectra, "Spectra lengths differ"),
    (motifdb_metadata, loaded_metadata, "Metadata lengths differ"),
    (motifdb_features, loaded_features, "Features lengths differ"),
):
    assert len(online) == len(local), failure_message

print("All lengths are identical. Proceeding to content comparison...")

All lengths are identical. Proceeding to content comparison...


In [11]:
# Every motif retrieved online must also exist in the local dump, and its
# spectrum must compare equal to the local one.
for key, value in motifdb_spectra.items():
    assert key in loaded_spectra, f"Key {key} not found in loaded_spectra."
    local_value = loaded_spectra[key]
    assert value == local_value, (
        f"Value mismatch for key {key} between motifdb_spectra and loaded_spectra."
    )

In [12]:
# Sub-keys left out of the metadata comparison.
ignore_keys = ['motifdb_id', 'motifdb_url', 'merged']

# Every motif in the online metadata must exist locally, and each of its
# non-ignored sub-keys must exist locally with an equal value.
# NOTE(review): the check runs one way only; sub-keys that exist solely in
# loaded_metadata are never examined.
for key, value in motifdb_metadata.items():
    assert key in loaded_metadata, f"Key {key} not found in loaded_metadata."
    for sub_key in value:
        if sub_key in ignore_keys:
            continue  # excluded from the comparison on purpose
        assert sub_key in loaded_metadata[key], (
            f"Sub-key {sub_key} not found in loaded_metadata[{key}]."
        )
        assert value[sub_key] == loaded_metadata[key][sub_key], (
            f"Mismatch for sub-key {sub_key} in key {key}."
        )

In [13]:
# Assert motifdb_features and loaded_features are identical.
# This is a single whole-object `==`, so the outcome follows the equality rules
# of whatever container type the features are (e.g. order-sensitive if they are
# lists). The type is not visible in this notebook; TODO confirm.
assert motifdb_features == loaded_features, "Mismatch in motifdb_features and loaded_features."

Checking why some keys are present in the online results but not in the dumped versions.
In particular, these are: ['motifdb_id', 'motifdb_url', 'merged'] (the ignored keys in the assert above).

In [14]:
# Disabled diagnostic cell, kept for reference; uncomment to run.
# For one motif it walks the sub-keys of the online metadata entry and prints,
# per sub-key, whether the local value matches ("Not Present" stands in for a
# sub-key missing on either side).
#
# key_to_compare = 'gnps_motif_0.m2m'
#
# # Ensure the key exists in both metadata dictionaries
# if key_to_compare in motifdb_metadata and key_to_compare in loaded_metadata:
#     # Iterate through all sub-keys in the motifdb_metadata for the specific motif
#     for sub_key in motifdb_metadata[key_to_compare]:
#         motifdb_value = motifdb_metadata[key_to_compare].get(sub_key, "Not Present")
#         loaded_value = loaded_metadata[key_to_compare].get(sub_key, "Not Present")
#         # Check for mismatch and print
#         if motifdb_value != loaded_value:
#             print(f"Mismatch found for sub-key: {sub_key}")
#             print(f"  motifdb_metadata value: {motifdb_value}")
#             print(f"  loaded_metadata value: {loaded_value}")
#         else:
#             print(f"Match found for sub-key: {sub_key}")
#             print(f"  Both values: {motifdb_value}")
# else:
#     print(f"Key '{key_to_compare}' not found in one of the dictionaries.")


In [15]:
# Disabled diagnostic cell, kept for reference; uncomment to run.
# For one motif it takes the union of the sub-keys found in the online and the
# local metadata entries and prints both values side by side ("Not Present"
# stands in for a sub-key missing on either side).
#
# key_to_inspect = 'gnps_motif_38.m2m'
#
# # Checking if the key exists in both dictionaries
# if key_to_inspect in motifdb_metadata and key_to_inspect in loaded_metadata:
#     print(f"Inspecting differences for '{key_to_inspect}'...\n")
#
#     # Extracting sub-keys from both dictionaries for the specific key
#     motifdb_sub_keys = set(motifdb_metadata[key_to_inspect].keys())
#     loaded_sub_keys = set(loaded_metadata[key_to_inspect].keys())
#
#     # Finding the union of sub-keys to ensure no key is missed
#     all_sub_keys = motifdb_sub_keys.union(loaded_sub_keys)
#
#     # Iterating over each sub-key to print values side by side
#     for sub_key in sorted(all_sub_keys):  # Sorted for consistent order
#         motifdb_value = motifdb_metadata[key_to_inspect].get(sub_key, "Not Present")
#         loaded_value = loaded_metadata[key_to_inspect].get(sub_key, "Not Present")
#
#         # Printing values side by side
#         print(f"Sub-key: {sub_key}")
#         print(f"  motifdb_metadata: {motifdb_value}")
#         print(f"  loaded_metadata: {loaded_value}\n")
# else:
#     print(f"Key '{key_to_inspect}' not found in one of the dictionaries.")
