In [3]:
# This notebook was created by Gennady
import os
import json
import awswrangler

from metabolomics.utils.spectrum import Spectra

from numerical_mz.factory import model_factory

In [2]:
# S3 URIs always use '/' as their separator, independent of the host OS, so
# they are assembled with explicit '/' joins instead of os.path.join (which
# would insert backslashes when the notebook is run on Windows).
work_dir = '/'.join(
    ['s3://enveda-data-user', 'chloe.engler', 'cosine_similarity']
)

# Parquet tables that are read as model input.
nist_source_path = work_dir + '/NIST_data/nist_df.parquet'
gnps_source_path = work_dir + '/Wout_data/wout_GNPS_df.parquet'

# Destinations for the same tables once the 'siamese_vector' column is added.
nist_target_path = work_dir + '/NIST_data/nist_df_w_siamese_vecs.parquet'
gnps_target_path = work_dir + '/Wout_data/wout_GNPS_df_w_siamese_vecs.parquet'

In [4]:
# Absolute path of the JSON file holding the model configuration.
config_path = os.path.join(
    '/',
    'efs',
    'gennadyvoronov',
    'spectral-similarity',
    'numerical-mz',
    'model_config',
    'siamese',
    'dataset-7.0.0',
    'transformer.depth6.width512.sin_mz.all.json',
)

# Parse the configuration; the handle is closed as soon as parsing is done.
with open(config_path) as config_file:
    config = json.load(config_file)

# Runtime options handed to model_factory alongside the parsed configuration.
factory_options = dict(
    accelerator='gpu',
    devices=1,
    num_workers=16,
    precision=32,
    prefetch_factor=32,
    batch_size=64,
)

# Build the model, then load its base model before any embedding is computed.
model = model_factory(config, **factory_options)
model.load_base_model()

In [6]:
# Read the NIST table from S3.
nist_df = awswrangler.s3.read_parquet(nist_source_path)

# Build Spectra from the peak columns and evaluate embedding vectors for them.
# NOTE(review): 'intensites' is spelled differently from the 'intensities'
# column used for the GNPS table -- confirm it matches the NIST parquet schema.
peak_columns = ['mz_values', 'intensites']
spectra = Spectra.from_pandas(nist_df, peak_columns=peak_columns)
dataset = model.create_dataset(spectra)
data = model.get_data_loader(dataset)
vectors = model.eval_embd_vectors(data)

# Store every vector as a plain list of Python floats in a new column.
nist_df['siamese_vector'] = [list(map(float, vector)) for vector in vectors]

# Write the augmented table back to S3; binding the result to `_` keeps the
# cell from echoing the call's return value.
_ = awswrangler.s3.to_parquet(df=nist_df, path=nist_target_path)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [8]:
# Read the GNPS table from S3.
gnps_df = awswrangler.s3.read_parquet(gnps_source_path)

# Build Spectra from the peak columns and evaluate embedding vectors for them.
peak_columns = ['mz_values', 'intensities']
spectra = Spectra.from_pandas(gnps_df, peak_columns=peak_columns)
dataset = model.create_dataset(spectra)
data = model.get_data_loader(dataset)
vectors = model.eval_embd_vectors(data)

# Store every vector as a plain list of Python floats in a new column.
gnps_df['siamese_vector'] = [list(map(float, vector)) for vector in vectors]

# Write the augmented table back to S3; binding the result to `_` keeps the
# cell from echoing the call's return value.
_ = awswrangler.s3.to_parquet(df=gnps_df, path=gnps_target_path)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]