In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import uuid
from pathlib import Path
import os
import warnings

# Install a catch-all "ignore" filter so library warnings do not flood the
# cell outputs for the rest of the session.
warnings.simplefilter('ignore')

## Load TSV File as a DataFrame

In [2]:
# Dataset name; reused further down for the feather file and the output directory.
project_name = "Testset"

# Column dtypes: every column defaults to pandas' "string" dtype, except a
# column named `Risk`, which is mapped to "int".
# NOTE(review): the column listing printed later in this notebook shows no
# `Risk` column, so that entry appears to be unused for this file.
types = defaultdict(lambda: "string", Risk="int")
df = pd.read_csv("./Testset.addseq.v2.tsv", sep="\t", dtype=types)

# Preview only the first rows instead of rendering the whole table.
df.head()

## Set Up the BioNeMo Framework

In [4]:
# Locate the BioNeMo installation through the BIONEMO_HOME environment variable.
try:
    home_dir = os.environ['BIONEMO_HOME']
except KeyError:
    # Explain what is missing, then let the original KeyError propagate.
    print("Must have BIONEMO_HOME set in the environment! See docs for instructions.")
    raise
BIONEMO_HOME: Path = Path(home_dir).absolute()

# Directory with the ESM-1nv example configuration files; it must exist.
config_path = BIONEMO_HOME.joinpath("examples", "protein", "esm1nv", "conf")
print(f"Using model configuration at: {config_path}")
assert config_path.is_dir()

Using model configuration at: /workspace/bionemo/examples/protein/esm1nv/conf


Load the inference configuration

In [5]:
from bionemo.utils.hydra import load_model_config

# Load the "infer.yaml" configuration, looked up via `config_path`; the
# resulting `cfg` is what the model-loading cell passes to
# `load_model_for_inference`.
cfg = load_model_config(config_name="infer.yaml", config_path=config_path)

Load the model specified in the config

In [6]:
from bionemo.triton.utils import load_model_for_inference
from bionemo.model.protein.esm1nv.infer import ESM1nvInference

# Build the inference object from `cfg`. With `interactive=True` the captured
# log reports "Interactive mode selected, using strategy='auto'".
# NOTE(review): confirm the exact meaning of `interactive` in the BioNeMo docs.
inferer = load_model_for_inference(cfg, interactive=True)

print(f"Loaded a {type(inferer)}")
# Stop here if the configuration produced a different model wrapper than the
# ESM-1nv one the rest of the notebook expects.
assert isinstance(inferer, ESM1nvInference)

INFO:rdkit:Enabling RDKit 2023.09.5 jupyter extensions


[NeMo I 2024-11-07 00:02:45 megatron_hiddens:110] Registered hidden transform sampled_var_cond_gaussian at bionemo.model.core.hiddens_support.SampledVarGaussianHiddenTransform
[NeMo I 2024-11-07 00:02:45 megatron_hiddens:110] Registered hidden transform interp_var_cond_gaussian at bionemo.model.core.hiddens_support.InterpVarGaussianHiddenTransform
[NeMo I 2024-11-07 00:02:45 utils:514] pytorch DDP is not initialized. Initializing with pytorch-lightening...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-11-07 00:02:45 utils:360] Restoring model from /workspace/bionemo/models/protein/esm1nv/esm1nv.nemo
[NeMo I 2024-11-07 00:02:45 utils:364] Loading model class: bionemo.model.protein.esm1nv.esm1nv_model.ESM1nvModel


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Interactive mode selected, using strategy='auto'
[NeMo I 2024-11-07 00:02:45 exp_manager:396] Experiments will be logged at /workspace/bionemo/examples/protein/esm1nv/nbs/nemo_experiments/ESM1nv_Inference/2024-11-07_00-02-45
[NeMo I 2024-11-07 00:02:45 exp_manager:842] TensorboardLogger has been set up
[NeMo I 2024-11-07 00:02:45 utils:333] 
    
    ************** Trainer configuration ***********
[NeMo I 2024-11-07 00:02:45 utils:334] 
    name: ESM1nv_Inference
    desc: Minimum configuration for initializing a ESM1nv model for inference.
    trainer:
      precision: 16-mixed
      devices: 1
      num_nodes: 1
      accelerator: gpu
      logger: false
    exp_manager:
      explicit_log_dir: null
      exp_dir: null
      name: ${name}
      create_checkpoint_callback: false
    model:
      micro_batch_size: ${model.data.batch_size}
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      seq_length: 512
      max_position_embeddings: 512
      encoder_seq

[NeMo W 2024-11-07 00:02:45 save_restore_connector:394] src path does not exist or it is not a path in nemo file. src value I got was: /tokenizers/vocab/protein_sequence_sentencepiece.vocab. Absolute: /tokenizers/vocab/protein_sequence_sentencepiece.vocab
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name

[NeMo I 2024-11-07 00:02:45 megatron_init:251] Rank 0 has data parallel group : [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:257] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:262] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:265] Ranks 0 has data parallel rank: 0
[NeMo I 2024-11-07 00:02:45 megatron_init:282] Rank 0 has context parallel group: [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:285] All context parallel group ranks: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:286] Ranks 0 has context parallel rank: 0




[NeMo I 2024-11-07 00:02:45 megatron_init:297] Rank 0 has model parallel group: [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:298] All model parallel group ranks: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:308] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:312] All tensor model parallel group ranks: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:313] Rank 0 has tensor model parallel rank: 0
[NeMo I 2024-11-07 00:02:45 megatron_init:342] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:354] Rank 0 has embedding group: [0]
[NeMo I 2024-11-07 00:02:45 megatron_init:360] All pipeline model parallel group ranks: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:361] Rank 0 has pipeline model parallel rank 0
[NeMo I 2024-11-07 00:02:45 megatron_init:362] All embedding group ranks: [[0]]
[NeMo I 2024-11-07 00:02:45 megatron_init:363] Rank 0 has embedding rank: 0


24-11-07 00:02:45 - PID:1181 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 1
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 

[NeMo I 2024-11-07 00:02:45 tokenizer_utils:191] Getting SentencePiece with model: /tmp/tmp2opp5leo/f9e5097b22ec4aac8849955c30fdb1c3_protein_sequence_sentencepiece.model
[NeMo I 2024-11-07 00:02:45 megatron_base_model:544] Padded vocab_size: 128, original vocab_size: 30, dummy tokens: 98.


[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-07 00:02:45 megatron_base_model:1109] The model: ESM1nvModel() does not have field.name: gradient_accumulation_fusion in its cfg

[NeMo I 2024-11-07 00:02:45 nlp_overrides:1140] Model ESM1nvModel was successfully restored from /workspace/bionemo/models/protein/esm1nv/esm1nv.nemo.
Loaded a <class 'bionemo.model.protein.esm1nv.infer.ESM1nvInference'>


## Helper Functions
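`find_con` returns the first run of digits found in a string (or `None` if the string contains no digits).

`get_window` returns a window of `size` consecutive elements from a sequence, positioned around the index `idx`.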

In [7]:
import re
def find_con(s):
    """Return the first run of decimal digits found in ``s``.

    Args:
        s: Text to search (for example a protein-change label such as
            ``"R123C"``). Values that are not ``str`` (``None``, ``NaN``,
            ``pd.NA``, ...) are treated as containing no number.

    Returns:
        The first digit run as a ``str``, or ``None`` when ``s`` is not a
        string or contains no digits.
    """
    if not isinstance(s, str):
        return None
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', s)
    return match.group(0) if match else None

In [8]:
def get_window(arr, idx, size):
    """Return up to ``size`` consecutive elements of ``arr`` around ``idx``.

    The window keeps ``size // 2`` elements to the left of ``idx`` when
    ``size`` is odd and ``size // 2 - 1`` when it is even, and is shifted as
    needed so that it never runs past either end of ``arr``.

    Args:
        arr: Any sliceable sequence that supports ``len`` (str, list, array).
        idx: Position the window is placed around.
        size: Requested window length.

    Returns:
        ``arr[start:start + size]``. When ``arr`` is shorter than ``size`` the
        whole sequence is returned.
    """
    # Number of elements kept to the left of `idx`.
    left = size // 2 if size % 2 else size // 2 - 1

    # Clamp the start into [0, len(arr) - size]. The lower bound is applied
    # last: for sequences shorter than `size` the upper bound is negative, and
    # a negative start would be read as "from the end", silently dropping the
    # beginning of the sequence.
    start = max(0, min(idx - left, len(arr) - size))
    return arr[start:start + size]

## Generate a new column called `uuid`

In [9]:
# Tag every row with its own freshly generated random (version 4) UUID.
df['uuid'] = [uuid.uuid4() for _ in range(len(df))]

In [10]:
# Store the identifiers as plain text so the column can be written to disk
# and used directly when building file names.
df = df.astype({'uuid': "str"})

In [11]:
# List the column names to confirm that the `uuid` column is present.
df.columns

Index(['#CHROM', 'POS', 'VariationID', 'REF', 'ALT', 'AlleleID', 'GeneSymbol',
       'GeneID', 'CLNHGVS', 'CLNSIG', 'CLNVC', 'Type', 'Name', 'RS# (dbSNP)',
       'NucleotideExpression', 'NucleotideChange', 'ProteinChange', 'CDS',
       'mCDS', 'Peptide', 'mPeptide', 'Dataset', 'uuid'],
      dtype='object')

## Save the DataFrame as a .feather file

In [12]:
# Persist the table together with its `uuid` column so rows can be matched to
# the per-UUID embedding files written by the inference step.
# NOTE(review): the UUIDs are random, so re-running the UUID cell and then this
# cell overwrites the file with new identifiers; embeddings saved under the old
# identifiers would no longer match any row — confirm this is acceptable.
df.to_feather(f"{project_name}_uuid.feather")

## Finally, load the file and perform inference

In [13]:
# Reload the table saved in the previous step and make sure the per-project
# output directory exists before any embeddings are written to it.
df = pd.read_feather(f"{project_name}_uuid.feather")
Path(f"./{project_name}").mkdir(parents=True, exist_ok=True)

In [23]:
# Window length taken around the variant position. The trainer configuration
# printed when the model was loaded shows `seq_length: 512`.
WINDOW_SIZE = 512
# Number of residues dropped from the end of every window.
# NOTE(review): presumably this leaves room for special tokens — confirm.
TRIMMED_TAIL = 2

output_dir = Path(f"./{project_name}")

for protein_change, peptide, row_id in zip(df["ProteinChange"], df["mPeptide"], df["uuid"]):
    out_path = output_dir / f"{row_id}.npy"

    # Embeddings are cached on disk per UUID; skip rows that are already done
    # before spending any work (or printing anything) on them.
    if out_path.is_file():
        continue

    # A missing sequence cannot be windowed; report it and move on instead of
    # letting an uncaught TypeError abort the whole run.
    if not isinstance(peptide, str) or not peptide:
        print("Missing sequence:", protein_change, row_id)
        continue

    try:
        win = int(find_con(protein_change))
    except (TypeError, ValueError):
        # No position could be parsed from the label: fall back to the start
        # of the sequence and report the row.
        win = 0
        print(protein_change, row_id)

    a = get_window(peptide, win, WINDOW_SIZE)[:-TRIMMED_TAIL]

    try:
        res = inferer.seq_to_embeddings([a])  # Inference
    except Exception as err:
        # Best effort: keep going with the remaining rows, but say what failed.
        print("API Error:", protein_change, row_id, repr(err))
        continue

    # Save as NPY (the path already carries the .npy suffix).
    np.save(out_path, res[0].cpu().numpy())