# Developing the multiview dataset, datamodule, model and loss

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path

import pandas as pd
import seaborn as sns
import torch
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf, open_dict

from src.coati.models.io import load_e3gnn_smiles_clip_e2e
from src.modules.collate_fn import default_collate
from src.modules.molecules.coati import COATI

In [None]:
# Mount the remote cpjump1..3 project shares over sshfs unless they are
# already available locally.
for i in range(1, 4):
    # The jump/ sub-directory is used as the marker that the share is mounted.
    if Path(f"../cpjump{i}/jump/").exists():
        print(f"cpjump{i} already mounted.")
        continue

    print(f"Mounting cpjump{i}...")
    # os.system returns the command's exit status; check it so that a failed
    # mount is reported here instead of surfacing later as missing data.
    status = os.system(f"sshfs bioclust:/projects/cpjump{i}/ ../cpjump{i}")
    if status != 0:
        print(f"Failed to mount cpjump{i} (sshfs exit status {status}).")

Mounting cpjump1...
Mounting cpjump2...
Mounting cpjump3...


## Dataset

## Datamodule

## Module

## Loss

## Check configs

In [None]:
# Uncomment to reset Hydra's global state, e.g. before calling initialize()
# again in the same kernel.
# GlobalHydra.instance().clear()

In [None]:
# Hydra keeps a process-wide singleton; initializing it twice in the same
# kernel fails, so drop any previous state first to keep this cell re-runnable.
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

# Point Hydra at the project's config directory (relative to this notebook).
initialize(version_base=None, config_path="../configs")

hydra.initialize()

In [None]:
# Command-line style overrides applied on top of train.yaml for this session.
hydra_overrides = [
    "evaluate=true",
    "eval=hint",
    "paths.projects_dir=..",
    "paths.output_dir=./tmp/21312FS12A",
    "experiment=coati/small",
    "data.batch_size=32",
    # "model/molecule_encoder=gin_masking.yaml",
    "trainer.devices=1",
    # "eval.moa_image_task.datamodule.data_root_dir=../",
]
cfg = compose(config_name="train.yaml", overrides=hydra_overrides)

# Dump the composed configuration as YAML so it can be inspected below.
print(OmegaConf.to_yaml(cfg))

task_name: train
tags:
- small_jump_cl
- nlp_coati
- clip_like
- ${model.molecule_encoder.pretrained_name}
- ${model.image_encoder.instance_model_name}
train: true
test: true
evaluate: true
compile: false
ckpt_path: null
seed: 12345
data:
  compound_transform:
    _target_: src.modules.compound_transforms.coati.COATITransform
    compound_str_type: inchi
  _target_: src.models.jump_cl.datamodule.BasicJUMPDataModule
  batch_size: 32
  num_workers: 24
  pin_memory: null
  prefetch_factor: 3
  drop_last: true
  transform:
    _target_: src.modules.transforms.DefaultJUMPTransform
    _convert_: object
    size: 128
    dim:
    - -2
    - -1
  force_split: false
  splitter:
    _target_: src.splitters.ScaffoldSplitter
    train: 1024
    test: 256
    val: 128
    retrieval: 0
  use_compond_cache: false
  data_root_dir: ${paths.projects_dir}/
  split_path: ${paths.split_path}/fp_small3/
  dataloader_config:
    train:
      batch_size: ${data.batch_size}
      num_workers: ${data.num_worke

In [None]:
# Build the datamodule from the `data` section of the composed config
# (its `_target_` is shown in the YAML printed above).
dm = instantiate(cfg.data)



In [None]:
# Run the datamodule's preparation step, then set it up for the "fit" stage.
dm.prepare_data()
dm.setup("fit")

In [None]:
# Get the training dataloader from the datamodule.
dl = dm.train_dataloader()



In [None]:
# Pull the first batch yielded by the training dataloader for inspection.
b = next(iter(dl))

In [None]:
# Build the model from the `model` section of the composed config.
model = instantiate(cfg.model)

Loading model from s3://terray-public/models/grande_closed.pkl
Loading tokenizer may_closedparen from s3://terray-public/models/grande_closed.pkl
number of parameters: 12.64M
number of parameters Total: 2.44M xformer: 17.92M Total: 20.36M 
vocab_name not found in tokenizer_vocabs, trying to load from file


In [None]:
# Display the model's repr (last expression of the cell) to inspect its layers.
model

BasicJUMPModule(
  (image_encoder): CNNEncoder(
    (backbone): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
      