In [1]:
from transformole.model import *
import pytorch_lightning as pl
from transformole.config import DATA_PATH, LOG_PATH, GEN_PATH
from transformole.utils import *
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Build the pretraining DataLoader from the MOSES training split.
Tokenizer = SmilesTokenizer(load_vocab=True)

# Read inside a context manager so the file handle is closed promptly, and drop
# blank lines (e.g. the empty string produced by a trailing newline) so they
# are not encoded as empty molecules.
with open(f"{DATA_PATH}/moses/train.csv", encoding="utf-8") as train_file:
    train_smiles = [line for line in train_file.read().split("\n") if line.strip()]

# The encoder output is unpacked into TensorDataset, so it is expected to be a
# sequence of equally sized tensors — TODO confirm against SmilesTokenizer.encode.
train_encoded = Tokenizer.encode(
    smiles_list=train_smiles,
    max_length=100,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Each stage gets its own name so that re-running this cell is idempotent;
# `train_data` is the DataLoader consumed by the pretraining Trainer.
train_data = DataLoader(
    dataset=TensorDataset(*train_encoded),
    batch_size=128,
    shuffle=True,
    num_workers=19,
    persistent_workers=True,
)

Vocabulary loaded successfully.


Encoding SMILES: 100%|██████████| 1584664/1584664 [00:16<00:00, 97112.21it/s] 
Padding sequences: 100%|██████████| 1584664/1584664 [00:09<00:00, 168418.72it/s]


In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
# Phase 1: full-parameter pretraining (LoRA disabled).

# TensorBoard logger for this run, rooted at LOG_PATH.
pretrain_logger = TensorBoardLogger(save_dir=LOG_PATH, name="pretrain", version="full_model")

# Base model; every parameter is trained in this phase.
pretrain_model = TransforMole(vocab_size=100, dim_model=256, use_lora=False)

# Single-device trainer using 16-bit mixed precision for 10 epochs.
pretrain_trainer = pl.Trainer(
    logger=pretrain_logger,
    accelerator="auto",
    devices=1,
    precision="16-mixed",
    max_epochs=10,
    enable_progress_bar=True,
)

# No validation dataloader is supplied, so only the training loop runs.
pretrain_trainer.fit(pretrain_model, train_dataloaders=train_data)

Using 16bit Automatic Mixed Precision (AMP)
D:\program\anaconda\envs\pytorch_cuda\Lib\site-packages\pytorch_lightning\plugins\precision\amp.py:52: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
D:\program\anaconda\envs\pytorch_cuda\Lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type               | Params | Mode 
-----------------------------------------------------------
0 | embedding   | Embedding          | 25.6 K | train
1 | pos_encoder | PositionalEncoding | 0      | train
2 | transformer | ModuleList         | 4.7 M  | train
3 | fc_out      | Linear             | 25.7 K | train
4 | loss_fn     | CrossEntropyLoss   | 0      | train
----

Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Build the fine-tuning DataLoader from the GuacaMol training split, using the
# same encode -> TensorDataset -> DataLoader pipeline as the pretraining data.

# Read inside a context manager so the file handle is closed promptly, and drop
# blank lines (e.g. the empty string produced by a trailing newline) so they
# are not encoded as empty molecules.
with open(f"{DATA_PATH}/guacamol/train.csv", encoding="utf-8") as ft_file:
    ft_smiles = [line for line in ft_file.read().split("\n") if line.strip()]

# `persistent_workers` is a DataLoader option, not a tokenizer option, so it is
# passed to the DataLoader below rather than to `encode`.
ft_encoded = Tokenizer.encode(
    smiles_list=ft_smiles,
    max_length=100,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Trainer.fit expects a dataloader, so wrap the encoded tensors before use.
FT_data = DataLoader(
    dataset=TensorDataset(*ft_encoded),
    batch_size=128,
    shuffle=True,
    num_workers=19,
    persistent_workers=True,
)

In [None]:
# Phase 2: LoRA fine-tuning, initialised from the pretrained model's weights.
finetune_model = TransforMole(
    vocab_size=100,
    dim_model=256,
    use_lora=True,
    lora_rank=8,
    lora_alpha=16
)

# strict=False tolerates key mismatches between the two models, but it does so
# silently. Capture the result and report what was skipped so that a failed
# weight transfer is visible instead of going unnoticed.
load_result = finetune_model.load_state_dict(pretrain_model.state_dict(), strict=False)
if load_result.missing_keys:
    # Presumably these are only the newly added LoRA parameters — verify the list.
    print(
        f"Parameters not initialised from the pretrained model "
        f"({len(load_result.missing_keys)}): {load_result.missing_keys}"
    )
if load_result.unexpected_keys:
    # Pretrained tensors with no matching name in the fine-tuning model.
    print(
        f"WARNING: pretrained parameters that were NOT loaded "
        f"({len(load_result.unexpected_keys)}): {load_result.unexpected_keys}"
    )

finetune_logger = TensorBoardLogger(
    save_dir=LOG_PATH,
    name="finetune",
    version="lora_adjusted"
)
finetune_trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    precision="16-mixed",
    max_epochs=5,
    enable_progress_bar=True,
    logger=finetune_logger,
)
finetune_trainer.fit(finetune_model, FT_data)

# Phase 3: molecule generation with the fine-tuned model; results go to GEN_PATH.
finetune_model.generate(
    num_samples=100,
    max_length=90,
    output_dir=GEN_PATH
)