# Uni-Mol Molecular Representation

**Licenses**

Copyright (c) DP Technology.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.

**Citations**

Please cite the following paper if you use this notebook:

- Gengmo Zhou, Zhifeng Gao, Qiankun Ding, Hang Zheng, Hongteng Xu, Zhewei Wei, Linfeng Zhang, Guolin Ke. "[Uni-Mol: A Universal 3D Molecular Representation Learning Framework.](https://chemrxiv.org/engage/chemrxiv/article-details/6318b529bada388485bc8361)"
ChemRxiv (2022)

In [None]:
import os
import numpy as np
import pandas as pd
import lmdb
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import pickle
import glob

### Your SMILES list

In [None]:
# Example input molecules, one SMILES string per entry. Replace with your own.
smi_list = [
    'CC1=C(C(=O)OC2CCCC2)[C@H](c2ccccc2OC(C)C)C2=C(O)CC(C)(C)CC2=[N+]1',
    'COc1cccc(-c2nc(C(=O)NC[C@H]3CCCO3)cc3c2[nH]c2ccccc23)c1',
    'O=C1c2ccccc2C(=O)c2c1ccc(C(=O)n1nc3c4c(cccc41)C(=O)c1ccccc1-3)c2[N+](=O)[O-]',
    'COc1cc(/C=N/c2nonc2NC(C)=O)ccc1OC(C)C',
    'CCC[C@@H]1CN(Cc2ccc3nsnc3c2)C[C@H]1NS(C)(=O)=O',
    'CCc1nnc(N/C(O)=C/CCOc2ccc(OC)cc2)s1',
    # Raw string: this SMILES contains a literal backslash, and "\C" is not a
    # valid escape sequence in a normal string literal.
    r'CC(C)(C)SCCN/C=C1\C(=O)NC(=O)N(c2ccc(Br)cc2)C1=O',
    'CC(C)(C)c1nc(COc2ccc3c(c2)CCn2c-3cc(OCC3COCCO3)nc2=O)no1',
    'N#CCCNS(=O)(=O)c1ccc(/C(O)=N/c2ccccc2Oc2ccccc2Cl)cc1',
    'O=C(Nc1ncc(Cl)s1)c1cccc(S(=O)(=O)Nc2ccc(Br)cc2)c1',
]

### Generate conformations from SMILES and save to .lmdb

In [None]:
def smi2coords(smi, seed):
    """Generate one 3D conformer for a SMILES string and pickle the result.

    The molecule is parsed with RDKit, hydrogens are added and a conformer is
    embedded with ``AllChem.EmbedMolecule``. If that first embedding fails
    (return value ``-1``), the molecule is re-parsed, embedded without
    explicit hydrogens using up to 5000 attempts, and hydrogens are then added
    with coordinates. In both cases an MMFF optimization is attempted on a
    best-effort basis.

    Args:
        smi: SMILES string of the molecule.
        seed: Random seed forwarded to ``AllChem.EmbedMolecule``.

    Returns:
        ``bytes``: a pickle (highest protocol) of a dict with the keys
        ``'atoms'`` (list of element symbols, hydrogens included),
        ``'coordinates'`` (a one-element list holding the conformer positions
        as a ``float32`` array) and ``'smi'`` (the input string).

    Raises:
        ValueError: if the SMILES cannot be parsed, or if no conformer could
            be embedded even after the fallback attempt.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit signals a parse failure by returning None instead of raising.
        raise ValueError("failed to parse SMILES: {}".format(smi))
    mol = AllChem.AddHs(mol)
    atoms = [atom.GetSymbol() for atom in mol.GetAtoms()]

    res = AllChem.EmbedMolecule(mol, randomSeed=seed)
    if res == -1:
        # Fallback: embed the hydrogen-free molecule with many more attempts,
        # then add hydrogens and let RDKit place them (addCoords=True).
        mol = Chem.MolFromSmiles(smi)
        res = AllChem.EmbedMolecule(mol, maxAttempts=5000, randomSeed=seed)
        if res == -1:
            raise ValueError(
                "failed to generate 3D coordinates for {}".format(smi)
            )
        mol = AllChem.AddHs(mol, addCoords=True)

    try:
        AllChem.MMFFOptimizeMolecule(mol)
    except Exception:
        # Refinement is best-effort: if it raises, the embedded (unoptimized)
        # geometry is kept. KeyboardInterrupt/SystemExit still propagate.
        pass

    coordinates = mol.GetConformer().GetPositions()
    # Sanity check: one position per atom symbol collected above.
    assert len(atoms) == len(coordinates), "coordinates shape is not align with {}".format(smi)
    coordinate_list = [coordinates.astype(np.float32)]
    return pickle.dumps({'atoms': atoms, 'coordinates': coordinate_list, 'smi': smi}, protocol=-1)

def write_lmdb(smiles_list, job_name, seed=42, outpath='./results'):
    """Write one pickled conformer record per SMILES into an LMDB file.

    The database is created at ``<outpath>/<job_name>.lmdb`` (a single file,
    ``subdir=False``). An existing file at that path is removed first. Each
    record is the output of ``smi2coords`` stored under the ASCII-encoded
    index of the SMILES in ``smiles_list`` (``b"0"``, ``b"1"``, ...).

    Args:
        smiles_list: Iterable of SMILES strings.
        job_name: Base name of the LMDB file (without extension).
        seed: Random seed forwarded to ``smi2coords`` for every molecule.
        outpath: Directory that receives the LMDB file; created if missing.

    Raises:
        Whatever ``smi2coords`` raises for a molecule. In that case the write
        transaction is aborted (nothing is committed) and the environment is
        closed before the exception propagates.
    """
    os.makedirs(outpath, exist_ok=True)
    output_name = os.path.join(outpath, '{}.lmdb'.format(job_name))
    try:
        os.remove(output_name)
    except FileNotFoundError:
        # Nothing to overwrite; any other OS error is a real problem.
        pass
    env_new = lmdb.open(
        output_name,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=1,
        map_size=int(100e9),
    )
    try:
        # The transaction context manager commits on normal exit and aborts
        # if an exception escapes the block.
        with env_new.begin(write=True) as txn_write:
            # Wrapping the list (not the enumerate object) lets tqdm show the
            # total when smiles_list has a length.
            for i, smiles in enumerate(tqdm(smiles_list)):
                inner_output = smi2coords(smiles, seed=seed)
                txn_write.put(f"{i}".encode("ascii"), inner_output)
    finally:
        env_new.close()

In [None]:
# ---- Run configuration (edit these values for your own job) ----

# Input / output locations.
job_name = 'get_mol_repr'  # name of the LMDB file and of the inference subset
data_path = './results'  # directory that receives the generated .lmdb file
results_path = data_path  # directory for the inference outputs and the final .csv
weight_path = '../ckp/mol_pre_no_h_220816.pt'  # pretrained checkpoint to load
dict_name = 'dict.txt'  # dictionary file copied into data_path before inference

# Conformer generation and inference options.
seed = 42  # random seed used when embedding conformers
only_polar = 0  # no h
batch_size = 16

# Build the LMDB input for the inference step from the SMILES list.
write_lmdb(smi_list, job_name, seed, data_path)

### Infer from ckpt

In [None]:
# NOTE: Inference currently supports running on a single GPU only. Select the
# device by setting CUDA_VISIBLE_DEVICES (e.g. CUDA_VISIBLE_DEVICES="0") in
# front of the command, as done below.
# Copy the dictionary file into the data directory passed to infer.py.
!cp ../example_data/molecule/$dict_name $data_path
# Run infer.py on the `$job_name` subset found under `$data_path`, loading the
# checkpoint at `$weight_path`; `--results-path` is set to `$results_path`.
!CUDA_VISIBLE_DEVICES="0" python ../unimol/infer.py --user-dir ../unimol $data_path --valid-subset $job_name \
       --results-path $results_path \
       --num-workers 8 --ddp-backend=c10d --batch-size $batch_size \
       --task unimol --loss unimol_infer --arch unimol_base \
       --path $weight_path \
       --fp16 --fp16-init-scale 4 --fp16-scale-window 256 \
       --only-polar $only_polar --dict-name $dict_name \
       --log-interval 50 --log-format simple --random-token-prob 0 --leave-unmasked-prob 1.0 --mode infer

### Read .pkl and save results to .csv

In [None]:
def get_csv_results(predict_path, results_path):
    """Flatten pickled inference batches into ``mol_repr.csv``.

    The pickle at ``predict_path`` is expected to hold an iterable of batch
    dicts, each with the keys ``"bsz"``, ``"data_name"``, ``"mol_repr_cls"``
    and ``"pair_repr"``. The first ``bsz`` entries of each per-sample field
    are collected, in batch order, into a DataFrame with the columns
    ``SMILES``, ``mol_repr`` and ``pair_repr``. A summary of the frame and its
    first row are printed, and the frame is written without the index to
    ``<results_path>/mol_repr.csv``.

    Args:
        predict_path: Path of the pickle file to read.
        results_path: Directory that receives ``mol_repr.csv``.

    Returns:
        None.
    """
    # NOTE: read_pickle unpickles arbitrary objects; only load files you
    # produced yourself or otherwise trust.
    predict = pd.read_pickle(predict_path)
    smi_list, mol_repr_list, pair_repr_list = [], [], []
    for batch in predict:
        sz = batch["bsz"]
        smi_list.extend(batch["data_name"][i] for i in range(sz))
        mol_repr_list.extend(batch["mol_repr_cls"][i] for i in range(sz))
        pair_repr_list.extend(batch["pair_repr"][i] for i in range(sz))
    predict_df = pd.DataFrame({"SMILES": smi_list, "mol_repr": mol_repr_list, "pair_repr": pair_repr_list})
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print() (that would emit a stray "None").
    predict_df.info()
    print(predict_df.head(1))
    # NOTE(review): array-valued cells are presumably written to the CSV via
    # their string form, which may be abbreviated for large arrays -- confirm
    # that downstream readers can work with that.
    predict_df.to_csv(os.path.join(results_path, 'mol_repr.csv'), index=False)

# Locate the pickle written by the inference step for this job. Sorting makes
# the choice deterministic if several files match the pattern.
pkl_candidates = sorted(glob.glob(f'{results_path}/*_{job_name}.out.pkl'))
if not pkl_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"no inference output matching '*_{job_name}.out.pkl' found in "
        f"{results_path!r}; run the inference cell first"
    )
pkl_path = pkl_candidates[0]
get_csv_results(pkl_path, results_path)