# Convert SMILES/HELM strings into images

In [36]:
import os
from rdkit import Chem
from rdkit.Chem import Draw
from joblib import Parallel, delayed
from tqdm import tqdm

def smile_to_image(smile, path, img_size=(250, 250)):
    """Render one SMILES string to an image file.

    Args:
        smile: SMILES string to parse with RDKit.
        path: Destination file path for the rendered image.
        img_size: ``(width, height)`` forwarded to ``Draw.MolToFile`` as
            ``size``.

    Raises:
        ValueError: If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside the drawing code.
        raise ValueError(f'Could not parse SMILES {smile!r} (target image: {path})')
    Draw.MolToFile(mol, path, size=img_size)

def convert_smi_to_img(df, out_dir, img_size=(250, 250)):
    """Render every SMILES in ``df.smi`` to a PNG and return the file paths.

    Images are written to ``<out_dir>/smi_images/<index>.png``, where
    ``<index>`` is the row's label in ``df.index``. Rendering is fanned out
    with joblib using ``os.cpu_count()`` jobs.

    Args:
        df: Object exposing a ``smi`` column/attribute and an ``index``.
        out_dir: Directory under which the ``smi_images`` folder is created.
        img_size: ``(width, height)`` passed through to ``smile_to_image``.

    Returns:
        List of image paths, in the same order as the rows of ``df``.
    """
    smiles = list(df.smi)
    img_paths = [os.path.join(out_dir, f'smi_images/{mol_id}.png') for mol_id in df.index]

    os.makedirs(os.path.join(out_dir, 'smi_images'), exist_ok=True)

    # tqdm's width keyword is `ncols` (an unknown keyword such as `cols` is
    # rejected); `total` is required because zip() has no length.
    progress = tqdm(zip(smiles, img_paths), total=len(smiles), ncols=80)
    Parallel(n_jobs=os.cpu_count())(
        delayed(smile_to_image)(s, p, img_size) for s, p in progress
    )

    return img_paths

# CycPeptMPDB


In [37]:
import pandas as pd


# Flip to True to re-render the images and rebuild the train/test CSVs
# from all.csv; leave False to load the CSVs written by a previous run.
train = False

if not train:
    # Reuse the cached tables; the first CSV column holds the saved index.
    df_cycpdb_all = pd.read_csv('data/CycPeptMPDB/img_all.csv', index_col=0)
    df_cycpdb_train = pd.read_csv('data/CycPeptMPDB/img_train.csv', index_col=0)
    df_cycpdb_test = pd.read_csv('data/CycPeptMPDB/img_test.csv', index_col=0)
else:
    df_cycpdb_all = pd.read_csv('data/CycPeptMPDB/all.csv')
    # Render one image per row and record its path next to the row.
    df_cycpdb_all['smi_img'] = convert_smi_to_img(df_cycpdb_all, 'data/CycPeptMPDB')

    from sklearn.model_selection import train_test_split

    # 80/20 split with a fixed seed so the split is reproducible.
    df_cycpdb_train, df_cycpdb_test = train_test_split(
        df_cycpdb_all,
        test_size=0.2,
        random_state=42,
    )

    # Persist the index so the load branch can restore it with index_col=0.
    df_cycpdb_all.to_csv('data/CycPeptMPDB/img_all.csv', index=True)
    df_cycpdb_train.to_csv('data/CycPeptMPDB/img_train.csv', index=True)
    df_cycpdb_test.to_csv('data/CycPeptMPDB/img_test.csv', index=True)

# Row counts: all, train, test.
print(*(len(frame) for frame in (df_cycpdb_all, df_cycpdb_train, df_cycpdb_test)))


7451 5960 1491
