In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset
from datasets import Dataset

In [3]:
import pandas as pd
import datamol as dm
import dask
import dask.distributed as dd
import dask.dataframe as ddf

In [5]:
unichem = pd.read_csv("/storage/shared_data/manu/unichem.tsv.gz", sep="\t", engine="pyarrow")

In [6]:
unichem = unichem.rename(columns=dict(STANDARDINCHI="inchi", STANDARDINCHIKEY="inchikey", UCI="id"))

In [7]:
unichem

Unnamed: 0,id,inchi,inchikey
0,2151003,InChI=1S/C18H33N3O3/c1-12(2)10-16(23)20-17(13(...,GPODNYXNIDSMQH-QGZVFWFLSA-N
1,4440473,InChI=1S/C21H21FN2O2S2/c1-2-27-16-6-3-5-14(13-...,RSWXSSYVOOWIEE-UHFFFAOYSA-N
2,5301143,InChI=1S/C18H15N7OS2/c1-12-21-23-17(28-12)20-1...,XRFZVGUPNZXOCQ-UHFFFAOYSA-N
3,7973252,InChI=1S/C18H26N2O3S/c1-5-8-9-10-18(21)19-15(4...,XYBQNOZXTZLFET-BCNJTZCHSA-N
4,8331947,InChI=1S/C20H18N2O4/c1-14(15-7-3-2-4-8-15)22-2...,IDNXMURNGDUUPN-SWEABUAFSA-M
...,...,...,...
178083701,183684662,InChI=1S/C21H21BrF3N3O6/c1-32-12-15(20(30)31)1...,WVKBHCBDZQCCRW-NTCAYCPXSA-N
178083702,185400182,InChI=1S/C20H36O5/c1-14(2)8-5-9-15(3)10-6-11-1...,DNYNTPITISCIRZ-UHFFFAOYSA-N
178083703,186836143,InChI=1S/C22H34N4O2/c1-17(2)22(18(3)4)20(27)26...,FIOASZRTVBIGDI-UHFFFAOYSA-N
178083704,187186204,"InChI=1S/C22H34O5/c1-21(2,20(25)26)14-7-5-10-1...",MSZHYKOTINDQOI-UHFFFAOYSA-N


In [8]:
unichem["source"] = "unichem"

In [9]:
def as_smiles(inchi_list):        
    with dm.without_rdkit_log():
        if isinstance(inchi_list, str):
            mol = dm.from_inchi(inchi_list)
            return dm.to_smiles(mol)
        mols = [dm.from_inchi(inchi) for inchi in inchi_list]
        return [dm.to_smiles(mol) for mol in mols]

In [10]:
unichem["smiles"] = dm.parallelized_with_batches(as_smiles, unichem.inchi, batch_size=10000, total=unichem.shape[0], n_jobs=64, tqdm_kwargs=dict(leave=True, disable=False))

  0%|          | 0/17808 [00:00<?, ?it/s]

In [12]:
import tqdm
import numpy as np
n_partitions = 64

# Randomize the rows
permuted_indices = np.arange(unichem.shape[0])

# Init partitions
unichem["parquet_partition"] = None

# Make the partitions
for i in tqdm.tqdm(range(n_partitions), total=n_partitions):
    unichem.loc[
        permuted_indices[i::n_partitions], "parquet_partition"
    ] = f"PARTITION_{i:02d}"


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:16<00:00,  3.90it/s]


In [15]:
! pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m31m13.4 MB/s[0m eta [36m0:00:01[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.6.2 fastparquet-2023.7.0


In [35]:
unichem = unichem.drop("inchi", axis=1).drop_duplicates("inchikey").reset_index(drop=True)

In [36]:
unichem.to_parquet("/storage/shared_data/manu/unichem.parquet",  partition_cols="parquet_partition", engine="fastparquet")

In [19]:
drugchem = pd.read_csv("/storage/shared_data/manu/drugspace.smi.tar.gz", engine="pyarrow", sep='\t')

In [21]:
drugchem = drugchem.rename(columns={"SMILES":"smiles", "DE ID":"id"})

In [22]:
drugchem["source"] = "drugspacex"

In [26]:
def as_inchikey(sm_list):        
    with dm.without_rdkit_log():
        if isinstance(sm_list, str):
            return dm.to_inchikey(sm_list)
        return [dm.to_inchikey(sm) for sm in sm_list]
    

In [27]:
drugchem["inchikey"] = dm.parallelized_with_batches(as_inchikey, drugchem.smiles, batch_size=10000, total=drugchem.shape[0], n_jobs=72, tqdm_kwargs=dict(leave=True, disable=False))


  0%|          | 0/10094 [00:00<?, ?it/s]

In [29]:
drugchem = drugchem.drop_duplicates("inchikey")

In [32]:
drugchem =  drugchem.reset_index(drop=True)

In [33]:
import tqdm
import numpy as np
n_partitions = 64

# Randomize the rows
permuted_indices = np.arange(drugchem.shape[0])

# Init partitions
drugchem["parquet_partition"] = None

# Make the partitions
for i in tqdm.tqdm(range(n_partitions), total=n_partitions):
    drugchem.loc[
        permuted_indices[i::n_partitions], "parquet_partition"
    ] = f"PARTITION_{i:02d}"

drugchem.to_parquet("/storage/shared_data/manu/druspacex.parquet",  partition_cols="parquet_partition", engine="fastparquet")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:08<00:00,  7.30it/s]


---

### Apply transformation as SAFE strings to unichem. 

In [38]:
import safe as sf

In [48]:
sf.encode("c1ccccc(C)1", slicer="hr")

'c1ccccc12.C2'