In [0]:
%pip install rdkit ipywidgets
%restart_python

In [0]:
%sh
# https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#zinc15-datasets
# Download the ZINC15 250K 2D archive and unpack it into ./data.
set -e            # stop at the first failing command instead of carrying on
mkdir -p data     # the directory may not exist on a fresh checkout
cd data
# Skip the download when the extracted CSV is already present, so re-running this cell is cheap.
if [ ! -f zinc15_250K_2D.csv ]; then
    wget -nc https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/zinc15_250K_2D.tar.gz
    tar -xf zinc15_250K_2D.tar.gz
    # The working directory is already data/, so the archive is addressed by its bare name;
    # prefixing it with data/ would point at data/data/... instead.
    rm -f zinc15_250K_2D.tar.gz
fi

In [0]:
import os  # imported here: this cell runs before the notebook's main import cell

# Absolute path of the extracted CSV; used below to build the file: URI handed to Spark.
abs_data_path = os.path.abspath("data/zinc15_250K_2D.csv")
abs_data_path

In [0]:
# Read the local CSV with a header row and inferred column types, then preview ten rows.
# abs_data_path is absolute (it already starts with "/"), so only the scheme is prefixed:
# "file:" + "/x/y.csv" -> "file:/x/y.csv". Adding one more "/" would give "file://x/y.csv",
# where "x" sits in the URI authority (host) position instead of the path.
df = (spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(f"file:{abs_data_path}"))
display(df.limit(10))

In [0]:
# Persist the loaded CSV contents as a Delta table, overwriting any existing data.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("healthcare_lifesciences.qsar.zinc15_250K")
)

In [0]:
# Number of rows in the loaded DataFrame.
df.count()

In [0]:
import pandas as pd
import numpy as np
import rdkit
from rdkit.Chem import Descriptors, MolFromSmiles, AllChem, DataStructs
from rdkit.Chem.rdchem import Mol
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, FloatType
from typing import Dict, Iterator, List, Optional
import re

## Compute ECFP

In [0]:
# Test for a single smiles
fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=1024)

def smiles_to_ecfp(smiles: str, fpgen: rdkit.Chem.rdFingerprintGenerator.FingerprintGenerator64) -> Optional[np.ndarray]:
    """Compute the fingerprint of one molecule given as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        fpgen: Fingerprint generator whose ``GetFingerprintAsNumPy`` is applied
            to the parsed molecule.

    Returns:
        The NumPy array returned by ``fpgen.GetFingerprintAsNumPy``, or ``None``
        when ``smiles`` is not a string (e.g. a null value) or cannot be parsed.
    """
    # Kept local so the function carries its own RDKit dependency wherever it is executed.
    from rdkit.Chem import MolFromSmiles
    if not isinstance(smiles, str):
        return None
    mol = MolFromSmiles(smiles)
    # RDKit signals an unparsable SMILES by returning None rather than raising;
    # passing that None on to the generator would fail, so report "no fingerprint".
    if mol is None:
        return None
    return fpgen.GetFingerprintAsNumPy(mol)

smiles_to_ecfp("C1=Cc2ccccc2NN=C1", fpgen)

In [0]:
@pandas_udf(ArrayType(FloatType()))
def udf_smiles_to_ecfp(smiles: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Pandas UDF that maps batches of SMILES strings to fingerprints.

    Args:
        smiles: Iterator over pandas Series, each holding one batch of SMILES strings.

    Yields:
        One pandas Series per input batch, holding the result of
        ``smiles_to_ecfp`` for every element of that batch, in order.
    """
    # A single Morgan generator (radius 2, 1024 bits) is built once and reused for all batches.
    morgan_gen = AllChem.GetMorganGenerator(radius=2, fpSize=1024)
    for smiles_batch in smiles:
        yield pd.Series([smiles_to_ecfp(smi, morgan_gen) for smi in smiles_batch])

## Compute RDKit descriptors

In [0]:
# Two descriptions of the same layout, one field per entry of Descriptors.descList:
# `schema` is a StructType of nullable double fields, while `schema_string` is the
# DDL text ("<name> float, ...") used as the return type of the descriptor pandas UDF.
schema = StructType(
    [StructField(desc_name, DoubleType(), True) for desc_name, _fn in Descriptors.descList]
)
schema_string = ', '.join(f"{desc_name} float" for desc_name, _fn in Descriptors.descList)

def smiles_to_desc(smiles: str, desc: Optional[List[str]] = None):
    """Compute RDKit descriptors for one molecule given as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        desc: Optional list of descriptor names, as they appear in
            ``Descriptors.descList``. When ``None`` or empty, every descriptor
            is computed.

    Returns:
        A dict mapping descriptor name to its value. When ``smiles`` is not a
        string or cannot be parsed, every requested name maps to ``None`` so the
        caller still receives a row with the full set of keys.

    Raises:
        ValueError: If ``desc`` contains a name that is not in ``Descriptors.descList``.
    """
    from rdkit.Chem import Descriptors, MolFromSmiles

    # Name -> function lookup built from RDKit's own registry.
    available = dict(Descriptors.descList)
    names = list(desc) if desc else list(available)

    unknown = [name for name in names if name not in available]
    if unknown:
        raise ValueError(f"Unknown RDKit descriptor(s): {unknown}")

    # MolFromSmiles returns None for an unparsable SMILES; non-string input (e.g. null) is treated alike.
    mol = MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {name: None for name in names}

    if desc:
        # Call only the requested descriptor functions, keeping the caller's order.
        return {name: available[name](mol) for name in names}
    # all descriptors
    return Descriptors.CalcMolDescriptors(mol)

@pandas_udf(schema_string)
def udf_smiles_to_desc(smiles: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
    """Pandas UDF that maps batches of SMILES strings to descriptor rows.

    Args:
        smiles: Iterator over pandas Series, each holding one batch of SMILES strings.

    Yields:
        One pandas DataFrame per input batch, built from the result of
        ``smiles_to_desc`` for every element of that batch, in order.
    """
    for smiles_batch in smiles:
        yield pd.DataFrame([smiles_to_desc(smi) for smi in smiles_batch])

In [0]:
# Redistribute the rows across 32 partitions; note that this rebinds `df` itself.
df = df.repartition(32)

In [0]:
# Derive both feature columns from the "smiles" column, then preview ten rows.
df_desc = df.withColumn("ecfp", udf_smiles_to_ecfp("smiles"))
df_desc = df_desc.withColumn("descriptors", udf_smiles_to_desc("smiles"))
display(df_desc.limit(10))

In [0]:
# https://datagrok.ai/help/datagrok/solutions/domains/chem/descriptors
# Every descriptor name in Descriptors.descList, in registry order.
desc = [name for name, _fn in Descriptors.descList]
# Names matching this pattern are left out; re.match tests from the start of the name only.
unselect_regex = re.compile(r"^Max|^Min|^MolWt$|^FpDensityMorgan|^BCUT2D|Ipc$|AvgIpc|BalabanJ|BertzCT|^Chi|^Kappa|LabuteASA|^PEOE_|^SMR_|^SlogP_|EState|VSA_EState|MolLogP|MolMR|HallKier")
selected_desc = [name for name in desc if unselect_regex.match(name) is None]
selected_desc

In [0]:
# Column order for the output: the original columns, then the kept descriptor
# fields (addressed inside the "descriptors" struct), then the fingerprint.
selected_columns = [
    *df.columns,
    *(f"descriptors.{name}" for name in selected_desc),
    'ecfp',
]
selected_columns

In [0]:
# Preview five rows restricted to the selected columns.
display(df_desc.select(selected_columns).limit(5))

In [0]:
# Save the selected columns as a Delta table, replacing the data and the schema of any existing table.
(
    df_desc.select(selected_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("healthcare_lifesciences.qsar.zinc15_250k_full")
)

In [0]:
%sql
-- Turn on the Delta change data feed property for the full feature table.
ALTER TABLE healthcare_lifesciences.qsar.zinc15_250k_full SET TBLPROPERTIES (delta.enableChangeDataFeed = true)

## Subset columns for Genie

In [0]:
# Same selection minus the "tranche_name" and "ecfp" columns, saved as its own Delta table
# (data and schema of any existing table are replaced).
(
    df_desc.select(selected_columns)
    .drop("tranche_name", "ecfp")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("healthcare_lifesciences.qsar.zinc15_250k_genie")
)