In [9]:
import itertools

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, LocalCluster
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Shut down any client/cluster left over from an earlier run of this cell, so
# that re-running it does not leak worker processes or collide with the old
# cluster's dashboard port. The client is closed before the cluster it uses.
for _leftover_name in ('client', 'cluster'):
    _leftover = globals().get(_leftover_name)
    if isinstance(_leftover, (Client, LocalCluster)):
        _leftover.close()

# Initialize a Dask LocalCluster with a specified number of workers
# (4 single-threaded worker processes).
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

# Load dataset. Later cells read the 'canonical_smiles' and 'chembl_id'
# columns of this frame.
data = pd.read_csv('compounds.csv')

# Function to compute molecular fingerprint from SMILES
def compute_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint for a SMILES string, or None.

    Args:
        smiles: SMILES string describing the compound. Any non-string value
            (for example ``None`` or the NaN pandas uses for a missing cell)
            is treated as invalid input.
        radius: Morgan radius handed to RDKit. Defaults to 2.
        n_bits: Length of the generated bit vector. Defaults to 2048.

    Returns:
        The object returned by ``AllChem.GetMorganFingerprintAsBitVect``, or
        ``None`` when no fingerprint could be produced for ``smiles``.
    """
    # Reject missing / non-text cells up front instead of relying on RDKit
    # raising for them.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # An unparsable SMILES comes back as None rather than as an exception.
        if mol is None:
            return None
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    except Exception:
        # Best effort: a compound that cannot be fingerprinted is skipped by
        # the caller. Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate so a long run can still be interrupted.
        return None

print('Computing fingerprints...', end=' ')
# Compute fingerprints for all compounds
data['fingerprint'] = data['canonical_smiles'].apply(compute_fingerprint)
print('Done')

# Drop rows with None fingerprints. The index is not reset, so everything
# below addresses the remaining rows by position, not by index label.
data = data.dropna(subset=['fingerprint'])

# Convert to Dask DataFrame
ddf = dd.from_pandas(data, npartitions=4)

# Create a DataFrame of all pairwise combinations
print('Computing pairs...', end=' ')
# np.triu_indices(n, k=1) enumerates every (i, j) with i < j in the same
# row-major order as itertools.combinations(range(n), 2), but returns two
# integer arrays directly instead of building a Python list of n*(n-1)/2
# tuples first.
pair_i, pair_j = np.triu_indices(len(data), k=1)
pairs = pd.DataFrame({'i': pair_i, 'j': pair_j})
pairs_ddf = dd.from_pandas(pairs, npartitions=4)
print('Done')

# Function to compute Tanimoto similarity for a pair
def compute_similarity(df, fingerprints):
    """Return ``df`` with a ``similarity`` column of Tanimoto scores.

    Args:
        df: pandas DataFrame with integer columns ``i`` and ``j``. Each value
            is a position into ``fingerprints``.
        fingerprints: Sequence of fingerprints indexed by the ``i``/``j``
            values; each pair of entries is passed to
            ``DataStructs.TanimotoSimilarity``.

    Returns:
        A new DataFrame with the rows and columns of ``df`` plus a float
        ``similarity`` column. ``df`` itself is not modified, and an empty
        ``df`` yields an empty result with the same three columns.
    """
    # Iterate over the two index columns directly; this avoids building a
    # Series per row (as a row-wise DataFrame.apply does) and also works
    # when the partition has no rows.
    scores = [
        DataStructs.TanimotoSimilarity(fingerprints[first], fingerprints[second])
        for first, second in zip(df['i'].tolist(), df['j'].tolist())
    ]
    # Reuse df's own index so the new column lines up row for row, and pin
    # the dtype so an empty partition still produces a float column.
    similarity = pd.Series(scores, index=df.index, dtype='float64')
    return df.assign(similarity=similarity)

# Fingerprints as a plain list, in the row order of `data`; the i/j values of
# each pair are positions into this list.
fingerprints = list(data['fingerprint'])

# Describe the per-partition similarity computation. This only builds the
# lazy Dask graph; nothing is evaluated yet.
# NOTE(review): the whole `fingerprints` list is handed over as a keyword
# argument, which is presumably what triggers the "Consider scattering data
# ahead of time" message in this cell's output -- confirm before changing.
print('Computing partitions...', end=' ')
results_ddf = pairs_ddf.map_partitions(
    compute_similarity,
    fingerprints=fingerprints,
    meta={
        'i': int,
        'j': int,
        'similarity': float,
    },
)
print('Done')

# Run the graph and gather every scored pair into one pandas DataFrame
print('Computing...', end=' ')
results = results_ddf.compute()
print('Done')

# Keep the ten pairs with the highest similarity
top_10_similar = results.nlargest(n=10, columns='similarity')


Perhaps you already have a cluster running?
Hosting the HTTP server on port 55578 instead


Computing fingerprints... Done
Computing pairs... Done
Computing partitions... Done
Computing... 

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Done


In [10]:
# Print every top pair as "(chembl_id_i, chembl_id_j, similarity)".
# i and j are row positions in `data`, hence the .iloc lookups. iterrows()
# hands back each row as a single Series, so i/j may arrive as floats and are
# cast to int before being used as positions.
chembl_ids = data['chembl_id']
for _, pair in top_10_similar.iterrows():
    first_id = chembl_ids.iloc[int(pair['i'])]
    second_id = chembl_ids.iloc[int(pair['j'])]
    print(''.join(['(', first_id, ', ', second_id, ', ', str(pair['similarity']), ')']))

(CHEMBL1185564, CHEMBL1180303, 1.0)
(CHEMBL3211084, CHEMBL1219O, 1.0)
(CHEMBL3659481, CHEMBL4115592, 0.8703703703703703)
(CHEMBL1957077, CHEMBL1957074, 0.8666666666666667)
(CHEMBL3263726, CHEMBL3263727, 0.8620689655172413)
(CHEMBL3895991, CHEMBL3978459, 0.8428571428571429)
(CHEMBL2177932, CHEMBL2177959, 0.8378378378378378)
(CHEMBL1077552, CHEMBL2332107, 0.8311688311688312)
(CHEMBL108931, CHEMBL432621, 0.8305084745762712)
(CHEMBL3649850, CHEMBL3649871, 0.797752808988764)
