In [1]:
import heapq
import itertools

import dask.dataframe as dd
import pandas as pd
from dask import delayed, compute
from dask.distributed import Client
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from tqdm import tqdm

In [12]:
def read_data_from_csv(file_path):
    """
    Load the compound table (SMILES strings and ChEMBL IDs) from a CSV file.

    Parameters:
    file_path (str): Location of the CSV file to load.

    Returns:
    pd.DataFrame: The parsed file contents, exactly as produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

def calculate_fingerprints(df, smiles_column='canonical_smiles', radius=2, n_bits=2048):
    """
    Calculate Morgan fingerprints for SMILES strings and add them to the DataFrame.

    Rows whose SMILES value is not a string (e.g. a missing value) or cannot be
    parsed by RDKit receive a fingerprint of None instead of raising, so a single
    bad row no longer aborts the whole calculation.

    Parameters:
    df (pd.DataFrame): A DataFrame containing the SMILES strings and ChEMBL IDs.
        It is modified in place: a 'fingerprint' column is added or overwritten.
    smiles_column (str): The name of the column containing the SMILES strings. Default is 'canonical_smiles'.
    radius (int): Radius passed to the Morgan fingerprint calculation. Default is 2.
    n_bits (int): Length of the fingerprint bit vector. Default is 2048.

    Returns:
    pd.DataFrame: The input DataFrame (same object) with an additional 'fingerprint' column.
    """
    def fingerprint_from_smiles(smiles):
        # Missing CSV values reach us as NaN/None rather than str; skip them
        # before handing anything to RDKit.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None, which
        # the fingerprint call cannot accept.
        if mol is None:
            return None
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    df['fingerprint'] = df[smiles_column].apply(fingerprint_from_smiles)
    return df

def calculate_fingerprints_partition(partition):
    """
    Run the fingerprint calculation on a single partition of a Dask DataFrame.

    Thin adapter around ``calculate_fingerprints`` (called with its default
    arguments) so it can be handed to ``map_partitions``.

    Parameters:
    partition (pd.DataFrame): One partition of the Dask DataFrame.

    Returns:
    pd.DataFrame: Whatever ``calculate_fingerprints`` returns for that partition.
    """
    fingerprinted_partition = calculate_fingerprints(partition)
    return fingerprinted_partition

@delayed
def calculate_similarity_chunk(chunk):
    """
    Score every pair in a chunk with the Tanimoto similarity.

    Parameters:
    chunk (list): Pairs of the form ((i, fp1), (j, fp2)), i.e. two
        (index, fingerprint) tuples per entry.

    Returns:
    list: One ((i, j), similarity) tuple for each pair in which both
        fingerprints are not None; other pairs are left out.
    """
    return [
        ((first_idx, second_idx), DataStructs.TanimotoSimilarity(first_fp, second_fp))
        for (first_idx, first_fp), (second_idx, second_fp) in chunk
        if first_fp is not None and second_fp is not None
    ]

def chunk_combinations(indices, chunk_size):
    """
    Lazily generate all 2-element combinations of the given items, in chunks.

    The combinations are consumed from an iterator one chunk at a time, so at
    most ``chunk_size`` pairs are held by this generator at once (the full
    O(n^2) list of pairs is never built).

    Parameters:
    indices (iterable): The items to pair up (e.g. indices or (index, fingerprint) tuples).
    chunk_size (int): The maximum number of pairs in each chunk. Must be >= 1.

    Yields:
    list: A list of up to ``chunk_size`` pairs, in ``itertools.combinations`` order.

    Raises:
    ValueError: If ``chunk_size`` is less than 1 (raised when iteration starts).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    pair_iter = itertools.combinations(indices, 2)
    while True:
        # islice pulls only the next chunk_size pairs from the shared iterator.
        chunk = list(itertools.islice(pair_iter, chunk_size))
        if not chunk:
            return
        yield chunk

def calculate_pairwise_similarities_dask(df, chunk_size=500):
    """
    Calculate pairwise similarities between molecular fingerprints using Dask.

    Parameters:
    df (pd.DataFrame): A DataFrame containing the molecular fingerprints in a
        'fingerprint' column.
    chunk_size (int): The number of pairs handed to each delayed task. Default is 500.

    Returns:
    list: A list of tuples where each tuple contains a pair of row positions
        and their similarity score. Pairs involving a None fingerprint are omitted.
    """
    # enumerate() gives row positions (not index labels). Rows without a
    # fingerprint are dropped here, before pairing, because any pair that
    # contains one is discarded by the similarity step anyway.
    indexed_fingerprints = [
        (position, fingerprint)
        for position, fingerprint in enumerate(df['fingerprint'])
        if fingerprint is not None
    ]
    delayed_results = [
        calculate_similarity_chunk(chunk)
        for chunk in chunk_combinations(indexed_fingerprints, chunk_size)
    ]
    similarities = compute(*delayed_results)
    # Flatten the per-chunk result lists into a single list.
    return [sim for chunk in similarities for sim in chunk]

def find_top_n_similarities(similarities, df, n=10):
    """
    Find the top N similarities and return the ChEMBL IDs along with similarity scores.

    The input list is left untouched (it is not sorted in place), and only the
    N best entries are selected rather than sorting the whole list.

    Parameters:
    similarities (list): A list of tuples containing pairs of row positions and their similarity scores.
    df (pd.DataFrame): A DataFrame with a 'chembl_id' column; the positions in
        ``similarities`` are looked up positionally in it.
    n (int): The number of top similarities to return. Default is 10.

    Returns:
    list: A list of tuples where each tuple contains a pair of ChEMBL IDs and their
        similarity score, ordered from most to least similar.
    """
    # nlargest is equivalent to sorted(..., reverse=True)[:n] without mutating
    # the caller's list.
    top_similarities = heapq.nlargest(n, similarities, key=lambda item: item[1])
    chembl_ids = df['chembl_id']
    return [
        ((chembl_ids.iloc[first], chembl_ids.iloc[second]), similarity)
        for (first, second), similarity in top_similarities
    ]

In [3]:
# Path to the input CSV file of compounds (read in Step 1 below)
file_path = 'compounds.csv'

In [4]:
# Start a Dask distributed client. No arguments are passed, so the cluster
# configuration is left to Dask's defaults.
client = Client()

In [5]:
# Display the URL of this client's Dask dashboard
client.dashboard_link

'http://127.0.0.1:8787/status'

In [6]:
# Display the client summary (cluster type, workers, threads, memory)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:64075,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:64090,Total threads: 2
Dashboard: http://127.0.0.1:64091/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64078,
Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-8sgxrz3z,Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-8sgxrz3z

0,1
Comm: tcp://127.0.0.1:64093,Total threads: 2
Dashboard: http://127.0.0.1:64094/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64080,
Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-k33dv6yk,Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-k33dv6yk

0,1
Comm: tcp://127.0.0.1:64096,Total threads: 2
Dashboard: http://127.0.0.1:64097/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64082,
Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-pf3a1p7r,Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-pf3a1p7r

0,1
Comm: tcp://127.0.0.1:64099,Total threads: 2
Dashboard: http://127.0.0.1:64100/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64084,
Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-nr2ixb8d,Local directory: /var/folders/wc/sv3gxqtn2hdc4dlt1p598r8h0000gn/T/dask-scratch-space/worker-nr2ixb8d


In [7]:
# Step 1: Read the compound table (SMILES strings and ChEMBL IDs) from the CSV file
df = read_data_from_csv(file_path)

In [8]:
# Step 2: Calculate fingerprints, one Dask partition at a time
# `meta` describes the expected output layout to Dask: the input columns
# plus a 'fingerprint' column.
meta = df.head().copy()
meta['fingerprint'] = None
ddf = dd.from_pandas(df, npartitions=4)
ddf = ddf.map_partitions(calculate_fingerprints_partition, meta=meta)
# Materialize the result as a pandas DataFrame; note that this rebinds `df`.
df = ddf.compute()

In [14]:
# Step 3: Calculate pairwise similarities
# chunk_size is the number of compound pairs handed to each delayed task.
similarities = calculate_pairwise_similarities_dask(df, chunk_size=1000000)

2024-05-15 22:07:04,908 - bokeh.server.protocol_handler - ERROR - error handling message
 message: Message 'PATCH-DOC' content: {'events': [{'kind': 'ModelChanged', 'model': {'id': 'p5416'}, 'attr': 'inner_width', 'new': 1640}, {'kind': 'ModelChanged', 'model': {'id': 'p5416'}, 'attr': 'inner_height', 'new': 785}, {'kind': 'ModelChanged', 'model': {'id': 'p5416'}, 'attr': 'outer_width', 'new': 1660}, {'kind': 'ModelChanged', 'model': {'id': 'p5416'}, 'attr': 'outer_height', 'new': 865}]} 
 error: AssertionError()
Traceback (most recent call last):
  File "/Users/diliadis/opt/anaconda3/envs/vib_env/lib/python3.11/site-packages/bokeh/server/protocol_handler.py", line 97, in handle
    work = await handler(message, connection)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/diliadis/opt/anaconda3/envs/vib_env/lib/python3.11/site-packages/bokeh/server/session.py", line 295, in patch
    return connection.session._handle_patch(message, connection)
           ^^^^^^^^^^^^^^^^^^


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


KeyboardInterrupt: 

In [10]:
# Number of pairwise similarity entries currently held in `similarities`
len(similarities)

0

In [19]:
# Step 4: Find the top 10 similarities and map the row positions to ChEMBL IDs
top_similarities = find_top_n_similarities(similarities, df, n=10)

In [20]:
# Display the top pairs as ((chembl_id, chembl_id), similarity) tuples
top_similarities

[(('CHEMBL1537900', 'CHEMBL2024295'), 0.4426229508196721),
 (('CHEMBL1631214', 'CHEMBL1963348'), 0.36923076923076925),
 (('CHEMBL3427354', 'CHEMBL3194515'), 0.36065573770491804),
 (('CHEMBL3482197', 'CHEMBL1891029'), 0.3380281690140845),
 (('CHEMBL290316', 'CHEMBL1972865'), 0.3380281690140845),
 (('CHEMBL2047290', 'CHEMBL1622164'), 0.3333333333333333),
 (('CHEMBL1315319', 'CHEMBL1569164'), 0.323943661971831),
 (('CHEMBL46330', 'CHEMBL1513292'), 0.3229166666666667),
 (('CHEMBL1957232', 'CHEMBL1419942'), 0.3150684931506849),
 (('CHEMBL1464949', 'CHEMBL1353759'), 0.3125)]

In [43]:
# Print the results, one pair per line, with the similarity shown to four decimals
for pair, similarity in top_similarities:
    print(f"Compound {pair[0]} and Compound {pair[1]}: Similarity = {similarity:.4f}")

Compound CHEMBL1185564 and Compound CHEMBL1180303: Similarity = 1.0000
Compound CHEMBL3211084 and Compound CHEMBL1219O: Similarity = 1.0000
Compound CHEMBL3659481 and Compound CHEMBL4115592: Similarity = 0.8704
Compound CHEMBL1957077 and Compound CHEMBL1957074: Similarity = 0.8667
Compound CHEMBL3263726 and Compound CHEMBL3263727: Similarity = 0.8621
Compound CHEMBL3895991 and Compound CHEMBL3978459: Similarity = 0.8429
Compound CHEMBL2177932 and Compound CHEMBL2177959: Similarity = 0.8378
Compound CHEMBL1077552 and Compound CHEMBL2332107: Similarity = 0.8312
Compound CHEMBL108931 and Compound CHEMBL432621: Similarity = 0.8305
Compound CHEMBL3649850 and Compound CHEMBL3649871: Similarity = 0.7978
