# Statistics about clusters

Compute the intra- and inter-cluster similarity and save the results to `data/information_about_clusters`.

In [73]:
# Importing libraries
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy.spatial.distance import jaccard
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles, MakeScaffoldGeneric
import os

In [2]:
def smiles_to_morgan(smiles, radius=3, nbits=2048):
    """
    Convert a SMILES string into a Morgan fingerprint stored in a NumPy array.

    Parameters
    ----------
    smiles : SMILES string handed to ``Chem.MolFromSmiles``.
    radius : radius passed to the Morgan fingerprint (default 3).
    nbits  : length of the bit vector (default 2048).

    Returns
    -------
    A NumPy array built from the RDKit bit vector, or an all-zero array of
    length ``nbits`` when ``Chem.MolFromSmiles`` yields a falsy result.
    """
    molecule = Chem.MolFromSmiles(smiles)

    # Guard clause: no usable molecule -> fall back to an all-zero fingerprint.
    if not molecule:
        return np.zeros(nbits)

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=nbits)
    return np.array(bit_vector)

In [70]:
def jaccard_similarity(fingerprint1, fingerprint2):
    """
    Return the Jaccard similarity of two fingerprint vectors.

    SciPy's ``jaccard`` is a *dissimilarity*, so the similarity is obtained
    as its complement: ``1 - jaccard(fingerprint1, fingerprint2)``.
    """
    dissimilarity = jaccard(fingerprint1, fingerprint2)
    return 1 - dissimilarity

def calculate_cluster_similarity(clusters, receptor, name_to_save):
    """
    Compute the mean pairwise Jaccard similarity between every pair of clusters.

    Parameters
    ----------
    clusters     : sequence of clusters; each cluster is an iterable of SMILES
                   strings that are fingerprinted with ``smiles_to_morgan``.
    receptor     : name of the sub-folder created under
                   ``data/information_about_clusters``.
    name_to_save : file name (without ``.csv``) for the saved matrix.

    Returns
    -------
    A symmetric ``(len(clusters), len(clusters))`` NumPy array. The diagonal
    is 1.0; entry ``[i, j]`` is the mean similarity over all cross pairs of
    cluster ``i`` and cluster ``j``, or NaN when either cluster is empty.

    Side effects
    ------------
    Writes the matrix to
    ``data/information_about_clusters/{receptor}/{name_to_save}.csv``
    (no index column), creating the folder when needed.
    """
    num_clusters = len(clusters)

    # Fingerprint every cluster exactly once; doing it inside the (i, j) loop
    # would repeat the same work for each cluster pair.
    cluster_fps = [
        np.array([smiles_to_morgan(smiles) for smiles in cluster])
        for cluster in clusters
    ]

    # Start from all ones so the diagonal (cluster vs. itself) stays at 1.0.
    similarity_matrix = np.ones((num_clusters, num_clusters))

    # Only the upper triangle is computed; the result is mirrored below.
    for i in range(num_clusters):
        for j in range(i + 1, num_clusters):
            cross_similarities = [
                jaccard_similarity(fp_i, fp_j)
                for fp_i in cluster_fps[i]
                for fp_j in cluster_fps[j]
            ]

            # No cross pairs (an empty cluster): the mean is undefined, so
            # store NaN explicitly instead of letting np.mean warn.
            avg_similarity = np.mean(cross_similarities) if cross_similarities else np.nan

            similarity_matrix[i, j] = avg_similarity
            similarity_matrix[j, i] = avg_similarity

    folder = f'data/information_about_clusters/{receptor}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    pd.DataFrame(similarity_matrix).to_csv(f'{folder}/{name_to_save}.csv', index=False)
    return similarity_matrix

def analyze_within_cluster_similarity(clusters, receptor, name_to_save):
    """
    Summarize the pairwise Jaccard similarity inside each cluster.

    For every cluster, all unordered pairs of members are compared and the
    mean, median and standard deviation of those similarities are printed
    and collected.

    Parameters
    ----------
    clusters     : iterable of clusters; each cluster is an iterable of SMILES
                   strings that are fingerprinted with ``smiles_to_morgan``.
    receptor     : name of the sub-folder created under
                   ``data/information_about_clusters``.
    name_to_save : file name (without ``.csv``) for the saved statistics.

    Returns
    -------
    None. The statistics are printed and written to
    ``data/information_about_clusters/{receptor}/{name_to_save}.csv`` with the
    columns ``Cluster``, ``Mean_Similarity``, ``Median_Similarity`` and
    ``Std_Similarity``. Clusters with fewer than two members have no pairs,
    so their statistics are reported as NaN.
    """
    from itertools import combinations

    results = []

    for idx, cluster in enumerate(clusters):
        # Fingerprints for the current cluster
        cluster_fps = np.array([smiles_to_morgan(smiles) for smiles in cluster])

        # Every unordered pair (a, b) with a before b, in index order.
        similarities = [
            jaccard_similarity(fp_a, fp_b)
            for fp_a, fp_b in combinations(cluster_fps, 2)
        ]

        if similarities:
            mean_sim = np.mean(similarities)
            median_sim = np.median(similarities)
            # Standard deviation (not variance); NumPy's default ddof=0.
            std_sim = np.std(similarities)
        else:
            # No pairs to compare: report NaN explicitly rather than letting
            # NumPy emit "empty slice" RuntimeWarnings.
            mean_sim = median_sim = std_sim = np.nan

        # Print the statistics for the current cluster
        print(f"Cluster {idx}:")
        print(f"  Mean similarity: {mean_sim}")
        print(f"  Median similarity: {median_sim}")
        print(f"  Std similarity: {std_sim}")

        results.append({
            "Cluster": idx,
            "Mean_Similarity": mean_sim,
            "Median_Similarity": median_sim,
            "Std_Similarity": std_sim,
        })

    folder = f'data/information_about_clusters/{receptor}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)
    pd.DataFrame(results).to_csv(f'{folder}/{name_to_save}.csv', index=False)


## For Glucocorticoid receptor

In [71]:
# Build one cluster per recall-set file: the unique scaffold SMILES obtained by
# making each molecule's scaffold generic and taking its Murcko scaffold.
clusters = []
for num in range(5):
    # First CSV column (the file has no header row), de-duplicated.
    smiles_list = pd.read_csv(
        f'data/input_recall_sets/Glucocorticoid_receptor/cRS_Glucocorticoid_receptor_dis_{num}.csv',
        header=None,
    )[0].unique().tolist()

    # dict.fromkeys keeps first-seen order while de-duplicating with O(1)
    # lookups, instead of an O(n) "not in list" test per scaffold.
    csk = list(dict.fromkeys(
        MurckoScaffoldSmiles(Chem.MolToSmiles(MakeScaffoldGeneric(Chem.MolFromSmiles(smi))))
        for smi in smiles_list
    ))
    clusters.append(csk)

print("For Glucocorticoid_receptor:")
similarity_matrix = calculate_cluster_similarity(clusters, 'Glucocorticoid_receptor', 'mean_value_between_clusters')
print("Cluster similarity matrix:")
print(similarity_matrix)
analyze_within_cluster_similarity(clusters, 'Glucocorticoid_receptor', 'statistic_inside_clusters')

For Glucocorticoid_receptor:
Cluster similarity matrix:
[[1.         0.23690879 0.25204977 0.24802506 0.27676644]
 [0.23690879 1.         0.19621807 0.24094053 0.24796643]
 [0.25204977 0.19621807 1.         0.24199567 0.21541726]
 [0.24802506 0.24094053 0.24199567 1.         0.22449774]
 [0.27676644 0.24796643 0.21541726 0.22449774 1.        ]]
Cluster 0:
  Mean similarity: 0.3401271602563972
  Median similarity: 0.310960960960961
  Std similarity: 0.14681969809118342
Cluster 1:
  Mean similarity: 0.31379499253078597
  Median similarity: 0.27941176470588236
  Std similarity: 0.15225932150686608
Cluster 2:
  Mean similarity: 0.3751752625790344
  Median similarity: 0.3378452533382111
  Std similarity: 0.1567827215930074
Cluster 3:
  Mean similarity: 0.3396030793100419
  Median similarity: 0.31057781919850885
  Std similarity: 0.1448212665885002
Cluster 4:
  Mean similarity: 0.3811377681672276
  Median similarity: 0.35
  Std similarity: 0.16112000399367596


## For Leukocyte elastase

In [72]:
# Build one cluster per recall-set file: the unique scaffold SMILES obtained by
# making each molecule's scaffold generic and taking its Murcko scaffold.
clusters = []
for num in range(5):
    # First CSV column (the file has no header row), de-duplicated.
    smiles_list = pd.read_csv(
        f'data/input_recall_sets/Leukocyte_elastase/cRS_Leukocyte_elastase_dis_{num}.csv',
        header=None,
    )[0].unique().tolist()

    # dict.fromkeys keeps first-seen order while de-duplicating with O(1)
    # lookups, instead of an O(n) "not in list" test per scaffold.
    csk = list(dict.fromkeys(
        MurckoScaffoldSmiles(Chem.MolToSmiles(MakeScaffoldGeneric(Chem.MolFromSmiles(smi))))
        for smi in smiles_list
    ))
    clusters.append(csk)

print("For Leukocyte_elastase")
similarity_matrix = calculate_cluster_similarity(clusters, 'Leukocyte_elastase', 'mean_value_between_clusters')
print("Cluster similarity matrix:")
print(similarity_matrix)
analyze_within_cluster_similarity(clusters, 'Leukocyte_elastase', 'statistic_inside_clusters')

For Leukocyte_elastase
Cluster similarity matrix:
[[1.         0.21800512 0.25312333 0.15202709 0.23789378]
 [0.21800512 1.         0.27343922 0.30378926 0.24089646]
 [0.25312333 0.27343922 1.         0.24757368 0.30243969]
 [0.15202709 0.30378926 0.24757368 1.         0.21636401]
 [0.23789378 0.24089646 0.30243969 0.21636401 1.        ]]
Cluster 0:
  Mean similarity: 0.36336033744779805
  Median similarity: 0.31818181818181823
  Std similarity: 0.16456197825217223
Cluster 1:
  Mean similarity: 0.37343285114236363
  Median similarity: 0.33333333333333337
  Std similarity: 0.16570652356566093
Cluster 2:
  Mean similarity: 0.3818042922994091
  Median similarity: 0.3285714285714286
  Std similarity: 0.1646255334296882
Cluster 3:
  Mean similarity: 0.39002090886427454
  Median similarity: 0.37681159420289856
  Std similarity: 0.2073126225436184
Cluster 4:
  Mean similarity: 0.3407555113040552
  Median similarity: 0.303030303030303
  Std similarity: 0.13867772268124146
