In [30]:
import os
import time

import matplotlib.pyplot as plt
import mol2vec
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.DataStructs import TanimotoSimilarity
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [20]:
def compute_similarities(molecules, metric, names=None):
    '''
    Compute the pairwise similarity (or distance) matrix between all elements of a list.

    Parameters
    ----------
    molecules : sequence
        Objects to compare. For ``'tanimoto'`` each element is passed as-is to
        ``TanimotoSimilarity``; for the Euclidean metric each element is converted
        to a float array.
    metric : str
        ``'tanimoto'`` for Tanimoto similarity, ``'euclidian'`` (the alternative
        spelling ``'euclidean'`` is also accepted) for the L2 distance.
    names : sequence of str, optional
        Labels used for both the rows and the columns of the result. When omitted,
        rows are labelled ``row1..rowN`` and columns ``col1..colN``.

    Returns
    -------
    pandas.DataFrame
        Square ``N x N`` frame whose ``[i, j]`` entry compares ``molecules[i]``
        with ``molecules[j]``.

    Raises
    ------
    ValueError
        If ``metric`` is not recognised or ``names`` does not have one label per element.
    '''
    # Resolve the pairwise function once instead of re-testing the metric in the inner loop.
    if metric == 'tanimoto':
        pairwise = TanimotoSimilarity
    elif metric in ('euclidian', 'euclidean'):
        def pairwise(a, b):
            # asarray lets plain Python lists be subtracted element-wise.
            return np.linalg.norm(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
    else:
        # Previously an unknown metric silently returned uninitialised memory.
        raise ValueError(f"unknown metric {metric!r}; expected 'tanimoto' or 'euclidian'")

    n = len(molecules)
    if names is None:
        row_labels = [f'row{k + 1}' for k in range(n)]
        col_labels = [f'col{k + 1}' for k in range(n)]
    else:
        if len(names) != n:
            raise ValueError(f"expected {n} names, got {len(names)}")
        row_labels = col_labels = list(names)

    results = np.empty((n, n))
    # Both metrics are symmetric: compute the upper triangle and mirror it.
    for i in range(n):
        for j in range(i, n):
            value = pairwise(molecules[i], molecules[j])
            results[i, j] = value
            results[j, i] = value

    return pd.DataFrame(results, index=row_labels, columns=col_labels)

In [4]:
def calculate_rdkit_descriptors(molecule):
    """Evaluate every descriptor listed in ``Descriptors.descList`` on ``molecule``.

    Returns a list with one entry per descriptor, in ``descList`` order. A
    descriptor whose function raises contributes ``None`` instead of aborting
    the whole computation.
    """
    def _evaluate(descriptor_fn):
        # Best-effort: a failing descriptor is recorded as None.
        try:
            return descriptor_fn(molecule)
        except Exception:
            return None

    return [_evaluate(descriptor_fn) for _, descriptor_fn in Descriptors.descList]

### 1. Represent the ligands in an embedding space

#### a. Morgan Fingerprints (ECFP) 

A Morgan fingerprint captures the local environment around each atom up to a specified radius. 

In [None]:
# Morgan fingerprints as bit vectors (radius 2, 1024 bits), one per molecule of the subset.
# The list is kept as a standalone variable and also stored as a new column.
ecfp_rad2 = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=1024)
    for molecule in df_subset['RDKIT Molecules']
]
df_subset['ecfp_rad2'] = ecfp_rad2

#### b. Mol2vec 

To detect more global and subtle (but less interpretable) differences, we use a pretrained Mol2vec model available at: https://github.com/samoturk/mol2vec/tree/master/examples/models 


This model was trained on 20 million compounds downloaded from ZINC using:
- radius 1
- UNK to replace all identifiers that appear less than 4 times
- skip-gram and window size of 10 
- resulting in 300 dimensional embeddings

Mol2Vec is inspired by Word2vec and it creates embeddings based on the structural and chemical information of the molecules using a substructure approach. 

In [None]:
# Load the pretrained Mol2vec (gensim Word2Vec) model from the current working directory.
# NOTE(review): Word2Vec.load unpickles the file; only load model files from a trusted source.
pretrained_model_path = os.path.join(os.getcwd(), 'model_300dim.pkl')
model = word2vec.Word2Vec.load(pretrained_model_path)
vocabulary = model.wv.key_to_index
print('number of unique identifiers', len(vocabulary))

# Vector substituted for identifiers that are missing from the vocabulary
# (None when the model has no 'UNK' token, in which case such identifiers are skipped).
unknown_vector = model.wv['UNK'] if 'UNK' in vocabulary else None

execution_times = []
mol2vec_embeddings = []
for mol in df_subset['RDKIT Molecules']:
    start_time = time.perf_counter()

    # Substructure identifiers of the molecule (radius 1, matching the pretrained model).
    sentence = mol2alt_sentence(mol, 1)

    # One vector per identifier; unknown identifiers fall back to the 'UNK' vector
    # instead of discarding the whole molecule.
    token_vectors = [
        model.wv[token] if token in vocabulary else unknown_vector
        for token in sentence
    ]
    token_vectors = [vector for vector in token_vectors if vector is not None]

    if token_vectors:
        # Sum the identifier vectors so every molecule gets a fixed-size embedding.
        embedding = np.sum(token_vectors, axis=0)
    else:
        # No usable identifier: keep the shape consistent with an all-NaN vector.
        embedding = np.full(model.wv.vector_size, np.nan)

    mol2vec_embeddings.append(embedding)
    execution_times.append(time.perf_counter() - start_time)

# Compute timing statistics
mean_time = np.mean(execution_times)
std_time = np.std(execution_times)

print(f"Mean execution time: {mean_time} seconds")
print(f"Standard deviation: {std_time} seconds")
# NOTE(review): df_bindingDB is not defined in the visible part of the notebook — confirm it is loaded earlier.
print(f"Estimation for the full dataset: {round(len(df_bindingDB)*mean_time/60)} minutes")

df_subset['mol2vec embedding'] = mol2vec_embeddings

#### c. RDKit descriptors 

Here we create an embedding space based on the chemical and structural characteristics of the molecules. Note that we use all the descriptors here, but it could be a good idea to filter them in order to avoid redundancy and thereby reduce the dimensionality of the points. 

In [None]:
# One list of descriptor values per molecule of the subset.
# Note: this cell is slow to run; selecting only the most important descriptors will be necessary.
rdkit_embeddings = [
    calculate_rdkit_descriptors(molecule)
    for molecule in df_subset['RDKIT Molecules']
]

### 2. Study the clusters

In [None]:
# Similarity metric: 'tanimoto' for ecfp fingerprints,
# 'euclidian' for the mol2vec and rdkit embeddings.
metric = 'tanimoto'

# Embedding used by the cells below: ecfp_rad2, mol2vec_embeddings or rdkit_embeddings.
embeddings = ecfp_rad2

NameError: name 'mol2vec_embeddings' is not defined

#### a. Similarity measurement

The goal here is to create a heatmap of the Tanimoto similarity or Euclidean distance between each pair of ligands and see whether ligands that share the same target are more similar to each other than to the others. 

In [None]:

# Pairwise similarity / distance matrix for the selected embedding and metric,
# shown as an annotated heatmap.
similarities = compute_similarities(embeddings, metric=metric)
sns.heatmap(data=similarities, annot=True)


#### b. Dimensionality reduction 

The goal here is to apply dimensionality reduction techniques to the embedded representation, visualize the ligands along the first two components, and see whether any clusters are visible and to what extent they depend on the targets. 

In [None]:
# Tunable settings for the projections.
RANDOM_STATE = 42        # fixed seed so the projections are reproducible across runs
N_PCA_COMPONENTS = 30    # dimensionality kept before running t-SNE
TSNE_PERPLEXITY = 10


def plot_2d_projection(points, colors, title, xlabel, ylabel):
    """Scatter the first two columns of ``points``, coloured by ``colors``."""
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1], c=colors, cmap='viridis', s=50, alpha=0.7)
    plt.colorbar(label='Cluster Label')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


pca_model = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)
# The iteration count is left at its default (1000): the keyword was renamed from
# n_iter to max_iter in recent scikit-learn, so passing either name breaks some versions.
tsne_model = TSNE(
    n_components=2,
    perplexity=TSNE_PERPLEXITY,
    metric='cosine',
    random_state=RANDOM_STATE,
)

# PCA first, then t-SNE on the reduced representation.
X_pca = pca_model.fit_transform(embeddings)
X_tsne_pca = tsne_model.fit_transform(X_pca)

# NOTE(review): `y` (the colour labels) is not defined in the visible part of the notebook —
# confirm it is created before this cell runs.
plot_2d_projection(
    X_pca, y,
    title='PCA: First Two Components',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)

# The t-SNE axes are not principal components, so they are labelled as t-SNE dimensions.
plot_2d_projection(
    X_tsne_pca, y,
    title='PCA followed by t-SNE: First Two Components',
    xlabel='t-SNE Dimension 1',
    ylabel='t-SNE Dimension 2',
)

#### c. Clustering

The goal here is to cluster the ligands in the embedding space and see if they are clustered based on the target they bind to.

In [None]:
# Perform k-means and hierarchical clustering

# K-means clustering
N_CLUSTERS = 2  # TODO: replace with the number of targets of the family
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)  # fixed seed for reproducible labels
kmeans.fit(embeddings)
kmeans_labels = kmeans.labels_

# Hierarchical (agglomerative) clustering with Ward linkage
Z = linkage(embeddings, 'ward')
plt.figure(figsize=(10, 7))
# dendrogram expects the linkage matrix, not the raw observations.
dendrogram(Z)
plt.title('Hierarchical clustering (Ward linkage)')
plt.xlabel('Ligand index')
plt.ylabel('Distance')
plt.show()