 #  <center> Problem Set 4 </center>

<center> 7.C01/7.C51, 20.C01/20.C51 </center>

<b>Name:</b>

<b>Kerberos id:</b>

### Download required data & install packages

In [None]:
!wget https://raw.githubusercontent.com/coleygroup/ML4MolEng/main/psets/ps4/data/bio_version/jak2.csv
!pip install rdkit
!pip install molvs

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import DBSCAN

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from molvs import standardize_smiles

## Part 1: Dimensionality Reduction for Molecular Representations

This may return a deprecation warning, which can be ignored.

In [None]:
################ Run #################

# Convert SMILES strings to Morgan fingerprints with RDKit.
# Load the JAK2 table from the CSV fetched by the download cell; later cells
# read its "SMILES" and "pIC50" columns.
jak2 = pd.read_csv("jak2.csv")
# Module-level fingerprint settings read by the ECFP class below.
radius = 3  # passed as `radius` to GetMorganFingerprintAsBitVect
num_bits = 2048  # passed as `nBits`; also the number of "bit<i>" columns

class ECFP:
    """Build Morgan bit-vector fingerprints for a collection of SMILES strings.

    The fingerprint settings are not stored on the instance: ``mol2fp`` and
    ``compute_ECFP`` read the module-level ``radius`` and ``num_bits`` each
    time they are called.
    """

    def __init__(self, smiles):
        """Parse every SMILES string into an RDKit molecule.

        Args:
            smiles: sequence of SMILES strings. It is iterated here and later
                inserted unchanged as the "smiles" column of the result.
        """
        self.mols = [Chem.MolFromSmiles(smi) for smi in smiles]
        self.smiles = smiles

    def mol2fp(self, mol):
        """Fingerprint a single molecule.

        Args:
            mol: RDKit molecule to fingerprint.

        Returns:
            Tuple ``(bits, bit_info)``: the fingerprint copied into a NumPy
            array, and the dict RDKit fills through its ``bitInfo`` argument.
        """
        bit_info = {}
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius,
                                                   bitInfo=bit_info, nBits=num_bits)
        # Placeholder destination; the stacking in compute_ECFP relies on RDKit
        # resizing it to the fingerprint length during the conversion.
        bits = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, bits)

        return bits, bit_info

    def compute_ECFP(self):
        """Fingerprint every molecule and return the results as a DataFrame.

        Returns:
            DataFrame with one row per molecule: columns "smiles", "mol" and
            "bitInfo", followed by integer columns "bit0" ... "bit<num_bits-1>".
        """
        bit_headers = ["bit" + str(i) for i in range(num_bits)]

        # Collect the rows and stack them once; calling np.vstack per molecule
        # re-copies the whole matrix on every iteration.
        rows = []
        bitInfo_all = []
        for mol in self.mols:
            bits, bit_info = self.mol2fp(mol)
            rows.append(bits)
            bitInfo_all.append(bit_info)

        # The reshape keeps a (0, num_bits) matrix when there are no molecules.
        arr = np.asarray(rows, dtype=float).astype(int).reshape(len(rows), num_bits)

        df_fp = pd.DataFrame(arr, columns=bit_headers)
        df_fp.insert(loc=0, column="smiles", value=self.smiles)
        df_fp.insert(loc=1, column="mol", value=list(self.mols))
        df_fp.insert(loc=2, column="bitInfo", value=bitInfo_all)

        return df_fp

# Standardize every SMILES string with MolVS, then fingerprint the result.
smiles_standarized = list(map(standardize_smiles, jak2["SMILES"].values))
jak2_fp_descriptor = ECFP(smiles_standarized)

# Keep only the bit columns: SMILES are looked up in the "jak2" dataframe
# instead, and the "mol" / "bitInfo" columns are not needed.
non_bit_columns = ["smiles", "mol", "bitInfo"]
jak2_fp = jak2_fp_descriptor.compute_ECFP().drop(columns=non_bit_columns)

################ Run #################

The resulting dataframe, `jak2_fp`, contains the 2048 fingerprint bits (columns) for each of the 1,911 molecules (rows).

### 1.1 (5 points, Grad only) Choosing radius and number of bits for Morgan fingerprints

Provide a one-sentence description of what the radius represents and another of what the number of bits represents. How does adjusting the radius parameter affect the granularity of the motifs captured by the fingerprints, and how does this relate to the choice of the number of bits?

In [None]:
################ Solution #################

"""
Radius represents the distance (in bonds) from each atom at which neighboring
atoms are considered when generating the fingerprint.

Number of bits represents the size of the fingerprint vector,
determining the number of molecular features captured.

Adjusting the radius parameter affects the granularity of the motifs captured
by the fingerprints by determining the spatial extent of the substructures
considered. A larger radius includes atoms that are further away from the
central atom, capturing larger and more complex molecular features.
This relates to the choice of the number of bits as a larger radius increases
the number of unique substructures encountered, requiring more bits to
represent them adequately in the fingerprint vector.
"""

################ Solution #################

### 1.2 (10 points) Principal Component Analysis on Molecular Fingerprints

Perform PCA to reduce data into vectors of 100 dimensions. Visualize the first two components of your data in a 2D scatter plot and color each molecule by its pIC50.

In [None]:
################ Solution #################

# Project the fingerprints onto 100 principal components
pca = PCA(n_components=100)
crds = pca.fit_transform(jak2_fp)

# Scatter of the first two components, colored by pIC50
fig, ax = plt.subplots(figsize=(5, 5))
sc = ax.scatter(x=crds[:, 0], y=crds[:, 1], s=3, c=jak2["pIC50"], cmap="viridis")
ax.set(xlabel="PC1", ylabel="PC2")

cbar = fig.colorbar(sc, ax=ax)
cbar.set_label("pIC50", rotation=270, labelpad=15)
plt.show()

################ Solution #################

What is the explained variance ratio of the 100 principal components?

In [None]:
################ Solution #################

# Report the cumulative explained variance. The component count is read from
# the fitted model so the message stays correct if n_components is changed.
print(f"The first {pca.n_components_} components explains {pca.explained_variance_ratio_.sum()} of the total variance")

################ Solution #################

What patterns do you observe?

In [None]:
################ Solution #################

"""
There appears to be a clear cluster of molecules that have high pIC50 values.
Molecules with low-to-moderate pIC50 have poor separation.
"""

################ Solution #################

### 1.3 (10 points) t-SNE analysis on Molecular Fingerprints

Perform t-SNE on the obtained principal components, with perplexity values of 2, 30, and 500. Plot the results and label your plots.

In [None]:
################ Solution #################

# Three t-SNE panels plus a narrow fourth axis reserved for the colorbar
fig, ax = plt.subplots(figsize=(18, 6), ncols=4, gridspec_kw={"width_ratios": [1, 1, 1, 0.05]})

perplexities = [2, 30, 500]
for i, perplexity in enumerate(perplexities):
    pca_tsne = TSNE(n_components=2, perplexity=perplexity).fit_transform(crds)

    # Keep the scatter handle so the colorbar is built from this figure's own
    # data instead of a mappable left over from an earlier cell.
    tsne_sc = ax[i].scatter(pca_tsne[:, 0], pca_tsne[:, 1], s=3,
                            c=jak2["pIC50"], cmap="viridis")
    ax[i].set_xlabel("t-SNE1")
    ax[i].set_ylabel("t-SNE2")
    ax[i].set_title(f"Perplexity = {perplexity}")

# Every panel is colored by the same pIC50 values with the same colormap, so
# the last panel's color scale is valid for all three.
cbar = plt.colorbar(tsne_sc, cax=ax[-1])
cbar.set_label("pIC50", rotation=270, labelpad=15)

plt.tight_layout()
plt.show()

################ Solution #################

What differences do you see between the 3 t-SNE plots? What patterns do you observe in the perplexity = 30 plot?

In [None]:
################ Solution #################

"""
The perplexity = 2 plot has a lot of smaller clusters. Both the
perplexity = 30 and perplexity = 500 plots demonstrate clearer separation
of molecules with high pIC50 values, with the perplexity = 500 demonstrating
a bit more random spread. In particular, in the perplexity = 30 plot, high
pIC50 molecules form two distinct clusters, and the low-to-moderate pIC50
molecules are largely randomly distributed amongst each other.

Some version of this answer is acceptable, as long as perplexity = 2 plot is
recognized as inferior.
"""

################ Solution #################

### 1.4 (20 points) Are the low dimensional embeddings meaningful?

Discretize pIC50 data by classifying any molecule with a pIC50 value >= 9.5 as effective (i.e., 1) and any with <9.5 as ineffective (i.e., 0). Append this as a new column called `is_effective` to the `jak2` dataframe.

In [None]:
################ Solution #################

# Binary activity label: 1 when pIC50 >= 9.5 (effective), otherwise 0
jak2["is_effective"] = jak2["pIC50"].ge(9.5).astype(int)

################ Solution #################

Split the data into 10 folds. For each fold, train on the other 9 folds and validate on the held-out fold. Record your predictions.

In [None]:
################ Solution #################

# Convert the features and targets once instead of rebuilding the array for
# every fold.
X_all = np.array(jak2_fp)
y_all = jak2.is_effective.values

# Out-of-fold predictions stored by row position. KFold yields positional
# indices, so they are written into a plain array rather than through
# label-based .loc lookups, one element at a time.
oof_pred = np.full(len(jak2), np.nan)

kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(jak2_fp):

    X_train, y_train = X_all[train_index], y_all[train_index]
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)

    X_test = X_all[test_index]
    pred = rf.predict(X_test)
    oof_pred[test_index] = pred

# Each row is in exactly one test fold, so every entry has been filled.
jak2["pred"] = oof_pred

################ Solution #################

Classify your predictions into True Positives (TP), True Negatives (TN), False Positives (FP) and False Negatives (FN).

In [None]:
################ Solution #################

# Compare the true class with the recorded prediction for every molecule.
actual_pos = (jak2["is_effective"] == 1).to_numpy()
pred_pos = (jak2["pred"] == 1).to_numpy()

# NOTE: the spellings "TruePostive" and "FalsePostive" are the exact strings
# the t-SNE plotting cell filters on. This cell previously wrote
# "FalsePositve", so false positives never matched and were not plotted.
jak2["label"] = np.where(
    actual_pos,
    np.where(pred_pos, "TruePostive", "FalseNegative"),
    np.where(pred_pos, "FalsePostive", "TrueNegative"),
)

################ Solution #################

Plot the 2D t-SNE embeddings (perplexity = 30) colored by the four classification classes.

In [None]:
################ Solution #################

pca_tsne = TSNE(n_components=2, perplexity=30).fit_transform(crds)

fig, ax = plt.subplots(figsize=(7,7))

# One entry per class, drawn in this order:
# (value looked up in jak2.label, legend text, color, marker size, alpha).
# The first field must match the strings stored in the "label" column.
class_styles = [
    ("TruePostive", "TP", "blue", 3, None),
    ("FalsePostive", "FP", "red", 6, None),
    ("TrueNegative", "TN", "green", 3, 0.3),
    ("FalseNegative", "FN", "cyan", 6, None),
]
for class_name, legend_text, color, size, alpha in class_styles:
    pts = pca_tsne[jak2.label == class_name]
    ax.scatter(pts[:, 0], pts[:, 1], color=color, s=size, label=legend_text, alpha=alpha)

ax.legend()
ax.set_xlabel("t-SNE1")
ax.set_ylabel("t-SNE2")

plt.show()

################ Solution #################

What pattern do you observe?

In [None]:
################ Solution #################

"""
The classifier is fairly good at classifying the cluster molecules revealed
by the t-SNE's analysis.

For the effective drugs that are mixed with other ineffective drugs, the
classifier tends to classify them as ineffective.
"""

################ Solution #################

### 1.5 (10 points) UMAP analysis on Molecular Fingerprints

Perform UMAP on the obtained principal components. Plot the results.

In [None]:
################ Solution #################

# 2D UMAP embedding of the PCA coordinates
pca_umap = UMAP(n_components=2, n_neighbors=10, min_dist=1.0).fit_transform(crds)

# Molecules with pIC50 >= 9.5 are drawn a second time, in red, on top of the
# scatter of all points.
effective_mask = jak2["pIC50"] >= 9.5
effective_pts = pca_umap[effective_mask]

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(pca_umap[:,0], pca_umap[:,1], s=3, label="ineffective")
ax.scatter(effective_pts[:, 0], effective_pts[:, 1], color="red", s=3, label="effective")
ax.legend()

################ Solution #################

### 1.6 (15 points) Visualize latent clusters for structure similarity

First run DBSCAN on only the active molecules (pIC50 >= 9.5) and visualize the clusters with their labels.

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# UMAP coordinates of the active molecules (pIC50 >= 9.5), selected once
active_crds = pca_umap[jak2["pIC50"] >= 9.5]

################ Solution #################

labels = DBSCAN(eps=0.8, min_samples=4).fit_predict(active_crds)

################ Solution #################

ax.scatter(active_crds[:, 0], active_crds[:, 1], c=labels)

# add labels as text, anchored at the first point carrying each label
for label in np.unique(labels):
    # flatnonzero yields a plain integer position, so the text coordinates
    # are scalars rather than the 1-element arrays np.argwhere(...)[0] gives.
    idx = np.flatnonzero(labels == label)[0]
    ax.text(active_crds[idx, 0], active_crds[idx, 1], str(label), size=20)

Now pick one of the clusters, by setting the label, and visualize.

In [None]:
################ Run #################

label = 3

# SMILES of the active molecules (pIC50 >= 9.5) whose DBSCAN label equals
# the chosen one
active_mask = jak2["pIC50"] >= 9.5
cluster = jak2.SMILES[active_mask][labels == label]

# visualize all molecules in cluster
mol_list = [Chem.MolFromSmiles(smi) for smi in cluster]
Draw.MolsToGridImage(mol_list)

################ Run #################

Comment on the similarity of structures within the cluster. Explore by trying a few different clusters.

In [None]:
################ Solution #################

"""
There is strong structural similarity among each cluster, indicating that
this latent space captures such features reasonably well.
"""

################ Solution #################