# t-SNE dimensionality reduction based on the Tanimoto metric

## Necessary imports

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit.Chem import DataStructs
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import plotly.express as px

## Morgan fingerprint + Tanimoto distance functions (based on RDKit)

In [2]:
def smiles_to_fingerprints(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES strings into Morgan bit-vector fingerprints.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Radius of the Morgan fingerprint.
        n_bits: Size (number of bits) of each fingerprint.

    Returns:
        A list with one entry per input, in input order: the RDKit
        fingerprint, or ``None`` when the entry is not a string or RDKit
        could not parse it.
    """
    # Pass both settings by keyword. The second positional parameter of
    # GetMorganGenerator is ``countSimulation``, not the fingerprint size, so
    # a positional call ignores ``n_bits`` and enables count simulation.
    generator = GetMorganGenerator(radius=radius, fpSize=n_bits)

    fps = []
    for smi in smiles_list:
        # Missing table values (e.g. NaN) are not strings; treat them like
        # unparseable SMILES instead of handing them to RDKit.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        fps.append(generator.GetFingerprint(mol) if mol is not None else None)

    return fps


def tanimoto_distance_matrix(fps):
    """Build the symmetric matrix of pairwise Tanimoto distances.

    Args:
        fps: Sequence of fingerprints; an entry may be ``None``.

    Returns:
        An ``(n, n)`` NumPy array with zeros on the diagonal. Each
        off-diagonal entry is ``1 - TanimotoSimilarity`` of the two
        fingerprints, or ``1.0`` when either fingerprint is ``None``.
    """
    count = len(fps)
    matrix = np.zeros((count, count))

    for row, fp_a in enumerate(fps):
        # Only the upper triangle is computed; each value is mirrored below.
        for col in range(row + 1, count):
            fp_b = fps[col]
            if fp_a is not None and fp_b is not None:
                # Tanimoto distance, also known as Jaccard distance.
                gap = 1 - DataStructs.TanimotoSimilarity(fp_a, fp_b)
            else:
                # A missing fingerprint is treated as maximally distant.
                gap = 1.0
            matrix[row, col] = gap
            matrix[col, row] = gap

    return matrix


## Main t-SNE function (with interactive plot using plotly)

In [3]:
def interactive_tsne_map(df, output_col, name_col, smiles_col, perplexity=20, random_state=42,
                         dbscan_eps=3, dbscan_min_samples=3):
    """Embed molecules in 2D with t-SNE on Tanimoto distances and plot them.

    Fingerprints are computed from the SMILES column, turned into a pairwise
    Tanimoto distance matrix, and embedded with t-SNE using that precomputed
    matrix. The embedding is clustered with DBSCAN and shown as an
    interactive plotly scatter plot coloured by ``output_col``.

    Args:
        df: DataFrame holding the molecules.
        output_col: Column used to colour the points.
        name_col: Column with display names; if it is not present in ``df``,
            the SMILES strings are used as names.
        smiles_col: Column with the SMILES strings.
        perplexity: t-SNE perplexity. Reduced to ``n_samples - 1`` (with a
            warning) when it is not smaller than the number of rows.
        random_state: Seed passed to t-SNE.
        dbscan_eps: ``eps`` passed to DBSCAN on the 2D coordinates.
        dbscan_min_samples: ``min_samples`` passed to DBSCAN.

    Returns:
        A DataFrame with columns ``x``, ``y``, ``SMILES``, ``Name``,
        ``output_col`` and ``Cluster`` (the DBSCAN label of each point).
    """
    import warnings

    smiles = df[smiles_col].tolist()
    names = df[name_col].tolist() if name_col in df.columns else smiles
    values = df[output_col].tolist()

    # scikit-learn requires perplexity < n_samples, so the default of 20
    # would fail on small tables; fall back to the largest valid value.
    n_samples = len(smiles)
    if n_samples > 1 and perplexity >= n_samples:
        warnings.warn(
            f"perplexity={perplexity} is not smaller than the number of "
            f"samples ({n_samples}); using {n_samples - 1} instead."
        )
        perplexity = n_samples - 1

    fps = smiles_to_fingerprints(smiles)
    dist_matrix = tanimoto_distance_matrix(fps)

    # init='random' because the input is a precomputed distance matrix
    # rather than a feature matrix.
    tsne = TSNE(metric='precomputed', perplexity=perplexity,
                random_state=random_state, max_iter=1000, init='random')
    tsne_result = tsne.fit_transform(dist_matrix)

    tsne_df = pd.DataFrame(tsne_result, columns=['x', 'y'])
    tsne_df['SMILES'] = smiles
    tsne_df['Name'] = names
    tsne_df[output_col] = values

    # Run DBSCAN clustering on t-SNE coords
    dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
    clusters = dbscan.fit_predict(tsne_result)
    tsne_df["Cluster"] = clusters

    fig = px.scatter(tsne_df, x='x', y='y',
                     color=output_col,
                     hover_data={
                         'Name': True,
                         'SMILES': True,
                         output_col: True,
                         # The raw embedding coordinates are hidden from the hover box.
                         'x': False,
                         'y': False
                     },
                     color_continuous_scale='Viridis',
                     title='Interactive t-SNE Map (Tanimoto Distance)')

    fig.update_traces(marker=dict(size=9, line=dict(width=0.5, color='DarkSlateGrey')))
    fig.update_layout(legend_title=output_col, hoverlabel=dict(bgcolor='white'))
    fig.show()

    return tsne_df


## Load dataset & run
### (Note: `input.csv` is read relative to the current working directory — adjust the path if needed)

In [None]:
# Load the input table from input.csv.
df = pd.read_csv("input.csv")

# Compute the embedding, show the plot, and keep the resulting table.
tsne_df = interactive_tsne_map(df, output_col='Inh Power', name_col='Inhibitor Name', smiles_col='SMILES')
