## OPTIONAL: Test that Vector Search (VS) retrieves results as expected
This check is not required for the app.

In [0]:
%pip install databricks-vectorsearch databricks-langchain rdkit mols2grid
# Restart the Python process so the packages installed above can be imported.
dbutils.library.restartPython()

In [0]:
%pip freeze
# The output above lists the installed packages and their versions.

In [0]:
from databricks.vector_search.client import VectorSearchClient
import pandas as pd
import numpy as np
import rdkit
from rdkit.Chem import MolFromSmiles, AllChem
import mols2grid
from IPython.display import display

In [0]:
# Names of the Vector Search endpoint and of the index queried by this check.
endpoint_name = "zinc_vs"
vs_index = "yen.qsar.zinc_vs"

In [0]:
# Create a client with no explicit arguments and look up the index by name.
client = VectorSearchClient()
index = client.get_index(index_name=vs_index)

In [0]:
def get_ecfp(mol: rdkit.Chem.rdchem.Mol, radius: int = 2, fpSize: int = 1024) -> np.ndarray:
    """Return the Morgan (ECFP-style) fingerprint of *mol* as a NumPy array.

    Args:
        mol: RDKit molecule to fingerprint, e.g. the result of
            ``MolFromSmiles``.
        radius: Radius passed to the Morgan fingerprint generator.
        fpSize: Fingerprint size passed to the Morgan fingerprint generator.

    Returns:
        The array produced by the generator's ``GetFingerprintAsNumPy``.

    Raises:
        TypeError: If ``mol`` is ``None``.
    """
    # MolFromSmiles returns None when it cannot parse its input; fail with a
    # readable message instead of an opaque binding-level signature error.
    if mol is None:
        raise TypeError(
            "mol must be an RDKit Mol, got None "
            "(MolFromSmiles returns None for SMILES it cannot parse)"
        )
    fpgen = AllChem.GetMorganGenerator(radius=radius, fpSize=fpSize)
    return fpgen.GetFingerprintAsNumPy(mol)

In [0]:
# Test molecule: furanylfentanyl, given as a SMILES string.
test_smiles = "O=C(C1=CC=CO1)N(C2=CC=CC=C2)C3CCN(CCC4=CC=CC=C4)CC3"
test_mol = MolFromSmiles(test_smiles)
# Fingerprint the test molecule with get_ecfp's default radius and size.
# NOTE(review): presumably these must match how the index's fingerprint
# column was computed - confirm against the index build.
test_embedding = get_ecfp(test_mol)
# Print the fingerprint as a plain Python list for a quick visual check.
print(test_embedding.tolist())

In [0]:
# Query the index with the test fingerprint and keep the top 3 hits.
# Columns to request, per index:
#   zinc_vs     -> ["zinc_id", "smiles", "mwt", "logp", "ecfp"]
#   drugbank_vs -> ["id", "name", "smiles", "molecular_weight", "ECFP"]
# Optional filter example (uses the drugbank_vs column names):
#   filters={"molecular_weight >": 250, "molecular_weight <=": 500}
results = index.similarity_search(
    num_results=3,
    columns=["zinc_id", "smiles", "mwt", "logp", "ecfp"],
    query_vector=test_embedding.tolist(),
)

In [0]:
# Column names, in order, as listed in the response manifest.
columns = [
    column_info["name"]
    for column_info in results["manifest"]["columns"]
]
columns

In [0]:
# Tabulate the hits using the manifest's column names.
# A response with no matches may not carry "data_array"; fall back to an
# empty list so the result is an empty frame rather than a KeyError.
results_df = pd.DataFrame(
    results["result"].get("data_array", []),
    columns=columns,
)
# Optional: add RDKit molecule objects alongside the SMILES strings.
# results_df["mol"] = results_df["smiles"].apply(MolFromSmiles)
results_df

In [0]:
# Draw the query molecule by itself.
mols2grid.display([test_mol])

In [0]:
# Draw the retrieved molecules from their SMILES column.
# Fields to show on the grid, per index:
#   zinc_vs     -> subset=["zinc_id", "score"]
#   drugbank_vs -> subset=["name", "score"], tooltip=["id", "molecular_weight"]
mols2grid.display(
    results_df,
    subset=["zinc_id", "score"],
    smiles_col="smiles",
)