# Prepare Input Features

Just create a DictEncoder class that takes a dict and provides some utility methods

In [None]:
# NOTE: Why do we need to precompute the features? Because features built from random walks
#   (and similar features) need to stay the same.
# NOTE: Just have a DictEncoder class that has a from_frame method that can take a dataframe and an index
# column and create the encoding dictionary
# NOTE: The dataset needs to exist as a join table (the responses) and independent features
#   that get registered with the datasets.
#   When we encode the dataset, the features are encoded along the join table.
#   The features should exist as singletons since they are dict based.
# NOTE: Can have different versions of the encoders for dict, dataframe, etc.
# NOTE: We want to keep things as dataframes where possible so we can reconstitute things for interpretation and analysis.

In [None]:
from __future__ import annotations

import pickle

import deepchem as dc
import pandas as pd
import polars as pl
import numpy as np
import typing as t

from pathlib import Path
from sklearn.impute import KNNImputer

from rdkit import Chem
from rdkit.Chem import AllChem as AllChem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.Chem.rdchem import Mol as RDKitMol
from rdkit.ML.Descriptors import MoleculeDescriptors

In [None]:
# Sources read by this notebook: the screening dataset and the gene-list folder.
dataset_folder = Path("../../../data") / "datasets" / "GDSCv2DepMap"
genelist_folder = Path("../../../data") / "genelists"

# Destination for the feature tables written below; created together with any
# missing parent directories, and left alone if it already exists.
input_folder = Path("../../../data") / "inputs" / "GDSCv2DepMap"
input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the drug screening dose-response labels from the dataset folder.
screen_data = pd.read_csv(
    dataset_folder.joinpath("ScreenDoseResponseLabels.csv")
)

# Trailing expression: the notebook renders the first rows as a quick preview.
screen_data.head()

In [None]:
# Distinct drug identifiers in order of first appearance; the cells below parse
# each one with Chem.MolFromSmiles, i.e. they are treated as SMILES strings.
smiles_strs = pd.unique(screen_data["drug_id"])

In [None]:
# Morgan fingerprints (radius 2) as 512-bit vectors.
#
# One row per entry of `smiles_strs`: the SMILES string itself followed by the
# individual fingerprint bits.
smiles_to_morgan_512 = pd.DataFrame(
    [
        [drug_smiles]
        + list(
            AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(drug_smiles), 2, nBits=512
            )
        )
        for drug_smiles in smiles_strs
    ]
)
# Key column first, then one column per bit position labelled "0" ... "511".
smiles_to_morgan_512.columns = ["drug_id"] + [str(bit) for bit in range(512)]

# smiles_to_morgan_512.to_csv(
#     input_folder / "DrugToMorganFingerprint512Bit.csv",
#     index=False,
# )

In [None]:
# Morgan fingerprints (radius 2) as 1024-bit vectors.
#
# One row per entry of `smiles_strs`: the SMILES string itself followed by the
# individual fingerprint bits.
smiles_to_morgan_1024 = pd.DataFrame(
    [
        [drug_smiles]
        + list(
            AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(drug_smiles), 2, nBits=1024
            )
        )
        for drug_smiles in smiles_strs
    ]
)
# Key column first, then one column per bit position labelled "0" ... "1023".
smiles_to_morgan_1024.columns = ["drug_id"] + [str(bit) for bit in range(1024)]

# smiles_to_morgan_1024.to_csv(
#     input_folder / "DrugToMorganFingerprint1024Bit.csv",
#     index=False,
# )

In [None]:
# Morgan fingerprints (radius 2) as 2048-bit vectors.
#
# One row per entry of `smiles_strs`: the SMILES string itself followed by the
# individual fingerprint bits.
smiles_to_morgan_2048 = pd.DataFrame(
    [
        [drug_smiles]
        + list(
            AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(drug_smiles), 2, nBits=2048
            )
        )
        for drug_smiles in smiles_strs
    ]
)
# Key column first, then one column per bit position labelled "0" ... "2047".
smiles_to_morgan_2048.columns = ["drug_id"] + [str(bit) for bit in range(2048)]

# smiles_to_morgan_2048.to_csv(
#     input_folder / "DrugToMorganFingerprint2048Bit.csv",
#     index=False,
# )

In [None]:
# conv mol features
#
# Maps every SMILES string in `smiles_strs` to the tuple
# (atom_features, deg_list, canon_adj_list) taken from the ConvMol object that
# DeepChem's ConvMolFeaturizer produces for that molecule.

# The featurizer takes no per-molecule configuration, so it is built once
# instead of once per loop iteration.
featurizer = dc.feat.graph_features.ConvMolFeaturizer()

smiles_to_graph_features = {}
for smiles in smiles_strs:
    mol = Chem.MolFromSmiles(smiles)
    mol_object = featurizer.featurize([mol])

    smiles_to_graph_features[smiles] = (
        mol_object[0].atom_features,
        mol_object[0].deg_list,
        mol_object[0].canon_adj_list,
    )

# check that there are no nan feature values
# Collecting the offending drugs (rather than taking np.max over per-drug NaN
# counts) also works when there are no drugs at all and makes a failure
# actionable.
drugs_with_nan = [
    drug_smiles
    for drug_smiles, (atom_features, _, _) in smiles_to_graph_features.items()
    if np.isnan(atom_features).any()
]
assert not drugs_with_nan, f"NaN atom features for: {drugs_with_nan}"

# with open(input_folder / "DrugToConvMolFeatures.pickle", "wb") as fh:
#     pickle.dump(smiles_to_graph_features, fh)

In [None]:
# RDKit molecule descriptors, one row per entry of `smiles_strs`.

# Names of all entries in RDKit's descriptor list, fed to a single calculator.
desc_list = [entry[0] for entry in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)

# Column labels are taken from the calculator itself.
desc_names = list(calc.GetDescriptorNames())

smiles_to_descriptors = pd.DataFrame(
    [
        [drug_smiles, *calc.CalcDescriptors(Chem.MolFromSmiles(drug_smiles))]
        for drug_smiles in smiles_strs
    ],
    columns=["drug_id"] + desc_names,
)

# Fill missing descriptor values from the 2 nearest rows; column 0 holds the
# drug_id key and is excluded from the imputation.
imputer = KNNImputer(n_neighbors=2)
smiles_to_descriptors.iloc[:, 1:] = imputer.fit_transform(
    smiles_to_descriptors.iloc[:, 1:]
)

smiles_to_descriptors.to_csv(
    input_folder / "DrugToMoleculeDescriptors.csv", index=False
)

In [None]:
# MACCS encoding
#
# One row per entry of `smiles_strs`: the SMILES string itself followed by the
# 167 positions of RDKit's MACCS key bit vector, stored as integer 0/1 values
# (the same representation the Morgan fingerprint tables use).

smiles_to_maccs = []
for smiles in smiles_strs:
    mol = Chem.MolFromSmiles(smiles)
    maccs = MACCSkeys.GenMACCSKeys(mol).ToBitString()
    # ToBitString() yields the characters "0"/"1"; convert them to ints so the
    # table holds numeric bits rather than strings.
    smiles_to_maccs.append([smiles, *map(int, maccs)])

smiles_to_maccs = pd.DataFrame(
    smiles_to_maccs, columns=["drug_id", *map(str, range(167))]
)

# Persist the MACCS table itself under the MACCS file name; writing the
# descriptor table here would silently store the wrong features.
smiles_to_maccs.to_csv(
    input_folder / "DrugToMACCSKeys.csv",
    index=False,
)

In [None]:
# Mordred features
# NOTE: Only required for tDNN

# from mordred import Calculator, descriptors
# from mordred.RingCount import RingCount

# # NOTE: these will need KNN imputer for NaN values

# def get_ringsizes(mol: RDKitMol) -> list[int]:
#     """Returns counts of macrolide rings for ring sizes from 3 to 100"""
#     rs = [RingCount(order=o)(mol) for o in range(3, 101)]
#     nonzero_inds = np.nonzero(rs)[0]
#     if len(nonzero_inds) > 0:
#         rs.extend([nonzero_inds[0], nonzero_inds[-1]])
#     else:
#         rs.extend([0, 0])


# calc = Calculator(descriptors, ignore_3D=True)

# # Just going to compute the descriptors for all drugs
# mols = [Chem.MolFromSmiles(s) for s in smiles_strs]
# descs = calc.pandas(mols, quiet=True)