# Extra Features from Featurizer

# Loading packages

In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from chemprop import data, utils
from chemprop.featurizers import MoleculeFeaturizer
from rdkit.Chem import rdFingerprintGenerator
from dataclasses import dataclass

# Change data inputs here

In [None]:
# Location of the CSV file that the following cells load with pandas
test_path = '../tests/data/regression.csv'
# Names of the CSV columns whose values are used as the targets (``ys``)
target_columns = ['logSolubility']

In [None]:
# Read the input CSV into a DataFrame
df_test = pd.read_csv(test_path)
# Bare expression on the last line of the cell so the notebook displays the DataFrame
df_test

In [None]:
# SMILES strings, kept as a pandas Series (one entry per row of the CSV)
smis = df_test['smiles']
# Target values as a 2-D NumPy array: one row per molecule, one column per target
ys = df_test[target_columns].to_numpy()

# Creating custom featurizers
Custom featurizers can be made by inheriting from the ```MoleculeFeaturizer``` class.
These featurizers must override the following methods:
- ```__len__(self)```
- ```__call__(self, mol: Chem.Mol)```

In [None]:
@dataclass
class MorganFingerprintMoleculeFeaturizer(MoleculeFeaturizer):
    """Molecule featurizer that produces a Morgan fingerprint via RDKit.

    The RDKit fingerprint generator is created once, when the instance is
    constructed, and reused for every molecule that is featurized. Because of
    this, changing ``fp_size`` or ``radius`` on an existing instance does not
    affect the generator; create a new instance instead.

    Parameters
    ----------
    fp_size : int, default=2048
        Size of the fingerprint, forwarded to RDKit as ``fpSize``. This is also
        the value reported by ``len(featurizer)``.
    radius : int, default=2
        Morgan radius, forwarded to RDKit as ``radius``.
    """

    fp_size: int = 2048
    # Exposed as a field (rather than hard-coded) so that fingerprints with a
    # different radius can be generated without redefining the class.
    radius: int = 2

    def __post_init__(self) -> None:
        """Build the RDKit Morgan generator from the configured fields."""
        # Not a dataclass field: it is derived state, so it stays out of the
        # generated ``__init__``, ``__repr__`` and ``__eq__``.
        self.mfpgen = rdFingerprintGenerator.GetMorganGenerator(
            radius=self.radius, fpSize=self.fp_size
        )

    def __len__(self) -> int:
        """Return the length of the feature vector (``fp_size``)."""
        return self.fp_size

    def __call__(self, mol: Chem.Mol) -> np.ndarray:
        """Featurize the molecule ``mol``.

        Parameters
        ----------
        mol : Chem.Mol
            The RDKit molecule to fingerprint.

        Returns
        -------
        np.ndarray
            The fingerprint exactly as returned by the generator's
            ``GetFingerprintAsNumPy`` (presumably of length ``fp_size``;
            confirm against the RDKit documentation).
        """
        return self.mfpgen.GetFingerprintAsNumPy(mol)

## Testing the featurizer

In [None]:
# Featurizer built with its default settings
mf = MorganFingerprintMoleculeFeaturizer()
# Parse the first SMILES string into a molecule, then featurize it
first_mol = utils.make_mol(smis[0], keep_h=False, add_h=False)
morgan = mf(first_mol)
# Display the shape of the feature vector together with its values
morgan.shape, morgan

## Loading featurizers into datapoints

In [None]:
# List of all featurizers that will generate the extra features. This is
# separate from the main featurizer supplied to molecule datasets.
#
# Any number of molecule featurizers can be supplied to each datapoint in a
# dataset. Note that pre-obtained extra features (as shown in the loaded
# molecule features notebook) cannot be added at the same time as featurizers;
# an attempt to do so will result in an error.
mfs = [MorganFingerprintMoleculeFeaturizer()]

# One datapoint per (SMILES, target row) pair, each carrying the featurizers above
all_data = [
    data.MoleculeDatapoint.from_smi(smiles, y=target, mfs=mfs)
    for smiles, target in zip(smis, ys)
]
all_data[:5]