# Data Preprocessing: Molecule Standardization, Deduplication and Feature Profile Calculation

In [None]:
## Imports

import os

import sys
sys.path.insert(0, '../assay_inspector')

from AI_MoleculeInfo import MoleculeInfo
from AI_MoleculeData import MoleculeData
from AI_FeaturesPreprocessing import FeaturesPreprocessing

In [None]:
import random
# Fix the seed of Python's standard-library `random` generator so that any
# draw made through that module is repeatable between runs of the notebook.
# NOTE(review): only the stdlib `random` module is seeded here; if the
# preprocessing relies on another RNG (e.g. NumPy) it is not covered — confirm.
seed = 42
random.seed(seed)

In [None]:
# Endpoint name -> label used as the file-name prefix of its datasets.
# The endpoint name is also the name of its sub-directory under ../data.
endpoints = {
    'half_life': 'logHL',
    'clearance': 'logCL',
}

# Feature profiles for which a dataset is generated.
features = ['ecfp4', 'rdkit_ecfp4']

# Feature profile name -> feature identifiers passed to the preprocessing.
# The 'rdkit' profile is defined here but is not listed in `features`.
features_dict = {
    'rdkit': [MoleculeInfo.FEAT_RDKIT_DESC],
    'ecfp4': [MoleculeInfo.FEAT_ECFP4],
    'rdkit_ecfp4': [MoleculeInfo.FEAT_RDKIT_DESC, MoleculeInfo.FEAT_ECFP4],
}

In [None]:
# Build one feature-annotated dataset per (endpoint, feature profile) pair.
for endpoint, label in endpoints.items():
    for feature in features:

        # Read the aggregated dataset of this endpoint (re-read on every pass
        # of the inner loop, once per feature profile).
        source_path = os.path.join(os.getcwd(), '..', 'data', endpoint, f'{label}_aggregated_dataset.tsv')
        data = MoleculeData(source=source_path)

        # Remove duplicated entries keyed on the 'inchikey' column.
        # The return value is discarded, so this presumably works in place.
        data._deduplicate(subset=['inchikey'], endpoint2task={endpoint: 'REGRESSION'})

        # Fit the feature preprocessing on the dataset and apply it
        preprocessing = FeaturesPreprocessing()
        feature_ids = features_dict[feature]
        data = preprocessing.fit_transform(data, features_ids=feature_ids, endpoint2task={endpoint: 'REGRESSION'})

        # Extract the molecule table with the requested columns and features
        data_df = data.DataFrame(features=feature_ids, columns=['inchikey', 'smiles', 'value', 'ref', 'endpoint'])

        # Write the result as a tab-separated file next to the input dataset
        output_path = os.path.join(os.getcwd(), '..', 'data', endpoint, f'{label}_{feature}_dataset.tsv')
        data_df.to_csv(output_path, sep='\t', index=False)