In [43]:
import os
import pandas as pd
from rdkit import Chem

In [129]:
files = os.listdir()
files = [file for file in files if file.endswith('.csv')]
files

['AID_884_datatable_all.csv',
 'AID_883_datatable_all.csv',
 'AID_899_datatable_all.csv',
 'AID_891_datatable_all.csv',
 'AID_410_datatable_all.csv']

In [131]:
def preprocess(files):
    final_db = pd.DataFrame(columns = ['PUBCHEM_EXT_DATASOURCE_SMILES','PUBCHEM_ACTIVITY_OUTCOME','PUBCHEM_ACTIVITY_SCORE'])
    for file in files:
        data = pd.read_csv(file, skiprows=[1,2,3,4,5], low_memory=False)
        data = data[['PUBCHEM_EXT_DATASOURCE_SMILES','PUBCHEM_ACTIVITY_OUTCOME','PUBCHEM_ACTIVITY_SCORE']]
        final_db = pd.concat([final_db, data], ignore_index=True)
    return final_db

In [132]:
db = preprocess(files)

In [133]:
db = db[db['PUBCHEM_EXT_DATASOURCE_SMILES'].notna()]
db = db[db['PUBCHEM_ACTIVITY_OUTCOME']!='Inconclusive']

In [134]:
def simple_is_organic(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        simpleOrganicAtomQuery = Chem.MolFromSmarts('[!$([#1,#5,#6,#7,#8,#9,#15,#16,#17,#35,#53])]')
        simpleOrganicBondQuery = Chem.MolFromSmarts('[#6]-,=,#,:[#6]')
        hasCHQuery = Chem.MolFromSmarts('[C!H0]')
        return (not mol.HasSubstructMatch(simpleOrganicAtomQuery)) and mol.HasSubstructMatch(hasCHQuery) and mol.HasSubstructMatch(simpleOrganicBondQuery)
    except AttributeError:
        return False

In [135]:
db['is_organic'] = db['PUBCHEM_EXT_DATASOURCE_SMILES'].apply(simple_is_organic)

In [136]:
db['is_organic'].value_counts()

is_organic
True     42707
False     3390
Name: count, dtype: int64

In [137]:
db = db[db['is_organic']==True]

In [138]:
db['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

PUBCHEM_ACTIVITY_OUTCOME
Inactive    30385
Active      12322
Name: count, dtype: int64

In [151]:
db.drop_duplicates(subset='PUBCHEM_EXT_DATASOURCE_SMILES', keep="last", inplace=True)

In [153]:
db['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

PUBCHEM_ACTIVITY_OUTCOME
Inactive    6612
Active      4923
Name: count, dtype: int64

In [157]:
db.to_csv('final_data.csv')