In [None]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator

from MACCS import *

In [None]:
# Define the shared helper function and configuration variables
def molsmile(smiles):
    """Parse SMILES strings into RDKit molecule objects.

    Args:
        smiles: Iterable of SMILES strings.

    Returns:
        A list holding the result of ``Chem.MolFromSmiles`` for each input,
        in input order. No filtering is done here: RDKit returns ``None``
        for a SMILES it cannot parse, so such entries are passed through
        unchanged and the output always has one element per input.
    """
    return [Chem.MolFromSmiles(smi) for smi in smiles]
    
# Prefix used in the names of the fingerprint CSV files.
t = 'phytochem'

# Morgan fingerprint settings: radii and bit-vector lengths to generate.
radius = [3]
bit_num = [2048]

# Every (radius, bit length) pairing, radius-major.
combined = [
    (rad, n_bits)
    for rad in radius
    for n_bits in bit_num
]

In [None]:
# Draw a reproducible random sample of 200 compounds from the KEGG
# phytochemical compound list.
df = pd.read_csv('phytochem_data/phytochem_compound_list.csv')
df = df.sample(n=200, random_state=42)  # fixed seed -> same 200 rows on every run
name_series = df.cmpdname
smile_series = df.isosmiles

# Make sure the output folders exist before writing: to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
for out_dir in ('phytochem_data/fingerprints', 'phytochem_data/smiles'):
    os.makedirs(out_dir, exist_ok=True)

# Persist the sampled compound names and SMILES, in sampled-row order.
name_series.to_csv('phytochem_data/fingerprints/phytochem_names.csv', index=False)
smile_series.to_csv('phytochem_data/smiles/phytochem_smiles.csv', index=False)

smiles_list = df.isosmiles.to_list()
mol_list = molsmile(smiles_list)  # one RDKit molecule per sampled SMILES

In [None]:
# Generate Fingerprints
# Atom-pair fingerprints: one row per molecule, one column per bit.
fpgen = AllChem.GetAtomPairGenerator()
pairfps = [fpgen.GetFingerprint(x) for x in mol_list]
pairfps_lists = [list(fp) for fp in pairfps]
# Size the header from the first fingerprint so this also works when
# mol_list holds a single molecule (all rows must share that length for
# the DataFrame below to build).
pairfps_name = [f'pairfps_bit_{i}' for i in range(len(pairfps[0]))]
pairfps_df = pd.DataFrame(pairfps_lists, columns=pairfps_name)
pairfps_df.to_csv(f'phytochem_data/fingerprints/{t}_pairfps.csv', index=False)

# ECFP (Morgan) fingerprints, one CSV per (radius, bit length) combination.
for r, b in combined:
    # Use the fingerprint-generator API rather than the deprecated
    # AllChem.GetMorganFingerprintAsBitVect; radius and bit length map to
    # the generator's `radius` and `fpSize` arguments.
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=r, fpSize=b)
    fingerprint = [morgan_gen.GetFingerprint(x) for x in mol_list]  # ECFP bit vectors
    fingerprint_lists = [list(fp) for fp in fingerprint]  # bit vectors -> lists of 0/1
    # The vector length is the requested bit count, so name the columns
    # from `b` instead of indexing into the fingerprint list.
    fingerprint_name = [f'ecfp_bit_{i}' for i in range(b)]
    fingerprint_df = pd.DataFrame(fingerprint_lists, columns=fingerprint_name)
    rad_name = r * 2  # file name carries twice the radius (radius 3 -> "ECFP6")
    fingerprint_df.to_csv(f'phytochem_data/fingerprints/{t}_ECFP{rad_name}_{b}.csv', index=False)

# RDKit fingerprints: one row per molecule, one column per bit.
fpgen = AllChem.GetRDKitFPGenerator()
rdk = [fpgen.GetFingerprint(x) for x in mol_list]
rdk_lists = [list(fp) for fp in rdk]
# Size the header from the first fingerprint so this also works when
# mol_list holds a single molecule (all rows must share that length for
# the DataFrame below to build).
rdk_name = [f'rdk_bit_{i}' for i in range(len(rdk[0]))]
rdk_df = pd.DataFrame(rdk_lists, columns=rdk_name)
rdk_df.to_csv(f'phytochem_data/fingerprints/{t}_rdk.csv', index=False)