In [2]:
import pandas as pd

# Load the attribute table (TSV) so the compound names can be extracted.
file_path_tsv = 'Open-tggates_AllAttribute.tsv'

# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
# low_memory=False makes pandas parse the file in a single pass, so each
# column gets one consistent dtype instead of per-chunk guesses (the usual
# cause of the mixed-type DtypeWarning on large files).
data_tsv = pd.read_csv(
    file_path_tsv,
    sep='\t',
    encoding='ISO-8859-1',
    low_memory=False,
)

# Unique compound names in order of first appearance. Missing values are
# dropped first so NaN is never treated as a compound name downstream.
unique_compounds = data_tsv['COMPOUND_NAME'].dropna().unique()

# Display the unique compound names
unique_compounds


  data_tsv = pd.read_csv(file_path_tsv, sep='\t', encoding='ISO-8859-1')


array(['acetaminophen', 'isoniazid', 'carbon tetrachloride',
       'valproic acid', 'phenobarbital', 'rifampicin', 'phenylbutazone',
       'clofibrate', 'naphthyl isothiocyanate', 'allyl alcohol',
       'indomethacin', 'chlorpromazine', 'thioacetamide', 'omeprazole',
       'ethionine', 'aspirin', 'carbamazepine', 'diclofenac',
       'nitrofurantoin', 'benzbromarone', 'hexachlorobenzene', 'diazepam',
       'cyclophosphamide', 'methapyrilene', 'phenytoin', 'WY-14643',
       'gemfibrozil', 'bromobenzene', 'coumarin', 'allopurinol',
       'propylthiouracil', 'amiodarone', 'sulfasalazine', 'cimetidine',
       'perhexiline', 'azathioprine', 'ketoconazole', 'glibenclamide',
       'adapin', 'labetalol', 'methyltestosterone', 'haloperidol',
       'thioridazine', 'pemoline', 'chlormezanone', 'metformin',
       'fluphenazine', 'methimazole', 'monocrotaline', 'vitamin A',
       'griseofulvin', 'flutamide', 'tamoxifen', 'ethinylestradiol',
       'methyldopa', 'tetracycline', 'lomustin

In [5]:
from pubchempy import get_compounds
from tqdm import tqdm  # import tqdm (progress bar)
import pandas as pd

# Map each compound name to its SMILES string (None when nothing was found
# or the lookup failed).
compound_smiles = {}

# Names whose lookup raised, mapped to a short description of the error, so
# that failures are reported instead of being silently indistinguishable from
# "PubChem has no match".
failed_lookups = {}

# Query PubChem by name for every compound, one request per name.
for compound in tqdm(unique_compounds, desc="Fetching SMILES"):
    # Non-string entries (e.g. NaN from missing cells) cannot be looked up by
    # name; record them as missing without issuing a request.
    if not isinstance(compound, str):
        compound_smiles[compound] = None
        failed_lookups[compound] = "not a string, lookup skipped"
        continue

    try:
        compounds = get_compounds(compound, 'name')
        if compounds:
            # Keep the first match returned for this name.
            compound_smiles[compound] = compounds[0].isomeric_smiles
        else:
            compound_smiles[compound] = None  # No SMILES found
    except Exception as exc:
        # Best-effort: keep going, but remember why this name failed.
        compound_smiles[compound] = None
        failed_lookups[compound] = f"{type(exc).__name__}: {exc}"

# Report lookups that errored so they can be retried or fixed by hand.
if failed_lookups:
    print(f"{len(failed_lookups)} lookup(s) failed and were stored as None:")
    for failed_name, reason in failed_lookups.items():
        print(f"  {failed_name!r}: {reason}")

# Convert the dictionary to a two-column DataFrame for easier handling
smiles_df = pd.DataFrame(list(compound_smiles.items()), columns=['Compound', 'SMILES'])

# Save the DataFrame to a CSV file
smiles_df.to_csv('compound_smiles.csv', index=False)

# Display the first few entries of the DataFrame
print(smiles_df.head())


Fetching SMILES: 100%|██████████| 170/170 [02:49<00:00,  1.00it/s]

               Compound                             SMILES
0         acetaminophen              CC(=O)NC1=CC=C(C=C1)O
1             isoniazid                 C1=CN=CC=C1C(=O)NN
2  carbon tetrachloride                    C(Cl)(Cl)(Cl)Cl
3         valproic acid                    CCCC(CCC)C(=O)O
4         phenobarbital  CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2





In [6]:
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
import pandas as pd

# Example SMILES for the compound of interest (e.g., Chlorpromazine)
target_smiles = 'CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl'
target_mol = Chem.MolFromSmiles(target_smiles)
# MolFromSmiles returns None (rather than raising) when parsing fails, so
# fail early with a clear message instead of crashing inside FingerprintMol.
if target_mol is None:
    raise ValueError(f"Could not parse target SMILES: {target_smiles!r}")
target_fp = FingerprintMols.FingerprintMol(target_mol)

# Small hand-written example set to compare against the target.
# NOTE: this rebinds `smiles_df` and `compounds`, replacing the values set by
# the PubChem lookup cell; reload 'compound_smiles.csv' if the full table is
# needed again.
smiles_list = [ 'CC(C)CC1=CC=C(C=C1)C(C)C', 'CN(C)CCC2=CC=CC=C2', 'CN(C)CCOC(=O)C1=CC=CC=C1' ]
compounds = ['Compound A', 'Compound B', 'Compound C']

smiles_df = pd.DataFrame({'Compound': compounds, 'SMILES': smiles_list})

# Calculate fingerprint similarity between the target and each compound,
# using RDKit's default similarity metric.
similarities = []
for smiles in smiles_df['SMILES']:
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Missing or unparsable SMILES: keep the row but mark it as NaN so
        # one bad entry does not abort the whole comparison.
        similarities.append(float('nan'))
        continue
    fp = FingerprintMols.FingerprintMol(mol)
    similarity = DataStructs.FingerprintSimilarity(target_fp, fp)
    similarities.append(similarity)

smiles_df['Similarity'] = similarities

# Sort the compounds by similarity, most similar first (NaN rows sort last)
smiles_df = smiles_df.sort_values(by='Similarity', ascending=False)

# Display the top potential analogs
print(smiles_df)


     Compound                    SMILES  Similarity
0  Compound A  CC(C)CC1=CC=C(C=C1)C(C)C    0.408000
1  Compound B        CN(C)CCC2=CC=CC=C2    0.350806
2  Compound C  CN(C)CCOC(=O)C1=CC=CC=C1    0.350120
