In [None]:
! pip install rdkit-pypi
! pip install pubchempy
! apt-get -qq install -y openbabel

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.Chem.ChemUtils import SDFToCSV


In [None]:
# Queries: the scaffolds searched for in every database, each parsed once from
# SMILES into an RDKit molecule and later passed as `query` to
# NewDataFrameFromSMILES. Chem.MolFromSmiles returns None for a SMILES string
# it cannot parse, so a typo here would produce a None query.
sulfonamide = Chem.MolFromSmiles('OC1=C2C=CC=CC2=C(S(=O)(N)=O)C=C1')
indolinone = Chem.MolFromSmiles('O=C1NC2=C(C=CC=C2)C1')
curcumin = Chem.MolFromSmiles('O=C(/C=C/C1=CC=C(O)C(OC)=C1)CC(/C=C/C2=CC(OC)=C(O)C=C2)=O')
flavanol = Chem.MolFromSmiles('OC1(C2=CC=CC=C2)CCC3=C(O1)C=CC=C3')
stilbene = Chem.MolFromSmiles('C1(/C=C/C2=CC=CC=C2)=CC=CC=C1')

In [None]:
# Functions
def NewDataFrameFromSMILES(DataFrame,mol_list,query):
  """Return the rows of ``DataFrame`` whose molecule contains ``query``.

  Args:
    DataFrame: pandas DataFrame with one row per molecule. It is not modified
      and may carry any index.
    mol_list: Parsed molecules aligned with the rows of ``DataFrame`` by
      position. An entry may be ``None`` (a SMILES string that failed to
      parse); such rows are left out of the result.
    query: Substructure pattern handed to ``GetSubstructMatch`` of each
      molecule.

  Returns:
    A new DataFrame with only the matching rows, in their original order and
    with a fresh 0..n-1 index. It is empty when nothing matches.

  Raises:
    ValueError: If ``query`` is ``None`` or if ``mol_list`` and ``DataFrame``
      do not have the same length.
  """
  if query is None:
    # A None query would otherwise look like "no hits" for every molecule.
    raise ValueError('query is None; the query SMILES probably failed to parse')
  if len(mol_list) != len(DataFrame):
    raise ValueError(
        'mol_list has %d entries but DataFrame has %d rows'
        % (len(mol_list), len(DataFrame)))

  # Use the molecules the caller already parsed instead of parsing every
  # SMILES string again; GetSubstructMatch returns an empty tuple on no match.
  matching_rows = [
      position for position, mol in enumerate(mol_list)
      if mol is not None and len(mol.GetSubstructMatch(query)) > 0
  ]

  # Select by position so the result does not depend on the index labels.
  return DataFrame.iloc[matching_rows].reset_index(drop=True)

# **PubChem**

## *(4-hydroxynaphthalen-1-yl)sulfonamide*

In [None]:
# Read the raw PubChem table for the sulfonamide scaffold from the project's
# GitHub repository.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PubChem_sulfonamide.csv'
df_PubChem_sulfonamide = pd.read_csv(url)
# Cell output: number of rows with a missing value in the 'isosmiles' column.
df_PubChem_sulfonamide['isosmiles'].isnull().sum()

0

In [None]:
# Keep only the structure column, renamed from 'isosmiles' to 'smiles', and
# write it to a local CSV without the row index.
new_df_PubChem_sulfonamide = (
    df_PubChem_sulfonamide[['isosmiles']]
    .rename(columns={'isosmiles': 'smiles'})
)
new_df_PubChem_sulfonamide.to_csv('PubChem_sulfonamide.csv', index=False)

## *Indolinone*

In [None]:
# Read the raw PubChem table for the indolinone scaffold from the project's
# GitHub repository.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PubChem_indolinone.csv'
df_PubChem_indolinone = pd.read_csv(url)
# Cell output: number of rows with a missing value in the 'isosmiles' column.
df_PubChem_indolinone['isosmiles'].isnull().sum()

0

In [None]:
# Keep only the structure column, renamed from 'isosmiles' to 'smiles', and
# write it to a local CSV without the row index.
new_df_PubChem_indolinone = (
    df_PubChem_indolinone[['isosmiles']]
    .rename(columns={'isosmiles': 'smiles'})
)
new_df_PubChem_indolinone.to_csv('PubChem_indolinone.csv', index=False)

## *Curcumin*

In [None]:
# Read the raw PubChem table for the curcumin scaffold from the project's
# GitHub repository.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PubChem_curcumin.csv'
df_PubChem_curcumin = pd.read_csv(url)
# Cell output: number of rows with a missing value in the 'isosmiles' column.
df_PubChem_curcumin['isosmiles'].isnull().sum()

0

In [None]:
# Keep only the structure column, renamed from 'isosmiles' to 'smiles', and
# write it to a local CSV without the row index.
new_df_PubChem_curcumin = (
    df_PubChem_curcumin[['isosmiles']]
    .rename(columns={'isosmiles': 'smiles'})
)
new_df_PubChem_curcumin.to_csv('PubChem_curcumin.csv', index=False)

## *Flavanol*

In [None]:
# Read the raw PubChem table for the flavanol scaffold from the project's
# GitHub repository.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PubChem_flavanol.csv'
df_PubChem_flavanol = pd.read_csv(url)
# Cell output: number of rows with a missing value in the 'isosmiles' column.
df_PubChem_flavanol['isosmiles'].isnull().sum()

0

In [None]:
# Keep only the structure column, renamed from 'isosmiles' to 'smiles', and
# write it to a local CSV without the row index.
new_df_PubChem_flavanol = (
    df_PubChem_flavanol[['isosmiles']]
    .rename(columns={'isosmiles': 'smiles'})
)
new_df_PubChem_flavanol.to_csv('PubChem_flavanol.csv', index=False)

## *Stilbene*

In [None]:
# Read the raw PubChem table for the stilbene scaffold from the project's
# GitHub repository.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PubChem_stilbene.csv'
df_PubChem_stilbene = pd.read_csv(url)
# Cell output: number of rows with a missing value in the 'isosmiles' column.
df_PubChem_stilbene['isosmiles'].isnull().sum()

0

In [None]:
# Keep only the structure column, renamed from 'isosmiles' to 'smiles', and
# write it to a local CSV without the row index.
new_df_PubChem_stilbene = (
    df_PubChem_stilbene[['isosmiles']]
    .rename(columns={'isosmiles': 'smiles'})
)
new_df_PubChem_stilbene.to_csv('PubChem_stilbene.csv', index=False)

# **DrugBank**

In [None]:
# Creation of a .csv file from a .sdf file
# Reads 'structures.sdf' from the working directory and writes 'DrugBank.csv'
# next to it. The supplier is created before the output file is opened so that
# a missing SDF does not leave an empty DrugBank.csv behind.
suppl = Chem.SDMolSupplier('structures.sdf')
# The context manager closes the CSV file even if the conversion raises.
with open('DrugBank.csv', 'w') as f:
  SDFToCSV.Convert(suppl, f)

In [None]:
# Load the DrugBank table from the project's GitHub repository and reduce it
# to a single 'smiles' column.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/DrugBank.csv'
df_DrugBank_raw = pd.read_csv(url)
# NOTE(review): pandas appends '.1' to a repeated header name, so 'SMILES.1'
# is presumably the second SMILES column of the file — confirm it is the
# intended one.
df_DrugBank = df_DrugBank_raw[['SMILES.1']].rename(columns={'SMILES.1': 'smiles'})

In [None]:
# Parse every DrugBank SMILES once. Chem.MolFromSmiles returns None for a
# string it cannot parse, so mol_list can contain None entries.
# The name mol_list is reassigned for each database further down, so the
# DrugBank filter cells must run while it still holds these molecules.
mol_list = [Chem.MolFromSmiles(x) for x in df_DrugBank['smiles']]

[05:52:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[05:52:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[05:52:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[05:52:51] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[05:52:51] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'
[05:52:52] Unusual charge on atom 0 number of radical electrons set to zero
[05:52:53] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=

## *(4-hydroxynaphthalen-1-yl)sulfonamide*

In [None]:
# DrugBank rows whose molecule contains the sulfonamide query.
new_df_DrugBank_sulfonamide = NewDataFrameFromSMILES(df_DrugBank,mol_list,sulfonamide)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_DrugBank_sulfonamide.size

[05:53:17] Unusual charge on atom 0 number of radical electrons set to zero


0

## *Indolinone*

In [None]:
# DrugBank rows whose molecule contains the indolinone query.
new_df_DrugBank_indolinone = NewDataFrameFromSMILES(df_DrugBank,mol_list,indolinone)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_DrugBank_indolinone.size

[05:53:27] Unusual charge on atom 0 number of radical electrons set to zero


59

In [None]:
# Save the DrugBank indolinone hits; index=False keeps the row index out of the file.
new_df_DrugBank_indolinone.to_csv('DrugBank_indolinone.csv', index=False)

## *Curcumin*

In [None]:
# DrugBank rows whose molecule contains the curcumin query.
new_df_DrugBank_curcumin = NewDataFrameFromSMILES(df_DrugBank,mol_list,curcumin)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_DrugBank_curcumin.size

[05:53:39] Unusual charge on atom 0 number of radical electrons set to zero


3

In [None]:
# Save the DrugBank curcumin hits; index=False keeps the row index out of the file.
new_df_DrugBank_curcumin.to_csv('DrugBank_curcumin.csv', index=False)

## *Flavanol*

In [None]:
# DrugBank rows whose molecule contains the flavanol query.
new_df_DrugBank_flavanol = NewDataFrameFromSMILES(df_DrugBank,mol_list,flavanol)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_DrugBank_flavanol.size

[05:54:42] Unusual charge on atom 0 number of radical electrons set to zero


0

## *Stilbene*

In [None]:
# DrugBank rows whose molecule contains the stilbene query.
new_df_DrugBank_stilbene = NewDataFrameFromSMILES(df_DrugBank,mol_list,stilbene)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_DrugBank_stilbene.size

[05:54:50] Unusual charge on atom 0 number of radical electrons set to zero


58

In [None]:
# Save the DrugBank stilbene hits; index=False keeps the row index out of the file.
new_df_DrugBank_stilbene.to_csv('DrugBank_stilbene.csv', index=False)

# **PeruNPDB**

In [None]:
# Load PeruNPDB from the project's GitHub repository. Passing names=['smiles']
# without a header argument makes pandas read the first line of the file as
# data rather than as a header.
# NOTE(review): assumes the file has no header line — confirm.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/PeruNPDB.csv'
df_PeruNPDB = pd.read_csv(url, names=['smiles'])
# Parse every SMILES once; Chem.MolFromSmiles returns None for a string it
# cannot parse. mol_list replaces the list built for the previous database.
mol_list = [Chem.MolFromSmiles(x) for x in df_PeruNPDB['smiles']]

[03:01:43] Explicit valence for atom # 2 C, 5, is greater than permitted


## *(4-hydroxynaphthalen-1-yl)sulfonamide*

In [None]:
# PeruNPDB rows whose molecule contains the sulfonamide query.
new_df_PeruNPDB_sulfonamide = NewDataFrameFromSMILES(df_PeruNPDB,mol_list,sulfonamide)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_PeruNPDB_sulfonamide.size

0

## *Indolinone*

In [None]:
# PeruNPDB rows whose molecule contains the indolinone query.
new_df_PeruNPDB_indolinone = NewDataFrameFromSMILES(df_PeruNPDB,mol_list,indolinone)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_PeruNPDB_indolinone.size

11

In [None]:
# Save the PeruNPDB indolinone hits; index=False keeps the row index out of the file.
new_df_PeruNPDB_indolinone.to_csv('PeruNPDB_indolinone.csv', index=False)

## *Curcumin*

In [None]:
# PeruNPDB rows whose molecule contains the curcumin query.
new_df_PeruNPDB_curcumin = NewDataFrameFromSMILES(df_PeruNPDB,mol_list,curcumin)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_PeruNPDB_curcumin.size

0

## *Flavanol*

In [None]:
# PeruNPDB rows whose molecule contains the flavanol query.
new_df_PeruNPDB_flavanol = NewDataFrameFromSMILES(df_PeruNPDB,mol_list,flavanol)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_PeruNPDB_flavanol.size

0

## *Stilbene*

In [None]:
# PeruNPDB rows whose molecule contains the stilbene query.
new_df_PeruNPDB_stilbene = NewDataFrameFromSMILES(df_PeruNPDB,mol_list,stilbene)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_PeruNPDB_stilbene.size

0

# **UNPD**

In [None]:
# Load UNPD from the project's GitHub repository; the column names come from
# the file's own header line and a 'smiles' column is read below.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/UNPD.csv'
df_UNPD = pd.read_csv(url)
# Parse every SMILES once; Chem.MolFromSmiles returns None for a string it
# cannot parse. mol_list replaces the list built for the previous database.
mol_list = [Chem.MolFromSmiles(x) for x in df_UNPD['smiles']]

## *(4-hydroxynaphthalen-1-yl)sulfonamide*

In [None]:
# UNPD rows whose molecule contains the sulfonamide query.
new_df_UNPD_sulfonamide = NewDataFrameFromSMILES(df_UNPD,mol_list,sulfonamide)
# DataFrame.size is rows x columns, so it equals the number of hits only if
# df_UNPD has a single column.
new_df_UNPD_sulfonamide.size

0

## *Indolinone*

In [None]:
# UNPD rows whose molecule contains the indolinone query.
new_df_UNPD_indolinone = NewDataFrameFromSMILES(df_UNPD,mol_list,indolinone)
# DataFrame.size is rows x columns, so it equals the number of hits only if
# df_UNPD has a single column.
new_df_UNPD_indolinone.size

546

In [None]:
# Save the UNPD indolinone hits; index=False keeps the row index out of the file.
new_df_UNPD_indolinone.to_csv('UNPD_indolinone.csv', index=False)

## *Curcumin*

In [None]:
# UNPD rows whose molecule contains the curcumin query.
new_df_UNPD_curcumin = NewDataFrameFromSMILES(df_UNPD,mol_list,curcumin)
# DataFrame.size is rows x columns, so it equals the number of hits only if
# df_UNPD has a single column.
new_df_UNPD_curcumin.size

11

In [None]:
# Save the UNPD curcumin hits; index=False keeps the row index out of the file.
new_df_UNPD_curcumin.to_csv('UNPD_curcumin.csv', index=False)

## *Flavanol*

In [None]:
# UNPD rows whose molecule contains the flavanol query.
new_df_UNPD_flavanol = NewDataFrameFromSMILES(df_UNPD,mol_list,flavanol)
# DataFrame.size is rows x columns, so it equals the number of hits only if
# df_UNPD has a single column.
new_df_UNPD_flavanol.size

272

In [None]:
# Save the UNPD flavanol hits; index=False keeps the row index out of the file.
new_df_UNPD_flavanol.to_csv('UNPD_flavanol.csv', index=False)

## *Stilbene*

In [None]:
# UNPD rows whose molecule contains the stilbene query.
new_df_UNPD_stilbene = NewDataFrameFromSMILES(df_UNPD,mol_list,stilbene)
# DataFrame.size is rows x columns, so it equals the number of hits only if
# df_UNPD has a single column.
new_df_UNPD_stilbene.size

1029

In [None]:
# Save the UNPD stilbene hits; index=False keeps the row index out of the file.
new_df_UNPD_stilbene.to_csv('UNPD_stilbene.csv', index=False)

# **NuBBE**

In [None]:
# Load NuBBE from the project's GitHub repository. The file begins with a
# header line (a run without header=0 tried to parse the literal text
# 'SMILES' as a molecule), so header=0 tells pandas to replace that line with
# names=['smiles'] instead of reading it as a data row.
url = 'https://raw.githubusercontent.com/cpariona/biomedical-thesis/main/data/raw/NuBBE.csv'
df_NuBBE = pd.read_csv(url, names=['smiles'], header=0)
# Parse every SMILES once; Chem.MolFromSmiles returns None for a string it
# cannot parse. mol_list replaces the list built for the previous database.
mol_list = [Chem.MolFromSmiles(x) for x in df_NuBBE['smiles']]

[12:46:39] SMILES Parse Error: syntax error while parsing: SMILES
[12:46:39] SMILES Parse Error: Failed parsing SMILES 'SMILES' for input: 'SMILES'


## *(4-hydroxynaphthalen-1-yl)sulfonamide*

In [None]:
# NuBBE rows whose molecule contains the sulfonamide query.
new_df_NuBBE_sulfonamide = NewDataFrameFromSMILES(df_NuBBE,mol_list,sulfonamide)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_NuBBE_sulfonamide.size

0

## *Indolinone*

In [None]:
# NuBBE rows whose molecule contains the indolinone query.
new_df_NuBBE_indolinone = NewDataFrameFromSMILES(df_NuBBE,mol_list,indolinone)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_NuBBE_indolinone.size

4

In [None]:
# Save the NuBBE indolinone hits; index=False keeps the row index out of the file.
new_df_NuBBE_indolinone.to_csv('NuBBE_indolinone.csv', index=False)

## *Curcumin*

In [None]:
# NuBBE rows whose molecule contains the curcumin query.
new_df_NuBBE_curcumin = NewDataFrameFromSMILES(df_NuBBE,mol_list,curcumin)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_NuBBE_curcumin.size

0

## *Flavanol*

In [None]:
# NuBBE rows whose molecule contains the flavanol query.
new_df_NuBBE_flavanol = NewDataFrameFromSMILES(df_NuBBE,mol_list,flavanol)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_NuBBE_flavanol.size

0

## *Stilbene*

In [None]:
# NuBBE rows whose molecule contains the stilbene query.
new_df_NuBBE_stilbene = NewDataFrameFromSMILES(df_NuBBE,mol_list,stilbene)
# DataFrame.size is rows x columns; 0 means no row matched.
new_df_NuBBE_stilbene.size

7

In [None]:
# Save the NuBBE stilbene hits; index=False keeps the row index out of the file.
new_df_NuBBE_stilbene.to_csv('NuBBE_stilbene.csv', index=False)