## Importing Libraries



In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [2]:
!mamba install -c conda-forge rdkit


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.8.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: ['rdkit']

conda-forge/linux-64     Using cache
conda-forge/noarch       Using cache
pkgs/main/linux

In [3]:
!pip install selfies
!pip install pubchempy



In [4]:
import selfies as sf
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rmd

## Testing Encoding and Decoding on One Smiles Molecule

In [5]:
original_smiles = "Clc1c(Cl)c(Cl)c(c(Cl)c1Cl)c2c(Cl)c(Cl)c(Cl)c(Cl)c2Cl"

In [6]:
encoded_selfies = sf.encoder(original_smiles)
decoded_smiles = sf.decoder(encoded_selfies)

In [7]:
print(encoded_selfies)
print(decoded_smiles)

[Cl][C][=C][Branch1][C][Cl][C][Branch1][C][Cl][=C][Branch1][=Branch2][C][Branch1][C][Cl][=C][Ring1][=Branch2][Cl][C][=C][Branch1][C][Cl][C][Branch1][C][Cl][=C][Branch1][C][Cl][C][Branch1][C][Cl][=C][Ring1][#Branch2][Cl]
ClC1=C(Cl)C(Cl)=C(C(Cl)=C1Cl)C2=C(Cl)C(Cl)=C(Cl)C(Cl)=C2Cl


## Preprocessing QM9 Smiles Dataset

In [8]:
#smiles_df = pd.read_csv("OriginalSMILES_QM9.csv")
smiles_df = pd.read_csv("smiles_data.csv")

In [9]:
smiles_df = smiles_df.drop('Unnamed: 0', axis=1)

In [10]:
smiles_df

Unnamed: 0,SMILES
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,O=C1Nc2cccc3cccc1c23
2,Clc1ccc(C=O)cc1
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...
...,...
9977,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C
9978,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...
9979,c1(cc(ccc1C(C)C)C)O
9980,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...


### Finding the Count of C, N, O, F in Each Compound

In [11]:
carbons = [] # lists to store number of each element in each compound
nitrogens = []
oxygens = []
fluorines = []

for i in range(len(smiles_df)): # goes through each molecule, converts to all lowercase, and appends count of each element
  smiles_df['SMILES'][i] = smiles_df['SMILES'][i].strip()
  molecule = smiles_df['SMILES'][i].lower()
  carbons.append(molecule.count('c'))
  nitrogens.append(molecule.count('n'))
  oxygens.append(molecule.count('o'))
  fluorines.append(molecule.count('f'))

In [12]:
smiles_df['carbons'] = carbons # add count to dataframe
smiles_df['nitrogens'] = nitrogens
smiles_df['oxygens'] = oxygens
smiles_df['fluorines'] = fluorines

### Finding Count of H in Each Compound

#### Get Molecular Formula of SMILES Molecule

In [13]:
hydrogens = [] # list to store number of hydrogrens in each compound

for i in range(len(smiles_df)): # goes through each molecule and gets molecular formula
  smiles = smiles_df['SMILES'][i]
  compound = Chem.MolFromSmiles(smiles) # finds the molecular formula of the smiles
  compound = rmd.CalcMolFormula(compound)
  
  countH = 0
  for i in range(len(compound)):
    if compound[i] == 'H':
      idx = 1
      countStr = ''
      while (ord(compound[i + idx]) >= 48 and ord(compound[i + idx]) <= 57):
        countStr += compound[i + idx]
        if ((i + idx + 1) == len(compound)):
          break
        idx += 1
      if (countStr == ''):
        countH = 1
      else:
        countH = int(countStr)
      break
  hydrogens.append(countH)

smiles_df['hydrogens'] = hydrogens

In [14]:
smiles_df

Unnamed: 0,SMILES,carbons,nitrogens,oxygens,fluorines,hydrogens
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,21,1,0,0,46
1,O=C1Nc2cccc3cccc1c23,11,1,1,0,7
2,Clc1ccc(C=O)cc1,8,0,1,0,5
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,46,1,6,0,42
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,25,2,4,0,30
...,...,...,...,...,...,...
9977,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,15,2,2,0,24
9978,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,22,2,8,0,24
9979,c1(cc(ccc1C(C)C)C)O,10,0,1,0,14
9980,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,27,2,4,0,38


In [15]:
smiles_df = smiles_df.drop(smiles_df[smiles_df.fluorines > 0].index) # drop all data points with atleast one fluorine

In [16]:
smiles_df

Unnamed: 0,SMILES,carbons,nitrogens,oxygens,fluorines,hydrogens
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,21,1,0,0,46
1,O=C1Nc2cccc3cccc1c23,11,1,1,0,7
2,Clc1ccc(C=O)cc1,8,0,1,0,5
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,46,1,6,0,42
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,25,2,4,0,30
...,...,...,...,...,...,...
9977,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,15,2,2,0,24
9978,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,22,2,8,0,24
9979,c1(cc(ccc1C(C)C)C)O,10,0,1,0,14
9980,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,27,2,4,0,38


In [17]:
smiles_df = smiles_df.drop(columns='fluorines')

In [18]:
smiles_df = smiles_df.reset_index()

In [19]:
smiles_df = smiles_df.drop('index', axis=1)

In [20]:
smiles_df

Unnamed: 0,SMILES,carbons,nitrogens,oxygens,hydrogens
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,21,1,0,46
1,O=C1Nc2cccc3cccc1c23,11,1,1,7
2,Clc1ccc(C=O)cc1,8,0,1,5
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,46,1,6,42
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,25,2,4,30
...,...,...,...,...,...
9467,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,15,2,2,24
9468,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,22,2,8,24
9469,c1(cc(ccc1C(C)C)C)O,10,0,1,14
9470,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,27,2,4,38


In [21]:
smiles_csv_data = smiles_df.to_csv('SMILES_data.csv', index = True) # save updated dataframe as a csv file

### Finding Max of Each Element

In [22]:
carbon_max = smiles_df['carbons'].max()
nitrogen_max = smiles_df['nitrogens'].max()
oxygen_max = smiles_df['oxygens'].max()
hydrogen_max = smiles_df['hydrogens'].max()

print(f'Max Carbons: {carbon_max}')
print(f'Max Nitrogens: {nitrogen_max}')
print(f'Max Oxygens: {oxygen_max}')
print(f'Max Hydrogens: {hydrogen_max}')

Max Carbons: 299
Max Nitrogens: 32
Max Oxygens: 89
Max Hydrogens: 336


## Coverting Smiles Dataset to Selfies Dataset

In [23]:
smiles_dataset = smiles_df['SMILES'].values.tolist()

In [24]:
smiles_dataset[0:10]

['[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C',
 'O=C1Nc2cccc3cccc1c23',
 'Clc1ccc(C=O)cc1',
 '[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)C([O-])=O.CC(c4ccccc4)c5cc(C(C)c6ccccc6)c(O)c(c5)C([O-])=O',
 'C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO6)cc3',
 'Cc1cccc(C=C)c1',
 'CCC1CCC(CCC(O)=O)C1',
 'CC12CC(O)C3C(CCC4=CC(=O)C=CC34C)C1CC(O)C2(O)C(=O)CO',
 'O=C(OCCCOCCCOC(=O)c1ccccc1)c2ccccc2',
 'CN(C)c1ccc(C=O)cc1']

In [25]:
sf.set_semantic_constraints(bond_constraints='hypervalent')

In [26]:
selfies_dataset = list(map(sf.encoder, smiles_dataset))

In [27]:
selfies_df = pd.DataFrame(selfies_dataset, columns = ['selfies'])
selfies_df['carbons'] = smiles_df['carbons'].values.tolist() # add count to dataframe
selfies_df['nitrogens'] = smiles_df['nitrogens'].values.tolist()
selfies_df['oxygens'] = smiles_df['oxygens'].values.tolist()
selfies_df['hydrogens'] = smiles_df['hydrogens'].values.tolist()

In [28]:
selfies_df

Unnamed: 0,selfies,carbons,nitrogens,oxygens,hydrogens
0,[Br-1].[C][C][C][C][C][C][C][C][C][C][C][C][C]...,21,1,0,46
1,[O][=C][N][C][=C][C][=C][C][=C][C][=C][C][Ring...,11,1,1,7
2,[Cl][C][=C][C][=C][Branch1][Ring1][C][=O][C][=...,8,0,1,5
3,[Zn+2].[C][C][Branch1][=Branch2][C][=C][C][=C]...,46,1,6,42
4,[C][O][C][Ring1][Ring1][C][N][Branch1][#Branch...,25,2,4,30
...,...,...,...,...,...
9467,[C][Branch1][S][C][=C][C][=C][Branch1][Branch1...,15,2,2,24
9468,[O][C][=C][Branch2][Branch1][N][C][Branch2][Br...,22,2,8,24
9469,[C][=Branch1][S][=C][C][=Branch1][O][=C][C][=C...,10,0,1,14
9470,[C][O][C][=C][C][=C][Branch2][Ring2][=Branch1]...,27,2,4,38


In [29]:
selfies_csv_data = selfies_df.to_csv('SELFIES_data.csv', index = True)