## Importing Libraries



In [1]:
# Install condacolab quietly, then call its installer so that conda-style package
# management is available in this runtime (a later cell uses `mamba`).
# NOTE(review): condacolab.install() is understood to restart the Colab kernel when it
# finishes — confirm, and continue from the next cell after the restart if so.
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [2]:
# Install RDKit from the conda-forge channel.
# NOTE(review): `mamba` is presumably provided by the condacolab setup cell — confirm.
!mamba install -c conda-forge rdkit


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.8.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: ['rdkit']

conda-forge/linux-64     Using cache
conda-forge/noarch       Using cache
pkgs/main/linux

In [3]:
# Remaining dependencies are installed with pip: selfies (used below to convert between
# SMILES and SELFIES strings) and pubchempy.
# NOTE(review): versions are not pinned, so a later re-run may pick up different releases.
!pip install selfies
!pip install pubchempy



In [4]:
import re

import pandas as pd
import pubchempy as pcp
import selfies as sf
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rmd

## Testing Encoding and Decoding on One SMILES Molecule

In [5]:
# Small example molecule, propan-2-ol (isopropanol), used for the encode/decode round trip.
original_smiles = "CC(C)O"

In [6]:
# Round trip: SMILES -> SELFIES with sf.encoder, then SELFIES -> SMILES with sf.decoder.
# The two results are printed in the next cell so they can be compared with the input.
encoded_selfies = sf.encoder(original_smiles)
decoded_smiles = sf.decoder(encoded_selfies)

In [7]:
# Show the SELFIES encoding and the SMILES decoded back from it, one per line.
print(encoded_selfies, decoded_smiles, sep='\n')

[C][C][Branch1][C][C][O]
CC(C)O


## Preprocessing QM9 SMILES Dataset

In [8]:
# Load the original QM9 SMILES table. The path is relative, so the CSV must be in the
# working directory. Later cells rely on it having an 'idx' and a 'smiles' column.
smiles_df = pd.read_csv("OriginalSMILES_QM9.csv")

In [9]:
# Discard the 'idx' column that came with the CSV; only the SMILES strings are needed.
smiles_df = smiles_df.drop(columns=['idx'])

In [10]:
# Preview the loaded table now that the 'idx' column has been removed.
smiles_df

Unnamed: 0,smiles
0,C
1,N
2,O
3,C#C
4,C#N
...,...
132035,C1C2C3C4C5OC14C5N23
132036,C1N2C3C2C2C4OC12C34
132037,C1N2C3C4C5C2C13CN45
132038,C1N2C3C4C5CC13C2C45


### Finding the Count of C, N, O, F in Each Compound

In [11]:
# Per-molecule element tallies, obtained by counting characters in the SMILES text.
# Lowercasing first means upper- and lowercase atom letters are counted together.
#
# NOTE: this is plain character counting. It is only valid while the dataset contains no
# element symbol that shares one of these letters (for example 'Cl' would be counted as
# a carbon), so re-check it before reusing the cell on a different dataset.
#
# The .str accessor works row by row regardless of the index labels, unlike a lookup such
# as smiles_df['smiles'][i], which only works while the index is exactly 0..n-1.
smiles_lowercase = smiles_df['smiles'].str.lower()

carbons = smiles_lowercase.str.count('c').tolist()
nitrogens = smiles_lowercase.str.count('n').tolist()
oxygens = smiles_lowercase.str.count('o').tolist()
fluorines = smiles_lowercase.str.count('f').tolist()

In [12]:
# Attach each per-molecule tally to the frame as its own column, in a fixed order.
for column_name, tallies in (
    ('carbons', carbons),
    ('nitrogens', nitrogens),
    ('oxygens', oxygens),
    ('fluorines', fluorines),
):
  smiles_df[column_name] = tallies

### Finding Count of H in Each Compound

#### Get Molecular Formula of SMILES Molecule

In [13]:
def _hydrogen_count_from_formula(formula):
  """Return the number of hydrogen atoms stated in a molecular formula string.

  The formula is searched for the element symbol 'H' followed by an optional run of
  digits: 'CH4' -> 4, 'CHN' -> 1 (no digits means a single atom), 'CF4' -> 0.

  An 'H' that is immediately followed by a lowercase letter is the start of a two-letter
  symbol (e.g. 'Hg', 'He') and is not treated as hydrogen. An 'H' at the very end of the
  string is handled as a single hydrogen rather than reading past the end.

  Parameters:
    formula: molecular formula text, e.g. the result of rmd.CalcMolFormula.

  Returns:
    The hydrogen count as an int (0 when the formula contains no hydrogen).
  """
  match = re.search(r'H(?![a-z])([0-9]*)', formula)
  if match is None:
    return 0
  digits = match.group(1)
  return int(digits) if digits else 1


hydrogens = [] # list to store number of hydrogens in each compound

# Iterate over the values directly so the loop does not depend on the index labels.
for smiles in smiles_df['smiles']:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # Fail with the offending input instead of an opaque error from CalcMolFormula.
    raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
  formula = rmd.CalcMolFormula(mol) # molecular formula of the parsed molecule
  hydrogens.append(_hydrogen_count_from_formula(formula))

smiles_df['hydrogens'] = hydrogens

In [14]:
# Preview the table with the per-element count columns added.
smiles_df

Unnamed: 0,smiles,carbons,nitrogens,oxygens,fluorines,hydrogens
0,C,1,0,0,0,4
1,N,0,1,0,0,3
2,O,0,0,1,0,2
3,C#C,2,0,0,0,2
4,C#N,1,1,0,0,1
...,...,...,...,...,...,...
132035,C1C2C3C4C5OC14C5N23,7,1,1,0,7
132036,C1N2C3C2C2C4OC12C34,7,1,1,0,7
132037,C1N2C3C4C5C2C13CN45,7,2,0,0,8
132038,C1N2C3C4C5CC13C2C45,8,1,0,0,9


In [15]:
# Drop all data points with at least one fluorine: select those rows with a boolean
# mask, then remove them by their index labels.
has_fluorine = smiles_df['fluorines'] > 0
smiles_df = smiles_df.drop(index=smiles_df.loc[has_fluorine].index)

In [16]:
# Preview the table after removing the fluorine-containing molecules.
smiles_df

Unnamed: 0,smiles,carbons,nitrogens,oxygens,fluorines,hydrogens
0,C,1,0,0,0,4
1,N,0,1,0,0,3
2,O,0,0,1,0,2
3,C#C,2,0,0,0,2
4,C#N,1,1,0,0,1
...,...,...,...,...,...,...
132035,C1C2C3C4C5OC14C5N23,7,1,1,0,7
132036,C1N2C3C2C2C4OC12C34,7,1,1,0,7
132037,C1N2C3C4C5C2C13CN45,7,2,0,0,8
132038,C1N2C3C4C5CC13C2C45,8,1,0,0,9


In [17]:
# Every remaining row has zero fluorines, so the column carries no information any more.
smiles_df = smiles_df.drop(columns=['fluorines'])

In [18]:
# Renumber the rows from 0 after the filtering step. With drop=False (the default) the
# previous labels are kept as a new 'index' column, which the next cell removes.
smiles_df = smiles_df.reset_index(drop=False)

In [19]:
# Remove the 'index' column holding the pre-filter row labels left by reset_index().
smiles_df = smiles_df.drop(columns=['index'])

In [20]:
# Preview the final SMILES table (fluorine column removed, rows renumbered).
smiles_df

Unnamed: 0,smiles,carbons,nitrogens,oxygens,hydrogens
0,C,1,0,0,4
1,N,0,1,0,3
2,O,0,0,1,2
3,C#C,2,0,0,2
4,C#N,1,1,0,1
...,...,...,...,...,...
129880,C1C2C3C4C5OC14C5N23,7,1,1,7
129881,C1N2C3C2C2C4OC12C34,7,1,1,7
129882,C1N2C3C4C5C2C13CN45,7,2,0,8
129883,C1N2C3C4C5CC13C2C45,8,1,0,9


In [21]:
# Write the processed SMILES table to disk; index=True also writes the row labels as the
# first CSV column.
# NOTE(review): DataFrame.to_csv returns None when given a file path, so smiles_csv_data
# does not hold the CSV contents — confirm nothing downstream relies on it.
smiles_csv_data = smiles_df.to_csv('SMILES_QM9.csv', index = True) # save updated dataframe as a csv file

### Finding Max of Each Element

In [22]:
# Largest per-molecule count of each element across the dataset.
carbon_max, nitrogen_max, oxygen_max, hydrogen_max = (
    smiles_df[column].max()
    for column in ('carbons', 'nitrogens', 'oxygens', 'hydrogens')
)

# A single print call with a newline separator emits one line per element.
print(
    f'Max Carbons: {carbon_max}',
    f'Max Nitrogens: {nitrogen_max}',
    f'Max Oxygens: {oxygen_max}',
    f'Max Hydrogens: {hydrogen_max}',
    sep='\n',
)

Max Carbons: 9
Max Nitrogens: 7
Max Oxygens: 5
Max Hydrogens: 20


## Converting SMILES Dataset to SELFIES Dataset

In [23]:
# Plain Python list of the SMILES strings, in row order.
smiles_dataset = smiles_df['smiles'].tolist()

In [24]:
# Encode every SMILES string as SELFIES, preserving the order of smiles_dataset.
selfies_dataset = [sf.encoder(smiles_string) for smiles_string in smiles_dataset]

In [25]:
# Build the SELFIES table: one 'selfies' column plus the element counts copied from
# smiles_df. The counts are passed as plain lists, so they are matched to the SELFIES
# strings by position rather than by index label.
selfies_df = pd.DataFrame(selfies_dataset, columns=['selfies']).assign(
    **{
        column: smiles_df[column].values.tolist()
        for column in ('carbons', 'nitrogens', 'oxygens', 'hydrogens')
    }
)

In [26]:
# Preview the SELFIES table with its element-count columns.
selfies_df

Unnamed: 0,selfies,carbons,nitrogens,oxygens,hydrogens
0,[C],1,0,0,4
1,[N],0,1,0,3
2,[O],0,0,1,2
3,[C][#C],2,0,0,2
4,[C][#N],1,1,0,1
...,...,...,...,...,...
129880,[C][C][C][C][C][O][C][Ring1][#Branch1][Ring1][...,7,1,1,7
129881,[C][N][C][C][Ring1][Ring1][C][C][O][C][Ring1][...,7,1,1,7
129882,[C][N][C][C][C][C][Ring1][Branch1][C][Ring1][#...,7,2,0,8
129883,[C][N][C][C][C][C][C][Ring1][#Branch1][Ring1][...,8,1,0,9


In [27]:
# Write the SELFIES table to disk; index=True also writes the row labels as the first
# CSV column.
# NOTE(review): DataFrame.to_csv returns None when given a file path, so
# selfies_csv_data does not hold the CSV contents — confirm nothing relies on it.
selfies_csv_data = selfies_df.to_csv('SELFIES_QM9.csv', index = True)