## Importing Libraries



In [1]:
!pip install selfies



In [2]:
import selfies as sf
import pandas as pd

## Testing Encoding and Decoding on One SMILES Molecule

In [3]:
# Single SMILES string used to sanity-check the SELFIES encode/decode round trip.
original_smiles = "CC(C)O"

In [4]:
# Round trip: SMILES -> SELFIES with sf.encoder, then SELFIES -> SMILES with sf.decoder.
encoded_selfies = sf.encoder(original_smiles)
decoded_smiles = sf.decoder(encoded_selfies)

In [5]:
# Show the SELFIES encoding and the decoded SMILES, one per line.
print(encoded_selfies, decoded_smiles, sep="\n")

[C][C][Branch1][C][C][O]
CC(C)O


## Preprocessing the QM9 SMILES Dataset

In [6]:
# Load the raw SMILES table; the path is relative, so the CSV must be in the
# notebook's working directory. Later cells use its 'idx' and 'smiles' columns.
smiles_df = pd.read_csv("OriginalSMILES_QM9.csv")

In [7]:
# Discard the 'idx' column read from the CSV.
smiles_df = smiles_df.drop(columns='idx')

### Finding the Count of Each Element in Each Compound

In [8]:
# Count how often each element letter occurs in every SMILES string.
# The strings are lower-cased first so that a single count covers both the
# upper-case and the lower-case spelling of an atom symbol.
# NOTE(review): plain substring counting would also match two-letter symbols
# that contain c/n/o/f (e.g. "Cl", "Na") - confirm none occur in this dataset.
#
# Iterate over the column *values* rather than looking rows up by label with
# smiles_df['smiles'][i]: the label lookup raises KeyError as soon as the index
# is not exactly 0..n-1 (e.g. when this cell is re-run after rows were dropped).
molecules = [smiles.lower() for smiles in smiles_df['smiles']]

# One list per element, aligned with the row order of smiles_df.
carbons = [molecule.count('c') for molecule in molecules]
nitrogens = [molecule.count('n') for molecule in molecules]
oxygens = [molecule.count('o') for molecule in molecules]
fluorines = [molecule.count('f') for molecule in molecules]

In [9]:
# Attach the per-molecule element counts as four new columns.
smiles_df = smiles_df.assign(
    carbons=carbons,
    nitrogens=nitrogens,
    oxygens=oxygens,
    fluorines=fluorines,
)

In [10]:
# Display the frame, now including the per-element count columns.
smiles_df

Unnamed: 0,smiles,carbons,nitrogens,oxygens,fluorines
0,C,1,0,0,0
1,N,0,1,0,0
2,O,0,0,1,0
3,C#C,2,0,0,0
4,C#N,1,1,0,0
...,...,...,...,...,...
132035,C1C2C3C4C5OC14C5N23,7,1,1,0
132036,C1N2C3C2C2C4OC12C34,7,1,1,0
132037,C1N2C3C4C5C2C13CN45,7,2,0,0
132038,C1N2C3C4C5CC13C2C45,8,1,0,0


In [11]:
# Remove every molecule that contains at least one fluorine atom.
contains_fluorine = smiles_df['fluorines'] > 0
fluorinated_rows = smiles_df.loc[contains_fluorine].index
smiles_df = smiles_df.drop(index=fluorinated_rows)

In [12]:
# Display the frame after the fluorine filter.
smiles_df

Unnamed: 0,smiles,carbons,nitrogens,oxygens,fluorines
0,C,1,0,0,0
1,N,0,1,0,0
2,O,0,0,1,0
3,C#C,2,0,0,0
4,C#N,1,1,0,0
...,...,...,...,...,...
132035,C1C2C3C4C5OC14C5N23,7,1,1,0
132036,C1N2C3C2C2C4OC12C34,7,1,1,0
132037,C1N2C3C4C5C2C13CN45,7,2,0,0
132038,C1N2C3C4C5CC13C2C45,8,1,0,0


In [13]:
# The fluorine count was only needed for the row filter, so drop the column.
smiles_df = smiles_df.drop(columns=['fluorines'])

In [14]:
# Renumber the rows 0..n-1; drop=False keeps the previous row labels as an
# 'index' column.
smiles_df = smiles_df.reset_index(drop=False)

In [15]:
# Display the frame after reset_index; the former row labels appear in the 'index' column.
smiles_df

Unnamed: 0,index,smiles,carbons,nitrogens,oxygens
0,0,C,1,0,0
1,1,N,0,1,0
2,2,O,0,0,1
3,3,C#C,2,0,0
4,4,C#N,1,1,0
...,...,...,...,...,...
129880,132035,C1C2C3C4C5OC14C5N23,7,1,1
129881,132036,C1N2C3C2C2C4OC12C34,7,1,1
129882,132037,C1N2C3C4C5C2C13CN45,7,2,0
129883,132038,C1N2C3C4C5CC13C2C45,8,1,0


In [16]:
# Save the filtered SMILES table as a CSV file (row index included).
# The call is not bound to a name: to_csv returns None when it is given a
# path, so the previous `smiles_csv_data` variable never held any data.
smiles_df.to_csv('SMILES_QM9.csv', index=True)

### Finding the Maximum Count of Each Element

In [17]:
# Largest per-molecule atom count for each element, taken column-wise in one call.
element_maxima = smiles_df[['carbons', 'nitrogens', 'oxygens']].max()
carbon_max = element_maxima['carbons']
nitrogen_max = element_maxima['nitrogens']
oxygen_max = element_maxima['oxygens']

# Report the three maxima.
print(f'Max Carbons: {carbon_max}')
print(f'Max Nitrogens: {nitrogen_max}')
print(f'Max Oxygens: {oxygen_max}')

Max Carbons: 9
Max Nitrogens: 7
Max Oxygens: 5


## Converting the SMILES Dataset to a SELFIES Dataset

In [18]:
# Plain Python list of the SMILES strings, in row order.
smiles_dataset = smiles_df['smiles'].tolist()

In [19]:
# Encode every SMILES string as SELFIES, preserving order.
selfies_dataset = [sf.encoder(smiles) for smiles in smiles_dataset]

In [20]:
# Build the SELFIES table in one step: the encoded strings plus the element
# counts copied from smiles_df (both are in the same row order).
selfies_df = pd.DataFrame({
    'selfies': selfies_dataset,
    'carbons': smiles_df['carbons'].tolist(),
    'nitrogens': smiles_df['nitrogens'].tolist(),
    'oxygens': smiles_df['oxygens'].tolist(),
})

In [21]:
# Display the SELFIES frame with its element-count columns.
selfies_df

Unnamed: 0,selfies,carbons,nitrogens,oxygens
0,[C],1,0,0
1,[N],0,1,0
2,[O],0,0,1
3,[C][#C],2,0,0
4,[C][#N],1,1,0
...,...,...,...,...
129880,[C][C][C][C][C][O][C][Ring1][#Branch1][Ring1][...,7,1,1
129881,[C][N][C][C][Ring1][Ring1][C][C][O][C][Ring1][...,7,1,1
129882,[C][N][C][C][C][C][Ring1][Branch1][C][Ring1][#...,7,2,0
129883,[C][N][C][C][C][C][C][Ring1][#Branch1][Ring1][...,8,1,0


In [22]:
# Save the SELFIES table as a CSV file (row index included).
# The call is not bound to a name: to_csv returns None when it is given a
# path, so the previous `selfies_csv_data` variable never held any data.
selfies_df.to_csv('SELFIES_QM9.csv', index=True)