# Cleaning and standardizing all data
#### Author: Ellinor Samuelsson Hoppe

In [None]:
import warnings
warnings.filterwarnings('ignore', category = RuntimeWarning)

import re

import pandas as pd
import numpy as np

import cirpy

import rdkit
from rdkit import Chem, rdBase, RDLogger, DataStructs
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem import AllChem, Draw, inchi, rdDepictor, PandasTools, SaltRemover
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from rdkit.Chem.MolStandardize import rdMolStandardize

import pickle

In [None]:
# Tox21 data: read the tab-separated table, then recode every -999999 entry as NaN
tox21 = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Tox21/Tox21_final_7691_compounds.tsv', sep='\t')
tox21 = tox21.replace(-999999, np.nan)

In [None]:
# SIRIUS training data: read both ion-mode files, tag each with its mode, then stack them
sirius_neg = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/SIRIUS training set/SIRIUS_5_training_compounds_negative.txt', sep='\t').assign(ionmode='negative')
sirius_pos = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/SIRIUS training set/SIRIUS_5_training_compounds_positive.txt', sep='\t').assign(ionmode='positive')
sirius = pd.concat([sirius_neg, sirius_pos])

In [None]:
sirius.head()

In [None]:
#APCI data collected from MassBank, MoNA and GNPS
apci = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/APCI data/241203_APCI_Unique_Chemicals.csv', sep = ';')
# Removing GNPS data. The .copy() makes the filtered result an independent frame,
# so the later column assignment (apci['SMILES'] = ...) does not write to a slice
# of the original table.
apci = apci[apci['DataBank'] != 'GNPS'].copy()

#Iris dataset
#iris_apci = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/MS data - databases/APCI/Iris_data_SMILES_used_apci.csv')


#KLARA data from Gordian
klara = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/KLARA/KLARA_chemicals_2024-06-24_std_smiles_std_inchikeys_logP_molecular_formula_molecular_mass_C_number_N_number.csv', sep=',')

#Isabelles data
isabelle = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/Isabelles_chemicals.csv', sep=';')

### Cleaning of data (Code adapted from Ida Rahu; https://github.com/kruvelab/NTS_LC_HRMS/blob/main/NTS_review.ipynb)

In [None]:
#Identifying disconnected structures, eliminating inorganic ions and solvent molecules, and 
# neutralizing remaining ions, while also removing information about stereochemistry

def standardize_mol(data):
  """Standardize the structures in ``data`` and derive SMILES/InChIKey columns.

  ``data`` is modified in place and the same object is returned.  Steps, in order:

  1. Build the ``ROMol`` column from the ``SMILES`` column.
  2. Strip counter-ions / solvent fragments, one SMARTS definition at a time.
  3. Neutralize with ``rdMolStandardize.Uncharger`` and then atom by atom.
  4. Remove stereochemistry from every molecule.
  5. Rewrite ``SMILES`` from the cleaned molecule and add ``InChIKey`` and
     ``InChIKey14`` (the part of the InChIKey before the first ``-``).

  No rows are dropped or deduplicated here; callers must do that themselves.

  Args:
    data: DataFrame with a ``SMILES`` column.

  Returns:
    The same DataFrame, with ``ROMol``, ``SMILES``, ``InChIKey`` and
    ``InChIKey14`` set.
  """
  # Gets mol objects from SMILES (assumes the notations are in column SMILES)
  PandasTools.AddMoleculeColumnToFrame(data, 'SMILES', 'ROMol')

  # Fragments to strip.  Every entry must be followed by a comma: adjacent
  # string literals are silently concatenated by Python, which previously fused
  # the hexafluorophosphate and triflate patterns into one pattern.
  parts2remove = [
      '[F,Cl,Br,I]', '[Na,Mg,K,Ca,Li,Ba]', 'CC(=O)O', '[O,N]', 'CS(=O)(=O)O',
      'O=S(=O)(O)O', 'O=[N+]([O-])O', 'O=S(=O)(O)CCO',
      'F[P-](F)(F)(F)(F)F', 'O=S(=O)([O-])C(F)(F)F', 'F[B-](F)(F)F',
      '[Co,Pd,Ni,Al,Sn,Zn,Cu,Hg]',  # additional ions to remove
  ]

  for part in parts2remove:
    # One remover per definition, reused for every molecule in the column
    remover = SaltRemover.SaltRemover(defnData=part)
    data['ROMol'] = data.ROMol.apply(remover.StripMol)

  uncharger = rdMolStandardize.Uncharger()  # neutralize the molecule (if possible)
  data['ROMol'] = data.ROMol.apply(uncharger.uncharge)

  # +1 atoms with at least one implicit H and no negatively charged neighbour,
  # or -1 atoms with no positively charged neighbour
  pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")

  def neutralize_atoms(mol):
    """Set matched atoms to charge 0 and adjust their H count; return ``mol``."""
    try:
      for match in mol.GetSubstructMatches(pattern):
        atom = mol.GetAtomWithIdx(match[0])
        chg = atom.GetFormalCharge()
        hcount = atom.GetTotalNumHs()
        atom.SetFormalCharge(0)
        atom.SetNumExplicitHs(hcount - chg)
        atom.UpdatePropertyCache()
    except Exception:
      # Best effort: keep the molecule as it is when neutralization fails.
      # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
      pass
    return mol

  data['ROMol'] = data.ROMol.apply(neutralize_atoms)
  for mol in data.ROMol:
    Chem.RemoveStereochemistry(mol)  # modifies the molecule in place
  data['SMILES'] = data.ROMol.apply(Chem.MolToSmiles)
  data['InChIKey'] = data.ROMol.apply(inchi.MolToInchiKey)
  data['InChIKey14'] = data.InChIKey.apply(lambda key: key.split('-')[0])
  return data


In [None]:
def inchi_to_smiles(inchi):
  """Convert an InChI string to a SMILES string via RDKit.

  Returns ``None`` when the InChI cannot be parsed or the conversion raises
  (for example for missing values).

  NOTE: inside this function the parameter name hides the ``inchi`` module
  imported from ``rdkit.Chem``; the name is kept so existing callers still work.
  """
  try:
    mol = Chem.MolFromInchi(inchi)
    if mol is None:  # parse failure: nothing to convert
      return None
    return Chem.MolToSmiles(mol)
  except Exception:
    # Exception rather than a bare except, so Ctrl-C during a long
    # .apply() is not silently turned into a None result.
    return None

In [None]:
# Parse every InChI into a molecule object and store it in the ROMol column
sirius['ROMol'] = sirius['InChI'].apply(Chem.MolFromInchi)

In [None]:
# standardize_mol reads the SMILES column, modifies `sirius` in place and returns that same frame
sirius_std = standardize_mol(sirius)

In [None]:
#Standardize compounds in tox21 dataset
# NOTE: standardize_mol does not drop duplicate rows; deduplicate separately if needed.
tox21_std = standardize_mol(tox21)

In [None]:
#Standardize compounds in APCI data from MassBank and MoNA
# SMILES are derived from the InChI column first (None where the conversion fails).
# NOTE: standardize_mol does not drop duplicate rows.
apci['SMILES'] = apci['InChI'].apply(inchi_to_smiles)
apci_std = standardize_mol(apci)


In [None]:
#Standardize compounds from Iris dataset
# Disabled: `iris_apci` is never created because its read_csv line in the data-loading
# cell is commented out, so this call raised NameError. Re-enable both lines together.
#iris_std = standardize_mol(iris_apci)

In [None]:
# Standardize compounds from Isabelles dataset (standardize_mol does not drop duplicate rows)
# NOTE(review): the original comment here ("Make sure to run in") was left unfinished;
# complete it or remove it.
isabelle_std = standardize_mol(isabelle)

In [None]:
def filter_smiles(smiles):
    """Collapse a SMILES made of identical '.'-separated fragments.

    The input is converted with ``str()`` and split on '.'.  When every fragment
    is the same string, that fragment is returned; otherwise ``None``.
    """
    fragments = str(smiles).split('.')
    # str.split always yields at least one item, so one distinct value means "all equal"
    if len(set(fragments)) == 1:
        return fragments[0]
    return None

In [None]:
#KLARA data cleaned: unify the SMILES column name, then drop rows without SMILES
# and rows repeating an already seen SMILES
klara = (
    klara
    .rename(columns={'Smiles': 'SMILES'})
    .dropna(subset=['SMILES'])
    .drop_duplicates(subset='SMILES')
)

# Add the ROMol column to the klara dataframe, built from the SMILES column
PandasTools.AddMoleculeColumnToFrame(klara, 'SMILES', 'ROMol')

def contains_carbon(mol):
    """Tell whether ``mol`` has at least one carbon atom (atomic number 6).

    Returns ``True``/``False`` for a molecule, or ``np.nan`` when the atoms
    cannot be read (for example when ``mol`` is ``None``).
    """
    try:
        return any(atom.GetAtomicNum() == 6 for atom in mol.GetAtoms())
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed
        return np.nan

klara['contains_carbon'] = klara.ROMol.apply(lambda x: contains_carbon(x)) #Boolean column based on whether the compound contains carbon or not
klara = klara[klara.contains_carbon == True].reset_index(drop=True) #Remove compounds that do not contain carbon (NaN results are removed too)

# standardize_mol returns the frame it was given, so klara_std and klara are the same
# object until klara_std is rebound two lines below. It does not drop duplicate rows.
klara_std = standardize_mol(klara) #Standardize compounds in KLARA dataset
klara_std.drop(columns = 'contains_carbon', inplace = True) #Drop column contains_carbon from klara dataframe
klara_std = klara_std.drop(klara[klara['InChIKey']==''].index) #Drop rows with an empty InChIKey

# NOTE(review): filter_smiles is defined several times in this notebook; the variant in
# scope decides whether mixtures become None or keep their original SMILES here.
klara_std['SMILES'] = klara_std['SMILES'].apply(lambda x: filter_smiles(x)) #Collapse SMILES whose fragments are all identical

klara_std.drop(columns = ['split_SMILES', 'filter', 'std_smiles','std_inchikey', 'logP', 'C_number', 'N_number', 'Streckkod', 'Molekylvikt', 'Kommentar'], inplace = True) #drop unnecessary columns


In [None]:
#Save all standardizations into csv files (tab-separated, without the index)
# NOTE: only the Isabelle export is active; the other exports are commented out.
#tox21_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/Tox21_chemicals_STD.csv', index = False, sep = '\t')
#apci_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/APCI_chemicals_STD.csv', index = False, sep = '\t')
#iris_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/Iris_chemicals_STD.csv', index = False, sep = '\t')
isabelle_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/Isabelles_chemicals_STD.csv', index = False, sep = '\t')
#klara_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/KLARA_chemicals_STD.csv', index = False, sep = '\t')


# EI data from MassBank, MoNA and GNPS

In [None]:
# Import files
# NOTE: the separators differ: MassBank is tab-separated, MoNA and GNPS use ';'.
MassBank = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/MS data - databases/EI/MassBank/2024-12-20_All_EI_Compounds_MassBank.csv', sep = '\t')
MoNA = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/MS data - databases/MoNA/2024-12-18_MoNA_EI_data.csv', sep = ';')
GNPS = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/MS data - databases/GNPS/2024-12-18_GNPS_EI_data.csv', sep = ';')

In [None]:
# MoNA: rotate three column labels
# (molecular_formula -> smiles, SMILES -> cas, cas -> molecular_formula)
MoNA = MoNA.rename(columns={'molecular_formula': 'smiles', 'SMILES': 'cas', 'cas': 'molecular_formula'})
# GNPS: rename Smiles to SMILES
GNPS.rename(columns={'Smiles': 'SMILES'}, inplace=True)

In [None]:
# Tag every table with the name of the data bank it came from
# (GNPS: only one datapoint in the GNPS data)
for frame, label in ((MassBank, 'MassBank'), (MoNA, 'MoNA'), (GNPS, 'GNPS')):
    frame['DataBank'] = label

In [None]:
# Remove NA values from InChI columns.
# dropna(..., inplace=True) modifies the frame and returns None, so its result must
# not be assigned; the *_filter frames are created in the drop_duplicates step.
MassBank.dropna(subset = ['InChI'], inplace = True)

MoNA.dropna(subset = ['InChI'], inplace = True)

In [None]:
# Drop duplicates based on InChI.
# .copy() gives independent frames, so adding the SMILES column afterwards does not
# trigger pandas' chained-assignment (SettingWithCopy) warning.
MassBank_filter = MassBank.drop_duplicates(subset = 'InChI').copy()

MoNA_filter = MoNA.drop_duplicates(subset = 'InChI').copy()

In [None]:
# Derive a SMILES column from InChI for MassBank and MoNA
# (inchi_to_smiles gives None where the conversion fails)
MassBank_filter['SMILES'] = MassBank_filter['InChI'].apply(inchi_to_smiles)

MoNA_filter['SMILES'] = MoNA_filter['InChI'].apply(inchi_to_smiles)

In [None]:
# Standardize compounds in MassBank, MoNA and GNPS datasets.
# standardize_mol modifies each frame in place and returns it; it does not drop rows.
# MassBank and MoNA were deduplicated on InChI in an earlier cell.
MassBank_std = standardize_mol(MassBank_filter)
MoNA_std = standardize_mol(MoNA_filter)
GNPS_std = standardize_mol(GNPS)

In [None]:
# Show full SMILES strings, then list the entries that still contain several
# '.'-separated fragments.
pd.options.display.max_colwidth = 1000
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
MassBank_std[MassBank_std['SMILES'].str.contains(r'\.')]['SMILES']

In [None]:
# Collapse SMILES whose '.'-separated fragments are all identical to a single fragment.
# NOTE(review): filter_smiles is defined several times in this notebook. The dropna below
# only removes mixtures when the variant returning None for differing fragments is in scope.
MassBank_std['SMILES'] = MassBank_std['SMILES'].apply(filter_smiles)
MassBank_std = MassBank_std.dropna(subset = 'SMILES')
# MassBank_std = MassBank_std.drop_duplicates(subset = 'SMILES')

In [None]:
# InChIKey14 = the part of the InChIKey before the first '-'.
# standardize_mol already creates this column, so this line only recomputes it.
MassBank_std['InChIKey14'] = MassBank_std.InChIKey.apply(lambda x: x.split('-')[0])

In [None]:
# Save all standardizations into csv files (tab-separated despite the .csv suffix, index not written)
MassBank_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/2024-12-20_MassBank_EI_chemicals_STD.csv', index = False, sep = '\t')
MoNA_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/2024-12-20_MoNA_EI_chemicals_STD.csv', index = False, sep = '\t')
GNPS_std.to_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/2024-12-20_GNPS_EI_chemicals_STD.csv', index = False, sep = '\t')

### Clean SusDat data from NORMAN

In [None]:
#NORMAN SusDat dataset (read with read_csv defaults, i.e. comma-separated)
susdat_full_df = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/susdat_2025-01-30-135733.csv')

GC-probability is based on a 0-1 scale, where >0.5 means that it's most likely GC-amenable, while <0.5 means it's most likely RPLC amenable. The RPLC-affinity is equal to 1-(gc-affinity probability).

In [None]:
# Keep only the SusDat columns used below and give them short names.
# The mapping order also fixes the column order of the result.
susdat_columns = {
    'Name': 'name',
    'Validation_Level': 'validation_level',
    'SMILES': 'SMILES_orginal',
    'StdInChI': 'InChI',
    'Prob. of GC': 'gc_probability',
    'alogp_ChemSpider': 'alogp',
    'xlogp_ChemSpider': 'xlogp',
}
susdat = susdat_full_df[list(susdat_columns)].rename(columns=susdat_columns)

In [None]:
susdat.dtypes

In [None]:
# Keep only rows that have both an original SMILES and a GC probability
susdat_nona = susdat.dropna(subset = ['SMILES_orginal', 'gc_probability'])

It was tested whether any compounds with a GC-amenability probability had an InChI available but no SMILES; none did, so this step was removed.

In [None]:
susdat_nona = susdat_nona.copy() #Makes a copy of the dataframe to not change the original dataframe
PandasTools.AddMoleculeColumnToFrame(susdat_nona, 'SMILES_orginal', 'ROMol') #Adds a ROMol column to the dataframe based on the SMILES_orginal column

susdat_nona = susdat_nona.dropna(subset = ['ROMol']) #Drop rows for which no molecule object is available
susdat_nona['SMILES'] = susdat_nona.SMILES_orginal #Working SMILES column; SMILES_orginal keeps the input value

susdat_nona.head()

In [None]:
# Boolean series: True where the SMILES has several '.'-separated fragments (salts/mixtures).
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
salts = susdat_nona.SMILES.str.contains(r'\.')

susdat_salts = susdat_nona[salts].copy() #Creates a new dataframe with salts
susdat_nona_nosalt = susdat_nona[~salts].copy() #Creates a new dataframe without salts

susdat_salts

In [None]:
#Remove salts from SMILES
def remove_ions(mol, ions):
    """Strip the fragments of ``mol`` that match the salt definition ``ions``.

    A new ``SaltRemover`` is built from ``ions`` on every call; the result of
    its ``StripMol`` is returned.
    """
    return SaltRemover.SaltRemover(defnData=ions).StripMol(mol)

# Fragment definitions to strip, grouped into stages that are applied one after another.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python. Two pairs were fused that way before (PF6-/triflate in the
# first list, 'CN(C)CCO'/'NC1CCCCC1' in parts2remove_7) and therefore never matched in
# their own stage. Repeated entries are harmless.
parts2remove = ['[F,Cl,Br,I]', '[Na,Mg,K,Ca,Li,Ba]', '[O,N]', 'O=S(=O)(O)O', 'O=[N+]([O-])O',
                'F[P-](F)(F)(F)(F)F', 'O=S(=O)([O-])C(F)(F)F', 'F[B-](F)(F)F',

                '[H+]', '[H-]',

                '[Co,Pd,Ni,Al,Sn,Zn,Cu,Hg,Ag,Mn,Fe,B,Cs,Mo,Ti,Rh,Ir,Pt,Ru,Au,Hf,Ta,Zr,Rb,Sb,Bi,Pb,Nb,V,Sr,In,Eu,Yb,W,Gd,Ce,Cd,Pr,Nd,Cr,La,Er,Ho,Dy,Y,Ge,Sm,Ga,Se]',

                'F[P-](F)(F)(F)(F)F', 'FB(F)F', 'F[As-](F)(F)(F)(F)F',

                'Cl[Zn-2](Cl)(Cl)Cl', 'Cl[Zn-](Cl)Cl',
                '[O-][Cl+3]([O-])([O-])[O-]', '[O-][Cl+3]([O-])([O-])O',

                'OO', 'O=[V]','O[Al+2]', 'O[Al+]O','O=[Si]([O-])[O-]','O=[As](O)(O)O', 'O[As](O)O', 'O=[Se](=O)(O)O', 'O=[Mo-2](=O)(=O)=O', 'OB(O)O',
                'O=P(O)(O)O', 'O=PO', 'O=P(O)(O)OP(=O)(O)O',
                'O=S=O', 'NS(=O)(=O)[O-]', 'O=S([O-])[O-]', 'O=S(=O)=O',

                'NO', 'O=N[O-]',
                'NN'
                ] # additional ions to remove

parts2remove_2 = [ '[O-][I+3]([O-])([O-])[O-]',

                'C', 'CCCC',
                'CN', '[C-]#N', 'N#CS', 'N#C[S-]', 'C=N', 'C[NH3+]', 'CN(C)C', 'NCCN', 'CNC','NC(=S)S',
                'CO', 'O=CO', 'C=O', 'C[O-]', '[C-]#[O+]','O=C(O)O', 'O=C(O)C(=O)O', 'CCO', 'CC(=O)O',
                'NC(=O)O', 'N#C[O-]', 'CC(C)(N)CO', 'C1COCCN1',
                'CI',

                'O=P([O-])([O-])OCO',
                'OC(O)C(Cl)(Cl)Cl',
                'O=C([O-])C(F)(F)F'
]
parts2remove_3 = ['OCCNCCO',
                  'CCNCC']

parts2remove_4 = ['NC(N)=O', 'CC(O)CN', 'CNCCO',
                  'C1CNCCN1', 'CCN', 'N=C(N)N',
                  'C=COC=C', 'CC(O)C(=O)O', 'O=C[O-]',
                  'COS(=O)(=O)[O-]', 'CS(=O)(=O)O', 'CCOS(=O)(=O)[O-]', 'Cc1ccc(S(=O)(=O)[O-])cc1',

                  'O=S(=O)([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)F'
                  ]
parts2remove_5 = ['NCCO',
                  'C1CCC(NC2CCCCC2)CC1', 'C1CCNCC1',
                  'O=C(O)C=CC(=O)O'
                  ]
parts2remove_6 = ['O=C1CC[C@@H](C(=O)O)N1',
                  'O=C(O)c1ccccc1', 'O=C(O)CC(O)(CC(=O)O)C(=O)O',
                  'Cc1ccc(S(=O)(=O)O)cc1'
                  ]
parts2remove_7 = ['OCCN(CCO)CCO', 'O=C(O)C(O)C(O)C(=O)O', 'CN(C)CCO',
                  'NC1CCCCC1' ,'CC[NH+](CC)CC', 'NCCCCCCN', 'CCN(CC)CC'
                  ]
parts2remove_8 = ['CN(C)CCO', '[NH3+]C1CCCCC1',
                  'NC1CCCCC1','Nc1nc(N)nc(N)n1', 'c1ccncc1',
                  'O=C(O)c1ccccc1O', 'Oc1ccccc1', 'O=C(O)CCC(=O)O',
                  'O=S(=O)([O-])C(F)(F)F'
                  ]
parts2remove_9 = ['NCCCC[C@H](N)C(=O)O',
                  'CCCCCCCCC=CCCCCCCCC(=O)O',
                  'CCCN','Nc1ccccc1'
                  ]

# Stages in the order in which they are applied
list_of_ion_lists = [parts2remove, parts2remove_2, parts2remove_3, parts2remove_4, parts2remove_5, parts2remove_6, parts2remove_7, parts2remove_8, parts2remove_9]

def filter_smiles(smiles):
    '''
    Collapse a SMILES string whose '.'-separated components are all identical.

    The input is converted with str() and split on '.'. If every component is the
    same string, that component is returned; otherwise the input is returned unchanged.
    '''
    first, *others = str(smiles).split('.')
    if any(component != first for component in others):
        return smiles       # components differ: keep the original value
    return first


# Collects the rows whose SMILES has become a single fragment
susdat_salts_cleaned = pd.DataFrame()

# Apply the ion lists stage by stage. After each stage, rows without a '.' in their
# SMILES are moved to susdat_salts_cleaned, so later stages only see rows that are
# still multi-fragment.
for ion_list in list_of_ion_lists:
    for ion in ion_list:
        susdat_salts['ROMol'] = susdat_salts.ROMol.apply(lambda x: remove_ions(x, ion))

    susdat_salts['SMILES'] = susdat_salts.ROMol.apply(Chem.MolToSmiles) #Update SMILES column with new SMILES
    susdat_salts['SMILES'] = susdat_salts.SMILES.apply(filter_smiles) #Collapse SMILES made of identical components
    # NOTE(review): ROMol is not rebuilt from the collapsed SMILES here, so for such rows
    # ROMol can still hold the repeated fragments.

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    # The mask is computed once and used for both halves of the split.
    still_salt = susdat_salts['SMILES'].str.contains(r'\.')
    susdat_salts_removed = susdat_salts[~still_salt].copy() #Rows that are single-fragment now
    susdat_salts = susdat_salts[still_salt].copy() #Rows that go on to the next stage

    susdat_salts_cleaned = pd.concat([susdat_salts_cleaned, susdat_salts_removed], ignore_index=True)

susdat_nona_nosalt = pd.concat([susdat_nona_nosalt, susdat_salts_cleaned], ignore_index=True) #Concatenate the two dataframes

In [None]:
# parts2remove_2 = [ '[O-][I+3]([O-])([O-])[O-]', 'O=S(=O)=O',
                  
#                   'C', 'CCCC',
#                   'CN', '[C-]#N', 'N#CS', 'N#C[S-]', 'C=N', 'C[NH3+]', 'CN(C)C', 'CCNCC', 'NCCN', 'CNC','NC(=S)S',
#                   'CO', 'O=CO', 'C=O', 'C[O-]', '[C-]#[O+]','O=C(O)O', 'O=C(O)C(=O)O', 'CCO', 'O=C[O-]',
#                   'NC(=O)O', 'N#C[O-]', 'NCCO', 'NC(N)=O',

#                   'O=P([O-])([O-])OCO',
#                   'OC(O)C(Cl)(Cl)Cl',
#                   'O=C([O-])C(F)(F)F',
#                   'O=S(=O)([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)F'
# ]

# for part in parts2remove:
#     susdat_salts['ROMol'] = susdat_salts.ROMol.apply(lambda x: remove_ions(x, part))

# susdat_salts['SMILES'] = susdat_salts.ROMol.apply(lambda x: Chem.MolToSmiles(x)) #Update SMILES column with new SMILES

In [None]:
susdat_nona_nosalt[susdat_nona_nosalt.SMILES=='']

In [None]:
susdat_salts[susdat_salts.SMILES=='']

In [None]:
susdat_salts

In [None]:
from itertools import zip_longest

# Work on a copy of the rows that are still multi-fragment
salts = susdat_salts.copy()

# One column per component position (0, 1, 2, ...); shorter entries are padded with None
salts['list_of_components'] = salts['SMILES'].apply(lambda smi: smi.split('.'))
salts_components = pd.DataFrame(zip_longest(*salts['list_of_components'])).T
for column in ('name', 'gc_probability'):
    salts_components[column] = salts[column].reset_index(drop=True)

# Rows sharing the same first / second / third component, sorted by that component
duplicates_first_component = salts_components[salts_components.duplicated(subset=0, keep=False)].sort_values(by=0)
duplicates_second_component = salts_components[salts_components.duplicated(subset=1, keep=False)].sort_values(by=1)
duplicates_third_component = salts_components[salts_components.duplicated(subset=2, keep=False)].sort_values(by=2)

In [None]:
duplicates_first_component

In [None]:
# Pickle the first-component duplicates table into the current working directory
with open('salts_first_component.pkl', 'wb') as f:
    pickle.dump(duplicates_first_component, f)

In [None]:
duplicates_second_component

In [None]:
duplicates_third_component

In [None]:
# repl = susdat_nona[susdat_nona['ROMol'].isna()==True].SMILES.str.replace('BH', 'B-')

# susdat_nona.loc[susdat_nona['ROMol'].isna()==True, 'SMILES'] = repl

In [None]:
# Rebuild ROMol from the current SMILES of the remaining multi-fragment rows
PandasTools.AddMoleculeColumnToFrame(susdat_salts, 'SMILES', 'ROMol')

# Neutralize each molecule where possible
uncharger = rdMolStandardize.Uncharger()
susdat_salts['ROMol'] = susdat_salts['ROMol'].apply(uncharger.uncharge)

def neutralize_atoms(mol):
    """Neutralize singly charged atoms of ``mol`` in place and return ``mol``.

    Atoms are selected with a SMARTS pattern matching (a) +1 atoms that have at
    least one implicit hydrogen and no negatively charged neighbour, and (b) -1
    atoms that have no positively charged neighbour.  For each match the formal
    charge is set to 0 and the explicit hydrogen count to ``total Hs - charge``.

    Best effort: if anything fails (for example ``mol`` is ``None``) the input
    is returned as it is at that point.
    """
    try:
        pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
        for match in mol.GetSubstructMatches(pattern):
            atom = mol.GetAtomWithIdx(match[0])
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    except Exception:
        # Deliberately tolerant, but no longer a bare except: Ctrl-C and
        # SystemExit now propagate instead of being swallowed.
        pass
    return mol

#susdat_nona['ROMol'] = susdat_nona.ROMol.apply(lambda x: neutralize_atoms(x))
# Strip stereochemistry from every molecule (in place), then regenerate the SMILES
for mol in susdat_salts['ROMol']:
    Chem.RemoveStereochemistry(mol)
susdat_salts['SMILES'] = susdat_salts['ROMol'].apply(Chem.MolToSmiles)

In [None]:
susdat_salts[susdat_salts.SMILES=='']

In [None]:
# #Split SMILES with multiple salts of interest and keep both salts in duplicated rows
# def keep_salts(df, list_of_compound_name):
    
#     df['split_SMILES'] = False #Create a column to check if the SMILES has been split
#     new_df = df #Create a new dataframe to store the updated dataframe

#     for compound_name in list_of_compound_name:
#         compound = df[df['name'] == compound_name].reset_index(drop=True) #Get the compound with the name
#         compound['split_SMILES'] = True #Update the split_SMILES column to True
    
#         split_smiles = compound['SMILES'][0].split('.')   #Split the salts in the SMILES column

#         compound = pd.DataFrame(np.repeat(compound.values, len(split_smiles), axis=0), columns=compound.columns) #Repeat the compound row for each salt
#         compound['SMILES'] = split_smiles #Update the SMILES column with the split SMILES

#         new_df = new_df[new_df['name'] != compound_name] #Create a new dataframe without the compound name
#         new_df = pd.concat([new_df, compound], ignore_index = True).reset_index(drop=True) #Concatenate the new dataframe with the compound dataframe

#     return new_df

# salts_to_keep = susdat_nona[susdat_nona.name.str.contains('(1:1)|(2:1)|(1:2)|(1/1)|(1:4)|copolymer')]['name']
# susdat_no_salt = keep_salts(susdat_nona, salts_to_keep)

In [None]:
# PandasTools.AddMoleculeColumnToFrame(susdat_no_salt, 'SMILES', 'ROMol')

# uncharger = rdMolStandardize.Uncharger()  # neutralize the molecule (if possible)
# susdat_no_salt['ROMol'] = susdat_no_salt.ROMol.apply(lambda x: uncharger.uncharge(x))

# def neutralize_atoms(mol):
#     try:
#         pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
#         at_matches = mol.GetSubstructMatches(pattern)
#         at_matches_list = [y[0] for y in at_matches]
#         if len(at_matches_list) > 0:
#             for at_idx in at_matches_list:
#                 atom = mol.GetAtomWithIdx(at_idx)
#                 chg = atom.GetFormalCharge()
#                 hcount = atom.GetTotalNumHs()
#                 atom.SetFormalCharge(0)
#                 atom.SetNumExplicitHs(hcount - chg)
#                 atom.UpdatePropertyCache()
#         return mol
#     except:
#         return mol

# susdat_no_salt['ROMol'] = susdat_no_salt.ROMol.apply(lambda x: neutralize_atoms(x))
# susdat_no_salt.ROMol.apply(lambda x: Chem.RemoveStereochemistry(x)) 
# susdat_no_salt['SMILES'] = susdat_no_salt.ROMol.apply(lambda x: Chem.MolToSmiles(x))

In [None]:
# Rows of susdat_no_salt whose SMILES still contains several '.'-separated fragments.
# NOTE(review): in the visible part of this notebook the only assignment to
# `susdat_no_salt` is inside a commented-out cell (keep_salts on susdat_nona), so this
# cell and the ones below fail with NameError on a fresh run. Confirm which frame is
# meant (possibly susdat_nona_nosalt) and define susdat_no_salt explicitly.
salts = susdat_no_salt[susdat_no_salt['SMILES'].str.contains(r'\.')]
salts

In [None]:
salts_components

In [None]:
# Neutralize each molecule where possible
uncharger = rdMolStandardize.Uncharger()
susdat_no_salt['ROMol'] = susdat_no_salt['ROMol'].apply(uncharger.uncharge)

def neutralize_atoms(mol):
    """Neutralize singly charged atoms of ``mol`` in place and return ``mol``.

    Atoms are selected with a SMARTS pattern matching (a) +1 atoms that have at
    least one implicit hydrogen and no negatively charged neighbour, and (b) -1
    atoms that have no positively charged neighbour.  For each match the formal
    charge is set to 0 and the explicit hydrogen count to ``total Hs - charge``.

    Best effort: if anything fails (for example ``mol`` is ``None``) the input
    is returned as it is at that point.
    """
    try:
        pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
        for match in mol.GetSubstructMatches(pattern):
            atom = mol.GetAtomWithIdx(match[0])
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    except Exception:
        # Deliberately tolerant, but no longer a bare except: Ctrl-C and
        # SystemExit now propagate instead of being swallowed.
        pass
    return mol

# Atom-wise neutralization, stereo removal (in place), then derived identifier columns
susdat_no_salt['ROMol'] = susdat_no_salt['ROMol'].apply(neutralize_atoms)
for mol in susdat_no_salt['ROMol']:
    Chem.RemoveStereochemistry(mol)
susdat_no_salt['SMILES'] = susdat_no_salt['ROMol'].apply(Chem.MolToSmiles)
susdat_no_salt['InChIKey'] = susdat_no_salt['ROMol'].apply(inchi.MolToInchiKey)
# InChIKey14 = the part of the InChIKey before the first '-'
susdat_no_salt['InChIKey14'] = susdat_no_salt['InChIKey'].apply(lambda key: key.split('-')[0])

In [None]:
# Drop rows whose SMILES is the empty string, then renumber the index
susdat_no_salt = susdat_no_salt[susdat_no_salt['SMILES']!=''].reset_index(drop=True)
# Flag (do not drop) every row whose InChIKey occurs more than once
susdat_no_salt['duplicate_InChIKey'] = susdat_no_salt.duplicated(subset=['InChIKey'], keep=False)

In [None]:
susdat_no_salt[susdat_no_salt['SMILES'].str.contains(r'\.')]

In [None]:
# Pickle the standardized SusDat table into the current working directory
with open('2025-02-12_susdat_std.pkl', 'wb') as f:
    pickle.dump(susdat_no_salt, f)

### Cleaning new KLARA ACES SMILES (resolved using CIRpy)

In [None]:
# Load the pickled table from the current working directory.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
with open('klara_kemikum_smiles.pkl', 'rb') as f:
    aces = pickle.load(f)

In [None]:
aces.head()

In [None]:
# Drop rows without a SMILES and renumber the index
aces_nona = aces.dropna(subset = ['SMILES']).reset_index(drop=True)

In [None]:
#Split SMILES with multiple salts of interest and keep both salts in duplicated rows
def keep_salts(df, list_of_compound_name:list):
    """Split multi-component SMILES of selected compounds into one row per component.

    For every name in ``list_of_compound_name``, the rows of ``df`` with that
    ``name`` are removed and re-added at the end of the result, once per
    '.'-separated SMILES component, with ``split_SMILES`` set to True.  All
    other rows keep their order and get ``split_SMILES`` False.  Names without
    a matching row are ignored.

    The earlier implementation took ``len()`` of the Series returned by
    ``.str.split('.')``, which is the number of matching rows, not of
    components.  Rows were therefore never duplicated and ``SMILES`` ended up
    holding a Python list instead of a string.

    Args:
        df: DataFrame with ``name`` and ``SMILES`` columns.  As before, a
            ``split_SMILES`` column (all False) is added to it in place.
        list_of_compound_name: names whose components should all be kept.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    df['split_SMILES'] = False  # marks rows produced by splitting a SMILES

    # Unique names in first-seen order, so a repeated name is not expanded twice
    names = list(dict.fromkeys(list_of_compound_name))

    pieces = [df[~df['name'].isin(names)]]
    for compound_name in names:
        compound = df[df['name'] == compound_name].copy()
        if compound.empty:
            continue
        compound['split_SMILES'] = True
        compound['SMILES'] = compound['SMILES'].str.split('.')
        # explode() repeats each row once per list element, so every matching
        # row is expanded by its own number of components
        pieces.append(compound.explode('SMILES'))

    return pd.concat(pieces, ignore_index=True)

#Names of the compounds for which every SMILES component is kept as its own row
list_of_salts_to_keep= ['Ivermectin', 'Fungamin', 'Bufencarb', '2,4-D and 2,4,5-T octylesters' ,'2,4-D n-Butyl ester mixed with 2,4,5-T n-', 'CMIT/MIT', 'sn-Glycerol 3-fosfat bis(cyklohexylammonium)']
# keep_salts also adds a split_SMILES column to aces_nona itself
aces_no_salt = keep_salts(aces_nona, list_of_salts_to_keep)

In [None]:
# Build the ROMol column from SMILES, then drop rows for which no molecule is available
PandasTools.AddMoleculeColumnToFrame(aces_no_salt, 'SMILES', 'ROMol')
aces_no_salt = aces_no_salt.dropna(subset = ['ROMol']) #lost 18 compounds in conversion from SMILES to ROMol

In [None]:
def contains_carbon(mol):
    """Tell whether ``mol`` has at least one carbon atom (atomic number 6).

    Returns ``True``/``False`` for a molecule, or ``np.nan`` when the atoms
    cannot be read (for example when ``mol`` is ``None``).
    """
    try:
        return any(atom.GetAtomicNum() == 6 for atom in mol.GetAtoms())
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed
        return np.nan

# Flag carbon-containing molecules and keep only those (NaN flags are removed too)
aces_no_salt['contains_carbon'] = aces_no_salt['ROMol'].apply(contains_carbon)
has_carbon = aces_no_salt['contains_carbon'] == True
aces_no_salt = aces_no_salt[has_carbon].reset_index(drop=True)

In [None]:
#Remove salts from SMILES

def remove_ions(mol, ions):
    """Return ``mol`` with the fragments matching the salt definition ``ions`` stripped.

    Each call constructs its own ``SaltRemover`` from ``ions`` and returns the
    result of ``StripMol``.
    """
    stripper = SaltRemover.SaltRemover(defnData=ions)
    stripped = stripper.StripMol(mol)
    return stripped

# Fragment definitions to strip from the ACES molecules.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which previously fused the hexafluorophosphate and triflate
# patterns into one pattern. Repeated entries are harmless.
parts2remove = ['[F,Cl,Br,I]', '[Na,Mg,K,Ca,Li,Ba]', 'CC(=O)O', '[O,N]', 'CS(=O)(=O)O', 'O=S(=O)(O)O', 'O=[N+]([O-])O', 'O=S(=O)(O)CCO',
                'F[P-](F)(F)(F)(F)F', 'O=S(=O)([O-])C(F)(F)F', 'F[B-](F)(F)F',
                '[H+]',
                '[H-]',
                '[Co,Pd,Ni,Al,Sn,Zn,Cu,Hg,Ag,Mn,Fe,B,Cs,Mo,Ti,Rh,Ir,Pt,Ru,Au,Hf,Ta,Zr,Rb,Sb,Bi,Pb,Nb,V,Sr,In,Eu,Yb]',
                'OO',
                'O=C([O-])C(=O)O',
                'O=C(O)[C@H](O)[C@@H](O)C(=O)O',
                r'O=C(O)/C=C\C(=O)O',  # raw string: '\C' is an invalid escape sequence otherwise
                'O=P(O)(O)O',
                '[C-]#[O+]',
                'O=C(O)O',
                'O=C(O)C(=O)O',
                'C',
                'NCC(O)CC[C@H](N)C(=O)O',
                'Cc1ccc(S(=O)(=O)[O-])cc1',
                'CN(C)C',
                'Cc1ccc(S(=O)(=O)O)cc1',
                'c1ccccc1',
                'CCCC[N+](CCCC)(CCCC)CCCC',
                'Cc1ccc(S(=O)(=O)O)cc1',
                'C[O-]',
                'O=S(=O)=O',
                'F[P-](F)(F)(F)(F)F',
                'O=[V]',
                'CC(C)[O-]',
                'FB(F)F',
                'O=C(O)CC(O)(CC(=O)O)C(=O)O',
                'CCNCC',
                'CNC',
                'COS(=O)(=O)[O-]'] # additional ions to remove



# Strip one definition at a time from every molecule
for part in parts2remove:
    aces_no_salt['ROMol'] = aces_no_salt.ROMol.apply(lambda x: remove_ions(x, part))

aces_no_salt['SMILES'] = aces_no_salt.ROMol.apply(lambda x: Chem.MolToSmiles(x)) #Update SMILES column with new SMILES

#Remove rows with compounds that are not analysable
aces_no_salt = aces_no_salt[(aces_no_salt['name']!= 'Dowex 50 W X 8, 200-400 mesh, H(+)-form.')]


In [None]:
def filter_smiles(smiles):
    """Collapse a SMILES string made of identical '.'-separated parts.

    The input is converted with ``str()`` and split on '.'.  If all parts are
    the same string, that part is returned; otherwise the input is returned
    unchanged.
    """
    parts = str(smiles).split('.')
    head = parts[0]
    # every part equals the first one exactly when its count matches the length
    all_same = parts.count(head) == len(parts)
    return head if all_same else smiles
    
aces_no_salt['SMILES'] = aces_no_salt['SMILES'].apply(lambda x: filter_smiles(x)) #Collapse SMILES whose components are all identical
# Rebuild ROMol so that it matches the (possibly collapsed) SMILES
PandasTools.AddMoleculeColumnToFrame(aces_no_salt, 'SMILES', 'ROMol')

In [None]:
# Rows whose SMILES still contains several '.'-separated fragments (shown for inspection)
salts = aces_no_salt[aces_no_salt['SMILES'].str.contains(r'\.')]
salts

In [None]:
# Neutralize each molecule where possible
uncharger = rdMolStandardize.Uncharger()
aces_no_salt['ROMol'] = aces_no_salt['ROMol'].apply(uncharger.uncharge)

def neutralize_atoms(mol):
    """Neutralize singly charged atoms of ``mol`` in place and return ``mol``.

    Atoms are selected with a SMARTS pattern: atoms with charge +1 that carry
    hydrogens and have no negatively charged neighbour, and atoms with charge
    -1 that have no positively charged neighbour. For each match the formal
    charge is set to 0 and the explicit hydrogen count is set to
    ``total Hs - charge`` (one H fewer for +1, one H more for -1).

    This is a best-effort step: if anything fails (for example ``mol`` is
    None or not an RDKit molecule) the input is returned as it is at that
    point instead of raising.
    """
    try:
        pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
        for match in mol.GetSubstructMatches(pattern):
            atom = mol.GetAtomWithIdx(match[0])  # the pattern is a single atom
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    except Exception:
        # Deliberately tolerant, but no longer swallows KeyboardInterrupt /
        # SystemExit the way a bare ``except:`` does.
        pass
    return mol

aces_no_salt['ROMol'] = aces_no_salt['ROMol'].apply(neutralize_atoms)

# The result of this call is not stored: the molecules are expected to be changed in place
aces_no_salt['ROMol'].apply(Chem.RemoveStereochemistry)

# Derive the identifiers from the standardized molecules
aces_no_salt['SMILES'] = aces_no_salt['ROMol'].apply(Chem.MolToSmiles)
aces_no_salt['InChIKey'] = aces_no_salt['ROMol'].apply(inchi.MolToInchiKey)
# Part of the InChIKey before its first '-'
aces_no_salt['InChIKey14'] = aces_no_salt['InChIKey'].apply(lambda key: key.split('-')[0])

In [None]:
# Drop rows with an empty SMILES, then flag every row whose InChIKey occurs more than once
aces_no_salt = aces_no_salt.loc[aces_no_salt['SMILES'].ne('')].reset_index(drop=True)
aces_no_salt['duplicate_InChIKey'] = aces_no_salt['InChIKey'].duplicated(keep=False)

In [None]:
# Save the standardized aces_no_salt frame as a pickle
with open('2025-02-11_klara_aces_std.pkl', 'wb') as pkl_file:
    pickle.dump(aces_no_salt, pkl_file)

### Cleaning KLARA Kemikum SMILES (resolved using CIRpy)

In [None]:
# Load the Kemikum table from its pickle and preview the first rows
with open('2025-03-05_kemikum_resolved_cas.pkl', 'rb') as pkl_file:
    kemikum = pickle.load(pkl_file)

kemikum.head()

In [None]:
# Drop rows without a SMILES. The column is passed as a list because a bare
# string for ``subset`` is only accepted by newer pandas versions.
kemikum_nona = kemikum.dropna(subset=['SMILES']).reset_index(drop=True)

In [None]:
# Build the ROMol column from the SMILES column
PandasTools.AddMoleculeColumnToFrame(kemikum_nona, 'SMILES', 'ROMol')
# Keep only rows that have a molecule. The explicit copy makes the later
# column assignments write to an independent frame instead of a slice of
# kemikum_nona (avoids pandas' SettingWithCopyWarning).
kemikum_nona_nona_ROMol = kemikum_nona.dropna(subset=['ROMol']).copy()

In [None]:
# Strip stereochemistry from every molecule (result not stored; molecules are expected to change in place)
kemikum_nona_nona_ROMol['ROMol'].apply(Chem.RemoveStereochemistry)
# Regenerate SMILES from the molecules
kemikum_nona_nona_ROMol['SMILES'] = kemikum_nona_nona_ROMol['ROMol'].apply(Chem.MolToSmiles)

In [None]:
#Remove any compounds which do not contain carbon

def contains_carbon(mol):
    """Return True if ``mol`` has at least one atom with atomic number 6.

    Returns False when no atom is carbon (including a molecule without
    atoms), and ``np.nan`` when the atoms cannot be read, e.g. when ``mol``
    is None.
    """
    try:
        return any(atom.GetAtomicNum() == 6 for atom in mol.GetAtoms())
    except Exception:
        # Best-effort: unreadable input is flagged as missing rather than
        # raising; KeyboardInterrupt/SystemExit are no longer swallowed.
        return np.nan

kemikum_nona_nona_ROMol['contains_carbon'] = kemikum_nona_nona_ROMol['ROMol'].apply(contains_carbon)
# Keep only rows flagged True; rows flagged False or np.nan are dropped
kemikum_nona_nona_ROMol = kemikum_nona_nona_ROMol.loc[
    kemikum_nona_nona_ROMol['contains_carbon'].eq(True)
].reset_index(drop=True)

In [None]:
#Remove salts from SMILES

def remove_ions(mol, ions):
    """Strip the fragments described by ``ions`` from ``mol``.

    A new RDKit SaltRemover is built from the ``ions`` definition on every
    call, and the result of its ``StripMol(mol)`` is returned.
    """
    return SaltRemover.SaltRemover(defnData=ions).StripMol(mol)

# Patterns of ions / small fragments to strip from the molecules.
# Fixes compared with the previous version of this list:
#   * a missing comma fused the hexafluorophosphate and triflate patterns into
#     one string, so the triflate pattern was never in the list;
#   * the pattern containing a backslash is now a raw string (same value,
#     no invalid-escape warning).
# A few patterns occur twice; they are kept as they were.
parts2remove = [
    '[F,Cl,Br,I]', '[Na,Mg,K,Ca,Li,Ba]', 'CC(=O)O', '[O,N]', 'CS(=O)(=O)O', 'O=S(=O)(O)O', 'O=[N+]([O-])O', 'O=S(=O)(O)CCO',
    'F[P-](F)(F)(F)(F)F', 'O=S(=O)([O-])C(F)(F)F', 'F[B-](F)(F)F',
    '[H+]',
    '[H-]',
    '[Co,Pd,Ni,Al,Sn,Zn,Cu,Hg,Ag,Mn,Fe,B,Cs,Mo,Ti,Rh,Ir,Pt,Ru,Au,Hf,Ta,Zr,Rb,Sb,Bi,Pb,Nb,V,Sr,In,Eu,Yb,La]',
    'OO',
    'O=C([O-])C(=O)O',
    'O=C(O)[C@H](O)[C@@H](O)C(=O)O',
    r'O=C(O)/C=C\C(=O)O',
    'O=P(O)(O)O',
    '[C-]#[O+]',
    'O=C(O)O',
    'O=C(O)C(=O)O',
    'C',
    'NCC(O)CC[C@H](N)C(=O)O',
    'Cc1ccc(S(=O)(=O)[O-])cc1',
    'CN(C)C',
    'Cc1ccc(S(=O)(=O)O)cc1',
    'c1ccccc1',
    'CCCC[N+](CCCC)(CCCC)CCCC',
    'Cc1ccc(S(=O)(=O)O)cc1',
    'C[O-]',
    'O=S(=O)=O',
    'F[P-](F)(F)(F)(F)F',
    'O=[V]',
    'CC(C)[O-]',
    'FB(F)F',
    'C=Cc1ccccc1C=C',
    'O=C([O-])c1ccccc1',
    'NCCN',
    'C=C',
    'N=C(N)N',
    'O=C(O)CCC(=O)O',
]  # additional ions to remove


# Strip every listed pattern from all molecules, one pattern per pass
for ion_pattern in parts2remove:
    kemikum_nona_nona_ROMol['ROMol'] = kemikum_nona_nona_ROMol['ROMol'].apply(remove_ions, args=(ion_pattern,))

# Update the SMILES column with the SMILES of the stripped molecules
kemikum_nona_nona_ROMol['SMILES'] = kemikum_nona_nona_ROMol['ROMol'].apply(Chem.MolToSmiles)

In [None]:
# Names of the entries to discard. The match below is exact, so names that
# occur both with and without a trailing space are listed in both spellings.
compounds_to_remove = [
    'Boron trifluoride methanol complex solution (13-',
    'Nafion perfluorinated membrane',
    'Bis(cyclopentadienyl)cobalt(III) hexafluorophosphate',
    'Poly(vinylalkohol-co-etylen)',
    "Merrifield's peptide resin (Sigma-Aldrich",
    'Poly(etylen-co-akrylsyra) 15 wt. % akrylsyra',
    'Poly(4-vinylpyridine), cross-linked, Reillex 425 ion-',
    'Amberlite IRA743 free base',
    'Poly(ethylene glycol)-block-poly(propylene glycol)-',
    'Araldite 506 epoxy resin (Sigma-Aldrich',
    'Dowex Monosphere 77 free base',
    'Poly(ethylene-alt-maleic anhydride) (Sigma-',
    'Amberlite IR120 Na+ form',
    "Merrifield's peptide resin (Sigma-Aldrich ",
    'Dowex 1X4 chloride form',
    'Poly(maleic anhydride-alt-1-octadecene)',
    'Araldite 506 epoxy resin (Sigma-Aldrich ',
    'Araldite 506 epoxy resin (Sigma-Aldrich <br>A3183)',
]

# Keep only the rows whose name is not in the list above
kemikum_nona_nona_ROMol = kemikum_nona_nona_ROMol.loc[
    ~kemikum_nona_nona_ROMol['name'].isin(compounds_to_remove)
].reset_index(drop=True)

In [None]:
def filter_salts(df, list_of_compounds_to_split):
    """Simplify multi-component SMILES and split selected compounds into rows.

    1. Every SMILES whose '.'-separated components are all identical is
       collapsed to a single component.
    2. Every compound whose ``name`` is in ``list_of_compounds_to_split`` is
       replaced by one row per SMILES component. These rows are appended to
       the end of the frame, in the order of the list, and flagged with
       ``split_SMILES = True``; all other rows get ``split_SMILES = False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``name`` and ``SMILES``. It is not
        modified; the function works on a copy.
    list_of_compounds_to_split : list
        Names of the compounds to split. Names absent from ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        New frame with the additional ``split_SMILES`` column. The index is
        renumbered when at least one compound was split. In the split rows all
        columns other than ``SMILES`` and ``split_SMILES`` are repeated
        unchanged, so values derived from the full SMILES must be recomputed
        by the caller.
    """

    def remove_duplicate_smiles_in_salt(smiles):
        """Return a single component if all '.'-separated components are equal."""
        components = str(smiles).split('.')
        if len(set(components)) == 1:
            return components[0]
        return smiles

    def split_salts(frame, list_of_compound_name):
        """Replace each listed compound by one row per SMILES component."""
        frame['split_SMILES'] = False  # flag: does the row come from a split?
        result = frame

        for compound_name in list_of_compound_name:
            is_compound = frame['name'] == compound_name
            if not is_compound.any():
                continue

            compound = frame[is_compound].copy()
            compound['split_SMILES'] = True
            # Split every matching row on its own SMILES; explode then yields
            # one row per component while keeping the other columns' dtypes.
            compound['SMILES'] = compound['SMILES'].map(lambda s: str(s).split('.'))
            compound = compound.explode('SMILES', ignore_index=True)

            # Swap the original row(s) for the per-component rows
            result = result[result['name'] != compound_name]
            result = pd.concat([result, compound], ignore_index=True)

        return result

    working = df.copy()  # never modify the caller's frame
    working['SMILES'] = working['SMILES'].apply(remove_duplicate_smiles_in_salt)
    return split_salts(working, list_of_compounds_to_split)

# Names of the compounds whose components should all be kept (one row per component)
list_of_salts_to_keep = [
    'Abamectin',
    'Ivermectin',
    'Emamectin-bensoat',
    'Amberlyst 15 hydrogen form',
    'Dowex M-31 hydrogen form',
    'Gentamicin sulfate salt',
    'Polyethylenimine, branched',
    'Guanidintiocyanat',
    'Hexadimetrinbromid ',
    'Amberlite IRA-67 free base',
    'Serotonin creatinine sulfate monohydrate',
    'Kinhydron',
    'N-metylanilintrifluoracetat',
    'N-Methylaniline trifluoroacetate',
    'Poly(ethylene glycol)-block-poly(propylene glycol)-<br>block-poly(ethylene glycol)',
    'Quinhydrone',
    'Hexadimethrine bromide',
]

# Simplify/split the salts, then rebuild the molecule column from the new SMILES
kemikum_no_salt = filter_salts(df=kemikum_nona_nona_ROMol, list_of_compounds_to_split=list_of_salts_to_keep)
PandasTools.AddMoleculeColumnToFrame(kemikum_no_salt, 'SMILES', 'ROMol')

In [None]:
# Neutralize each molecule (if possible) with RDKit's Uncharger
uncharger = rdMolStandardize.Uncharger()
kemikum_no_salt['ROMol'] = kemikum_no_salt['ROMol'].apply(uncharger.uncharge)

def neutralize_atoms(mol):
    """Neutralize singly charged atoms of ``mol`` in place and return ``mol``.

    Atoms are selected with a SMARTS pattern: atoms with charge +1 that carry
    hydrogens and have no negatively charged neighbour, and atoms with charge
    -1 that have no positively charged neighbour. For each match the formal
    charge is set to 0 and the explicit hydrogen count is set to
    ``total Hs - charge`` (one H fewer for +1, one H more for -1).

    This is a best-effort step: if anything fails (for example ``mol`` is
    None or not an RDKit molecule) the input is returned as it is at that
    point instead of raising.
    """
    try:
        pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
        for match in mol.GetSubstructMatches(pattern):
            atom = mol.GetAtomWithIdx(match[0])  # the pattern is a single atom
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    except Exception:
        # Deliberately tolerant, but no longer swallows KeyboardInterrupt /
        # SystemExit the way a bare ``except:`` does.
        pass
    return mol

kemikum_no_salt['ROMol'] = kemikum_no_salt['ROMol'].apply(neutralize_atoms)

# The result of this call is not stored: the molecules are expected to be changed in place
kemikum_no_salt['ROMol'].apply(Chem.RemoveStereochemistry)

# Derive the identifiers from the standardized molecules
kemikum_no_salt['SMILES'] = kemikum_no_salt['ROMol'].apply(Chem.MolToSmiles)
kemikum_no_salt['InChIKey'] = kemikum_no_salt['ROMol'].apply(inchi.MolToInchiKey)
# Part of the InChIKey before its first '-'
kemikum_no_salt['InChIKey14'] = kemikum_no_salt['InChIKey'].apply(lambda key: key.split('-')[0])

In [None]:
# Sanity check: rows whose SMILES still contains a '.' -- should not have any entries
kemikum_no_salt.loc[kemikum_no_salt['SMILES'].str.contains(r'\.')]

In [None]:
# Drop rows with an empty SMILES, then flag every row whose InChIKey occurs more than once
kemikum_no_salt = kemikum_no_salt.loc[kemikum_no_salt['SMILES'].ne('')].reset_index(drop=True)
kemikum_no_salt['duplicate_InChIKey'] = kemikum_no_salt['InChIKey'].duplicated(keep=False)

In [None]:
# Save the standardized kemikum_no_salt frame as a pickle
with open('2025-03-06_KLARA_Kemikum_UPDATED_std.pkl', 'wb') as pkl_file:
    pickle.dump(kemikum_no_salt, pkl_file)

### Update Isabelle's list to include two new compounds from KLARA Kemikum

In [None]:
# Load a standardized Kemikum table from its pickle (replaces kemikum_no_salt).
# NOTE(review): this file name differs from the
# '2025-03-06_KLARA_Kemikum_UPDATED_std.pkl' written earlier in this notebook --
# confirm that the older file is the intended input.
with open('2025-02-12_KLARA_Kemikum_std.pkl', 'rb') as pkl_file:
    kemikum_no_salt = pickle.load(pkl_file)

In [None]:
# Display isabelle_std (it is not defined in this part of the notebook; expected to exist from an earlier cell)
isabelle_std

In [None]:
# Pick the row(s) named '2,4-Dinitroaniline' and rename two columns
# (presumably to match the column names of isabelle_std -- verify)
rows_to_append_to_isabel_data = (
    kemikum_no_salt.loc[kemikum_no_salt['name'] == '2,4-Dinitroaniline']
    .rename(columns={'name': 'Compound', 'cas': 'CAS'})
)
print(rows_to_append_to_isabel_data)

In [None]:
# Append the selected rows below the existing table and renumber the index
isabelle_std_updated = pd.concat(
    (isabelle_std, rows_to_append_to_isabel_data),
    ignore_index=True,
)
isabelle_std_updated

In [None]:
# Write the updated table as a tab-separated file without the index column
isabelle_std_updated.to_csv(
    '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/Cleaned data/Isabelles_chemicals_STD_updated.csv',
    sep='\t',
    index=False,
)