## Here the mixes and all extra properties of the compounds of interest are determined

In [None]:
import pickle
import pandas as pd 
import numpy as np

from rdkit import Chem
from rdkit.Chem import PandasTools, Descriptors, rdMolDescriptors, Crippen, Fragments

In [None]:
# Load the previously pickled table of available active compounds (used as a DataFrame below).
# NOTE(review): pickle.load can execute arbitrary code from the file - only open pickles from a trusted source.
with open('2025-04-15_klara_available_actives.pkl', 'rb') as f:
    all_available_actives = pickle.load(f)

In [None]:
all_available_actives

## Updating of the available compounds, and addition of properties to help analysis
While the standards were being made, problems occurred with some compounds, so they never reached analysis.

These compounds are removed with a filter list containing every compound that had an issue during the standard-making process; the comment next to each name gives the reason.

In [None]:
# Compounds that could not be analysed; the reason is given in the comment beside each name.
compounds_to_remove = [
    'Toluylene diisocyanate (mixutre of isomeres) <br>(mass)',  # not appropriate for MS analysis, also prone to polymerization
    "N,N'-Dicyklohexylkarbodiimid",  # reacts with water, judged not appropriate to work with
    'Folpet',  # not found
    '1,2,5,6,9,10-Hexabromocyclododecane',  # not found
    '4-Phenoxyphenol',  # not found
    'Lindane',  # not found
    '4-(Methylamino)phenol hemisulfate salt',  # not found
    'p-Toluidin',  # too crystallized in its packaging, could not be transferred
    'Aminoguanidine bicarbonate',  # could not be dissolved in anything other than water
    '1,2,4-Triazole',  # not found
    '1,2,4-Triazole sodium derivative',  # not found
    'beta-Phenylcinnamaldehyde',  # not found
    'Triton X-100 (Sigma-Aldrich Sweden AB)',  # not suitable for GC analysis
    'Aniline (volume)',  # not found
]

# Keep every row whose name is not on the removal list and renumber the rows from 0.
all_available_actives_updated = (
    all_available_actives
    .loc[lambda frame: ~frame['name'].isin(compounds_to_remove)]
    .reset_index(drop=True)
)

In [None]:
def calc_molecular_formula_and_mol_weight(df):
    '''
    Add RDKit-derived descriptor columns to df and return it.

    Molecules are built from the 'SMILES' column and read back from the
    'ROMol' column. The following columns are then written, in this order:
         monoisotopic_molecular_weight
         molecular_formula
         logP
         amines     (sum of the fr_NH2, fr_NH1, fr_NH0 and fr_Ar_NH fragment counts)
         hydroxyls  (sum of the fr_Al_OH and fr_Ar_OH fragment counts)
         Chlorines  (number of Cl atoms)
         Bromines   (number of Br atoms)
         HBA        (Lipinski hydrogen bond acceptors)
         HBD        (Lipinski hydrogen bond donors)

    df is modified in place; the same object is returned.
    '''
    PandasTools.AddMoleculeColumnToFrame(df, smilesCol='SMILES')
    mols = df['ROMol']

    def fragment_total(*counters):
        # Element-wise sum of the per-molecule counts from each fragment counter.
        return sum(mols.apply(counter) for counter in counters)

    def atom_total(symbol):
        # Number of atoms with the given element symbol in each molecule.
        return mols.apply(
            lambda mol: sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == symbol)
        )

    df['monoisotopic_molecular_weight'] = mols.apply(rdMolDescriptors.CalcExactMolWt)
    df['molecular_formula'] = mols.apply(rdMolDescriptors.CalcMolFormula)
    df['logP'] = mols.apply(Crippen.MolLogP)

    df['amines'] = fragment_total(
        Fragments.fr_NH2,
        Fragments.fr_NH1,
        Fragments.fr_NH0,
        Fragments.fr_Ar_NH,
    )
    df['hydroxyls'] = fragment_total(Fragments.fr_Al_OH, Fragments.fr_Ar_OH)

    df['Chlorines'] = atom_total('Cl')
    df['Bromines'] = atom_total('Br')

    df['HBA'] = mols.apply(rdMolDescriptors.CalcNumLipinskiHBA)
    df['HBD'] = mols.apply(rdMolDescriptors.CalcNumLipinskiHBD)
    return df

# Add the RDKit descriptor columns to the filtered compound table.
all_available_actives_updated = calc_molecular_formula_and_mol_weight(all_available_actives_updated)

# Show the first rows to check the new columns.
all_available_actives_updated.head()

In [None]:
# Compounds whose molecular formula is shared with at least one other compound,
# ordered so that compounds with the same formula sit next to each other.
available_actives_w_same_mol_formula = (
    all_available_actives_updated
    .loc[lambda frame: frame.duplicated(subset='molecular_formula', keep=False)]
    .reset_index(drop=True)
    .sort_values(by='molecular_formula')
)

available_actives_w_same_mol_formula[['name', 'molecular_formula', 'section']]

In [None]:
import pubchempy as pcp

def get_pubchem_cid(df):
    '''
    Look up the PubChem CID of every compound in df by its InChIKey.

    Parameters:
        df: DataFrame with an 'InChIKey' column.

    Returns:
        The same df, with a 'pubchem_cid' column added (or overwritten) in place.
        The first PubChem hit is used; a compound whose lookup fails for any
        reason gets None, and the error is printed.
    '''
    pubchem_data = []
    for inchikey in df['InChIKey']:
        try:
            cid = pcp.get_compounds(inchikey, 'inchikey')[0].to_dict(properties=['cid'])['cid']
            # Double-quoted f-strings: reusing the outer quote inside the braces
            # is a SyntaxError before Python 3.12.
            print(f"Got data for compound: {inchikey} \n cid: {cid}")
        except Exception as e:
            # Best effort: one failed lookup (no hit, network error, ...) must
            # not stop the remaining compounds; the gap is recorded as None.
            print(f"Failed to get data for compound: {inchikey} \n error: {e}")
            cid = None
        pubchem_data.append(cid)

    df['pubchem_cid'] = pubchem_data
    return df

# First pass: look up a PubChem CID for every compound (failed lookups are stored as None).
all_available_actives_updated = get_pubchem_cid(all_available_actives_updated)

In [None]:
# Split the table into compounds without and with a PubChem CID.
# .copy() makes both subsets independent frames, so the column assignment done
# inside get_pubchem_cid does not write into a slice of
# all_available_actives_updated (SettingWithCopyWarning).
all_available_actives_updated_pugview_fail = all_available_actives_updated[all_available_actives_updated['pubchem_cid'].isna()].copy()
all_available_actives_updated_pugview_success = all_available_actives_updated[all_available_actives_updated['pubchem_cid'].notna()].copy()

# Second pass: retry the lookup for the compounds that failed the first time.
all_available_actives_updated_pugview_fail = get_pubchem_cid(all_available_actives_updated_pugview_fail)

In [None]:
# Recombine both subsets; ignore_index=True renumbers the rows, with the retried compounds placed last.
all_available_actives_updated = pd.concat([all_available_actives_updated_pugview_success, all_available_actives_updated_pugview_fail], ignore_index=True)


In [None]:
all_available_actives_updated

In [None]:
import numpy as np
import requests

#get spectral and experimental data from PubChem
def _iter_sections(sections, heading):
    '''
    Yield every entry of a PUG View section list whose 'TOCHeading' equals heading.
    A missing list (None) is treated as empty.
    '''
    for entry in sections or []:
        if entry.get('TOCHeading') == heading:
            yield entry


def _format_value(value):
    '''
    Render a PUG View 'Value' dict as text.

    Returns "<number> <unit>" (or just "<number>" without a unit) when a
    'Number' list is present, otherwise the first 'StringWithMarkup' string.
    Returns None when the dict holds neither.
    '''
    numbers = value.get('Number')
    if numbers:
        unit = value.get('Unit')
        if unit is None:
            return str(numbers[0])
        # 'Unit' is a plain string in PUG View records; indexing it with [0]
        # would keep only its first character. A list is still accepted.
        if not isinstance(unit, str):
            unit = unit[0]
        return f"{numbers[0]} {unit}"

    markup = value.get('StringWithMarkup')
    if markup:
        return str(markup[0].get('String'))
    return None


def _reference_status(entry):
    '''
    Classify an 'Information' entry by its 'ExtendedReference'.

    Returns None when the entry has no 'ExtendedReference', True when the
    first extended reference has a key containing 'Matched', False otherwise.
    '''
    if 'ExtendedReference' not in entry:
        return None
    references = entry.get('ExtendedReference') or [{}]
    return any('Matched' in key for key in references[0])


def _get_spectral_data(section):
    '''
    Report whether a record has 'GC-MS' / 'LC-MS' sections under
    'Spectral Information' > 'Mass Spectrometry'. Returns (gcms, lcms).
    '''
    gcms = False
    lcms = False
    for spectral in _iter_sections(section, 'Spectral Information'):
        for mass_spec in _iter_sections(spectral.get('Section'), 'Mass Spectrometry'):
            for entry in mass_spec.get('Section') or []:
                heading = entry.get('TOCHeading')
                if heading == 'GC-MS':
                    gcms = True
                elif heading == 'LC-MS':
                    lcms = True
    return gcms, lcms


def _get_experimental_data(section):
    '''
    Collect boiling point, vapor pressure and solubility strings from
    'Chemical and Physical Properties' > 'Experimental Properties'.

    Returns (bp_matched, bp_unmatched, vp, sol), each a list of strings.
    '''
    bp_matched = []
    bp_unmatched = []
    vp = []
    sol = []
    for props in _iter_sections(section, 'Chemical and Physical Properties'):
        for experimental in _iter_sections(props.get('Section'), 'Experimental Properties'):
            for prop in experimental.get('Section') or []:
                heading = prop.get('TOCHeading')
                if heading not in ('Boiling Point', 'Vapor Pressure', 'Solubility'):
                    continue
                for entry in prop.get('Information') or []:
                    text = _format_value(entry.get('Value') or {})
                    if text is None:
                        # Nothing readable in this entry; skip it instead of
                        # discarding the whole compound.
                        continue
                    matched = _reference_status(entry)
                    if heading == 'Solubility':
                        # Solubility is kept regardless of its references.
                        sol.append(text)
                    elif heading == 'Vapor Pressure':
                        # Only values with a matched extended reference are kept.
                        if matched:
                            vp.append(text)
                    elif matched is None:
                        bp_unmatched.append(text)
                    elif matched:
                        bp_matched.append(text)
                    # NOTE(review): boiling points that have an extended
                    # reference without a 'Matched' key end up in neither
                    # list, as before - confirm this is intended.
    return bp_matched, bp_unmatched, vp, sol


def get_pubchem_data(cid, *, timeout=30):
    '''
    Get spectral and experimental information for one compound from PubChem PUG View.

    Parameters:
        cid: PubChem compound identifier.
        timeout: seconds to wait for PubChem before giving up.

    Returns a 6-tuple (gcms, lcms, bp_matched, bp_unmatched, vp, sol):
        gcms, lcms: True when a GC-MS / LC-MS section is listed.
        bp_matched: boiling points whose first extended reference has a 'Matched' key.
        bp_unmatched: boiling points without an extended reference.
        vp: vapor pressures whose first extended reference has a 'Matched' key.
        sol: all solubility values.
    On any failure the error is printed and six NaN are returned, so callers
    can always unpack six values.
    '''
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        section = response.json().get('Record').get('Section')

        # Check if the JSON contains spectral information
        gcms, lcms = _get_spectral_data(section)
        # Check if the JSON contains experimental data
        bp_matched, bp_unmatched, vp, sol = _get_experimental_data(section)
    except Exception as e:
        # Best effort per compound: report the problem and hand back
        # placeholders with the same width as a successful result.
        print(f"Error fetching data for CID {cid}: {e}")
        return (np.nan,) * 6

    print(f"Information fetched for CID {cid}.")
    return gcms, lcms, bp_matched, bp_unmatched, vp, sol

# Get the PubChem data for all available actives
# Get the PubChem data for all available actives and attach it as six extra columns.
pubchem_columns = ['gcms_spectra_available', 'lcms_spectra_available', 'boiling_point_peer_reviewed', 'boiling_point', 'vapor_pressure', 'solubility']
missing_info = (np.nan,) * len(pubchem_columns)
n_compounds = len(all_available_actives_updated)

# Collect one result row per compound and build the frame once at the end;
# concatenating inside the loop copies the growing table on every pass.
pubchem_rows = []
for position, cid in enumerate(all_available_actives_updated['pubchem_cid'], start=1):
    if pd.isna(cid):
        # No CID was found for this compound; int() on a missing value would
        # abort the whole loop, so the PubChem columns are left empty instead.
        print(f'No PubChem CID for compound {position} of {n_compounds}; PubChem columns left empty.')
        pubchem_rows.append(missing_info)
        continue

    info = tuple(get_pubchem_data(int(cid)))
    # A result of unexpected width cannot be mapped onto the columns; treat it as missing.
    pubchem_rows.append(info if len(info) == len(pubchem_columns) else missing_info)

    print(f'Information fetched for compound {position} of {n_compounds}.')

# dtype=object keeps the list-valued cells (boiling points etc.) untouched.
df_pubchem_info = pd.DataFrame(pubchem_rows, columns=pubchem_columns,
                               index=all_available_actives_updated.index, dtype=object)

available_actives_information = pd.concat([all_available_actives_updated, df_pubchem_info], axis=1)


#all_available_actives_updated[['gcms_spectra_available', 'lcms_spectra_available', 'boiling_point_peer_reviewed', 'boiling_point', 'vapor_pressure', 'solubility']] = all_available_actives_updated['pubchem_cid'].apply(get_pubchem_data).apply(pd.Series)

In [None]:
available_actives_information

## Making of the mixes
For each mix, a list of the compound names available on klara is made. These lists are then used to remove the already assigned compounds from the 'all_available_actives' df while building a separate df for each mix, for easy access to the information.

In [None]:
# One list of compound names per mix. separate_mixes() selects rows with
# isin() on the 'name' column, so every entry has to equal the stored name exactly.
# NOTE(review): 'Ftaldialdehyd ' in mix1_list ends with a space - confirm it matches the stored name.
mix1_list = ['4-Chlorophenyl isocyanate', '2-Chloroacetophenone', 'alpha-Tetralone (volume)', 'cis-Stilbene (mass)', 'Triphenylborane', 'Indene (mass)', 'Ftaldialdehyd ', 'p-Chloranil']

mix2_list = ['N-Phenyl-o-phenylenediamine', '3-(Dimethylamino)-phenol', 'N,N-Dimethyl-p-phenylenediamine', '2-Nitrophenylacetonitrile', '1,3-Phenylenediamine', 'Benzhydrazide', '2,4,6-Trichlorophenol', 'N,N-Dimethyl-p-toluidine (mass)',
             '2,3-Diaminotoluene', '1-Naphthol', 'Thiourea', 'Myristyltrimethylammonium bromide', 'Hexadecyltrimetylammoniumbromid', 'N,N-Diethyl-1,4-phenylenediammonium sulfate']

mix3_list = ['Tetramethylthiuram disulfide', 'Parathion-methyl', '5-Nitroacenaphthene', '2-Nitrofluorene', '6-Nitroquinoline', '1-Nitronaphthalene', 'Quinoline Yellow', 
             'N-Cyclohexylbenzothiazole-2-sulphenamide', 'N-tert-Butyl-2-benzothiazolesulfenamide', '4-Chloro-m-phenylenediamine']

mix4_list = ['9,10-Dihydrobenzo[a]pyrene-7(8H)-one', '8-Nitroquinoline', '1,2:3,4-Dibenzanthracene','3-Aminofluoranthene',  '1-Methylpyrene',
             '9-Anthracenemethanol', 'Anthrone', '2-Amino-4-methylphenol']

mix5_list = ['2-Methylanthraquinone', 'p-Anisidine (Sigma-Aldrich 800458)', 'N,N-Dimethyl-4-nitrosoaniline', '1-(2-Chlorophenyl)-1-(4-chlorophenyl)-2,2-<br>dichloroethane',
             "4,4'-Dihydroxybiphenyl" ]

mix6_list = ['4-Methyl-1,2-phenylendiamine', '3-Diethylaminophenol','2-Methoxy-5-methylaniline', '2,6-Diaminopyridine','N-Phenylhydroxylamine','3-Pyridinecarbonitrile',
             'm-Toluidine (volume) (Sigma-Aldrich 808314)','9-Anthracenecarbonitrile','2,4,5-Trichlorophenol','1-Fluoro-2,4-dinitrobenzene (volume)','2-Nitro-1-phenylpropene',
             '2,4-Diamino-6-fenyl-S-triazin', 'Perthan','2-Aminobenzothiazole']

mix7_list = ['4-Octylphenol', '(3-Chloropropyl)triethoxysilane', "2,2':5',2''-Terthiophene", '2-Acetylfluorene', '2,3,6-Triklorfenol', '2,4-Bis(a,a-dimetylbensyl)fenol', '2,5-Diklorfenol', '4-Butylphenol (massa)',
             'Allyl 2,4,6-tribromophenyl ether', 'Bis(4-bromophenyl) ether', 'Tetrasul', 'Cedrol', 'trans-Chlordane']

mix8_list = ['1,3,4-Thiadiazole-2,5-dithiol', 'Phenylhydrazine (volume)', 'Diaminomaleonitrile', "4,4'-DDD",'2-Ethylanthraquinone','Bromophos-ethyl','Cresyl diphenyl phosphate','Heptaklor','Heptaklorepoxid',
             'Metoxiklor','alpha-Chlordane']

mix9_list = ['4-Benzylphenol','4-Methyl-2,6-dinitrophenol','4-Nitro-1,2-phynelenediamine','4-Ethoxyaniline (mass)','3,4-Dimethylaniline','2-tert-Butyl-4-ethylphenol (mass)','2,3,5-Trimethylphenol',
             '2-tert-Butyl-5-methylphenol (volume)','m-Terphenyl']

mix10_list = ['4-Chloro-3-methylphenol','Triphenylethylene','3,5-Dichlorophenol','2,3,4-Trichlorophenol','Iso E Super (massa)','Tris(dimethylphenyl)-phosphate',"o,p'-DDT"]

mix11_list = ['2,4-Dimethylaniline (mass)','Phenyl isothiocyanate (mass)']

# Order matters: separate_mixes() numbers the mixes by their position in this list, starting at 1.
mixes_list = [mix1_list, mix2_list, mix3_list, mix4_list, mix5_list, mix6_list, mix7_list, mix8_list, mix9_list, mix10_list, mix11_list]

In [None]:
def separate_mixes(df, mix_list):
    '''
    Split the compounds of df into mixes.

    mix_list is a sequence of name lists, one per mix. For the i-th list
    (counting from 1) the rows whose 'name' is in that list are taken out of
    the remaining compounds, tagged with 'mixnr' = i and sorted by ascending
    'monoisotopic_molecular_weight'.

    Returns (remaining, mixes_dict): the rows that were not assigned to any
    mix, and a dictionary mapping 'mix1', 'mix2', ... to the df of each mix.
    df itself is not modified.
    '''
    remaining = df.copy()
    mixes_dict = {}

    for nr, names in enumerate(mix_list, start=1):
        in_mix = remaining['name'].isin(names)

        selected = remaining[in_mix].reset_index(drop=True)
        remaining = remaining[~in_mix].reset_index(drop=True)

        selected['mixnr'] = nr
        mixes_dict[f'mix{nr}'] = selected.sort_values(
            by='monoisotopic_molecular_weight', ascending=True
        )

    return remaining, mixes_dict

# Split the annotated compound table into the eleven mixes; compounds in no mix stay in all_available_actives_wo_mix.
all_available_actives_wo_mix, active_mixes_dict = separate_mixes(available_actives_information, mixes_list)
active_mixes_dict.keys() #Check that all mixes are included

In [None]:
# Export the compound table as a tab-separated file.
# NOTE(review): this writes all_available_actives_updated, i.e. without the PubChem columns that were added to available_actives_information - confirm this is the intended table.
all_available_actives_updated.to_csv('2025-03-26_endocrine_tox_active_chemicals.tsv', sep='\t', index=False)

### Making dataframe with all compounds which were added to mixes

In [None]:
# Stack the per-mix tables (mix1, mix2, ...) into one table with a single concat.
# Growing the frame inside a loop copies it on every pass and starts from an
# empty frame; an empty dict still yields an empty DataFrame as before.
df_actives = pd.concat(list(active_mixes_dict.values()), axis=0) if active_mixes_dict else pd.DataFrame()

In [None]:
# Renumber the rows from 0 after stacking the mixes, then display the table.
df_actives = df_actives.reset_index(drop=True)
df_actives

In [None]:
# Store the table of all compounds that were placed in a mix.
with open('2025-04-25_experimental_analysis_compounds.pkl', 'wb') as f:
    pickle.dump(df_actives, f)

In [None]:
# Reduced view of the mixed compounds: only the listed columns are kept.
df_actives_refurnitured = df_actives[['name', 'monoisotopic_molecular_weight', 'mixnr', 'nr.ahr', 'sr.mmp', 'SMILES', 'ROMol', 'logP', 'boiling_point_peer_reviewed', 'boiling_point', 'vapor_pressure']]
df_actives_refurnitured.head()

In [None]:
# Export name, mass, mix number and logP of the mixed compounds to Excel.
df_actives_refurnitured[['name', 'monoisotopic_molecular_weight', 'mixnr', 'logP']].to_excel('2025-05-07_Ellinors_mixes.xlsx')

# Get the structures for the compounds that ionized and those that did not

In [None]:
import pandas as pd
import pickle

import rdkit
from rdkit import Chem
from rdkit.Chem import PandasTools

In [None]:
# Load the MS results for the active mixes.
# NOTE(review): absolute path into one user's OneDrive folder - this cell only runs on that machine.
results = pd.read_excel('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Experimental/Experimental_work/2025-04-25_active_mixes_ms.xlsx')

In [None]:
results

In [None]:
# Keep only the columns used in the analysis below.
results = results[['name', 'monoisotopic_molecular_weight', 'logP' ,'Found with UNIFI', 'Comment UNIFI', '[M+H]+', '[M]+', 'SMILES', 'nr.ahr', 'sr.mmp']]
# Drop the last two rows (presumably non-compound rows at the end of the sheet - confirm).
# .copy() gives an independent frame, so the in-place sort and the column
# assignments that follow do not act on a slice of results (SettingWithCopyWarning).
results_cleaned = results.iloc[0:-2, :].copy()

In [None]:
# Order the compounds by ascending monoisotopic mass. Reassigning the sorted
# frame avoids an in-place sort on what may be a slice of results.
results_cleaned = results_cleaned.sort_values(by='monoisotopic_molecular_weight', ascending=True)

In [None]:
# Strip the bookkeeping suffixes and '<br>' markers from the compound names
# (literal, non-regex replacements applied in the listed order).
results_cleaned.name = (
    results_cleaned.name
    .str.replace(' (mass)', '', regex=False)
    .str.replace(' (massa)', '', regex=False)
    .str.replace(' (volume)', '', regex=False)
    .str.replace(' (Sigma-Aldrich 800458)', '', regex=False)
    .str.replace(' (Sigma-Aldrich 808314)', '', regex=False)
    .str.replace('<br>', '', regex=False)
)

In [None]:
# Build RDKit molecules from the 'SMILES' column into a 'ROMol' column (used for drawing below).
PandasTools.AddMoleculeColumnToFrame(results_cleaned,'SMILES','ROMol')

In [None]:
# Compounds flagged 0 in 'Found with UNIFI'.
results_not_ionized = results_cleaned[results_cleaned['Found with UNIFI'] == 0]

In [None]:
results_not_ionized

In [None]:
# Split the compounds by which ion was flagged: only [M]+, only [M+H]+, or both.
results_m = results_cleaned[results_cleaned['[M]+'].eq(1) & results_cleaned['[M+H]+'].eq(0)]
results_mh = results_cleaned[results_cleaned['[M+H]+'].eq(1) & results_cleaned['[M]+'].eq(0)]
results_both = results_cleaned[results_cleaned['[M]+'].eq(1) & results_cleaned['[M+H]+'].eq(1)]

In [None]:

results_m

In [None]:
results_mh

In [None]:
results_both

In [None]:
from rdkit.Chem import Draw
def draw_molecules(df, mol_per_row=3, sub_img_size=(200, 200)):
    '''
    Render the molecules of df as one grid image.

    The molecules are taken from the 'ROMol' column and labelled with the
    'name' column. mol_per_row is the number of molecules per row and
    sub_img_size the size of each individual image.
    Returns the result of Draw.MolsToGridImage called with returnPNG=False.
    '''
    molecules = df.ROMol.tolist()
    labels = df.name.tolist()
    return Draw.MolsToGridImage(
        molecules,
        molsPerRow=mol_per_row,
        subImgSize=sub_img_size,
        legends=labels,
        returnPNG=False,
    )

In [None]:
%pip install opencv-python

In [None]:
from PIL import Image

In [None]:
# Draw the compounds flagged only as [M]+ and save the grid.
# NOTE(review): this grid is written as .pdf while the other grids are written as .png;
# quality/optimize/progressive/transparency are handed to the image's save() - confirm they apply to this format.
img_m = draw_molecules(results_m, mol_per_row=4, sub_img_size=(600, 300))

img_m.save('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/Molecules/2025-05-24_ionized_M.pdf',
            quality=95, optimize=True, progressive=True,
            transparency=True, dpi=(300, 300))

img_m

In [None]:
# Draw the compounds flagged only as [M+H]+ and save the grid as PNG.
img_mh = draw_molecules(results_mh, mol_per_row=4, sub_img_size=(600, 300))

img_mh

img_mh.save('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/Molecules/2025-05-24_ionized_MH.png',
            quality=95, optimize=True, progressive=True,
            transparency=True, dpi=(300, 300))
img_mh

In [None]:
# Draw the compounds flagged as both [M]+ and [M+H]+ and save the grid as PNG.
img_both = draw_molecules(results_both, mol_per_row=4, sub_img_size=(600, 300))

img_both.save('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/Molecules/2025-05-24_ionized_both.png',
            quality=95, optimize=True, progressive=True,
            transparency=True, dpi=(300, 300))

img_both

In [None]:
# Draw the compounds flagged 0 in 'Found with UNIFI' and save the grid as PNG.
img_no = draw_molecules(results_not_ionized, mol_per_row=4, sub_img_size=(600, 300))

img_no.save('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/Molecules/2025-05-24_not_ionized.png',
            quality=95, optimize=True, progressive=True,
            transparency=True, dpi=(300, 300))

img_no

In [None]:
results_not_ionized.monoisotopic_molecular_weight.describe()

In [None]:
def get_metrics(df):
    '''
    Summarise the mass and logP distribution of the compounds in df.

    Returns a dictionary with the keys 'monoisotopic_molecular_weight' and
    'logP', each holding the describe() summary of the column of that name.
    '''
    return {
        column: df[column].describe()
        for column in ('monoisotopic_molecular_weight', 'logP')
    }

In [None]:
get_metrics(results_not_ionized)

In [None]:
get_metrics(results_m)

In [None]:
get_metrics(results_mh)

In [None]:
get_metrics(results_both)