# UNIFI library for processing of MS-spectra
Two new libraries for halogenated and non-halogenated compounds will be made to make sure that all peaks are processed correctly and more efficiently for the processing of the spectra. 

In [2]:
import pandas as pd
import numpy as np

import pickle

In [3]:
#Import the data
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Code/Experimental_work/2025-04-25_experimental_analysis_compounds.pkl', 'rb') as f:
    actives = pickle.load(f)

In [4]:
actives

Unnamed: 0,name,cas,section,nr.ahr,sr.mmp,amount,unit,building,floor,room,...,HBA,HBD,pubchem_cid,gcms_spectra_available,lcms_spectra_available,boiling_point_peer_reviewed,boiling_point,vapor_pressure,solubility,mixnr
0,Indene (mass),95-13-6,Group Kálmán J Szabó,1.0,,250.0,g,Arrheniuslaboratoriet Hus A,Plan 5,A516 i övrigt,...,0,0,7219.0,True,False,[181.6 °C 760 MM HG],"[359 °F at 760 mmHg (NIOSH, 2024), 359 °F, 359...",[1.1 mm Hg at 25 °C],"[Insoluble (NIOSH, 2024), INSOL IN WATER, MISC...",1
1,alpha-Tetralone (volume),529-34-0,Group Joseph Samec,1.0,0.0,100.0,ml,Arrheniuslaboratoriet Hus A,Plan 6,A659,...,1,0,10724.0,True,False,[255-257 °C @ 760 MM HG],[],[0.02 MM HG @ 20 °C],[INSOL IN WATER],1
2,4-Chlorophenyl isocyanate,104-12-1,Group Kálmán J Szabó,,1.0,100.0,g,Arrheniuslaboratoriet Hus A,Plan 5,A516 i övrigt,...,2,0,7693.0,True,False,"[204 °C, BP: 116 °C at 45 mm Hg]",[],"[VP: 95.4 mm Hg at 20 °C, 19.4 mm Hg at 25 °C ...",[Soluble in organic solvents],1
3,2-Chloroacetophenone,532-27-4,Group Kálmán J Szabó,1.0,0.0,100.0,g,Arrheniuslaboratoriet Hus A,Plan 5,A516 i övrigt,...,1,0,10757.0,True,False,"[441 to 442 °F at 760 mmHg (NTP, 1992), 247 °C]","[244-245 °C, 472 °F, 472 °F]","[0.0054 mmHg at 68 °F (NTP, 1992), Vapor press...","[less than 1 mg/mL at 66 °F (NTP, 1992), Pract...",1
4,cis-Stilbene (mass),645-49-8,Group Joseph Samec,1.0,0.0,5.0,g,Arrheniuslaboratoriet Hus A,Plan 6,A652 i övrigt,...,0,0,11502.0,True,False,[],[],[],[],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Triphenylethylene,58-72-0,Group Pher Andersson,,1.0,5.0,g,Arrheniuslaboratoriet Hus A,Plan 6,A626 i övrigt,...,0,0,6025.0,True,False,[],[],[],[],10
95,"o,p'-DDT",789-02-6,"ACESo, Contaminant Chemistry Unit",0.0,1.0,1,g,Arrheniuslaboratoriet Hus A,Plan 2,A259,...,0,0,13089.0,True,False,[],[],[],[],10
96,Tris(dimethylphenyl)-phosphate,25155-23-1,"ACESo, Contaminant Chemistry Unit",1.0,,0.1,ml,Arrheniuslaboratoriet Hus A,Plan 2,A259,...,4,0,33133.0,False,False,[290 °C at 10 mm Hg],[],[],[Sol in acetic acid],10
97,"2,4-Dimethylaniline (mass)",95-68-1,Group Pher Andersson,1.0,0.0,100.0,g,Arrheniuslaboratoriet Hus A,Plan 6,A626 i övrigt,...,1,2,7250.0,True,True,"[417 °F at 760 mmHg (NTP, 1992), 214 °C @ 760 ...",[214 °C],[1 mmHg at 126.7 °F ; 5 mmHg at 175.6 °F; 760 ...,"[1 to 5 mg/mL at 63 °F (NTP, 1992), Sol in alc...",11


In [5]:
unifi_library_combined = actives[['name', 'molecular_formula', 'monoisotopic_molecular_weight', 'ROMol']].copy()

In [6]:
#Making all of the necessary columns for the unifi library
unifi_library_combined = unifi_library_combined.rename(columns={'name':'Item Name',
                                                                'molecular_formula':'Formula',
                                                                'monoisotopic_molecular_weight':'F1'})

unifi_library_combined['Comment'] = unifi_library_combined['Item Name']
unifi_library_combined['Description'] = np.nan #not needed
unifi_library_combined['Adduct'] = np.nan #will be added in the software
unifi_library_combined['Fragmentation'] = 'CID'
unifi_library_combined['F2'] = np.nan #not needed
unifi_library_combined['F3'] = np.nan #not needed
unifi_library_combined['RT'] = np.nan #not needed
unifi_library_combined['Structure'] = np.nan #updated in next step


In [7]:
#Export the mol objects for .mol files and add the file name to the dataframe

import rdkit
from rdkit import Chem
from rdkit.Chem import rdDepictor

def export_smiles(mol, name):
    """
    Function to export the RDKit molecule object to a .mol file and return the file name.
    """

    #clean name for file name
    def clean_name(name):
        '''
        Cleans name by removing unwamted formatting and characters for use as a file name
        '''
        name = name.replace(' (mass)', '').replace(' (massa)', '').replace(' (volume)', '').replace(' (Sigma-Aldrich 808314)', '').replace(' (Sigma-Aldrich 800458)', '').replace( "'", '').replace('<br>', '')
        name = name.replace(' ', '_')
        name = name.replace(',', '-').replace(':','-').replace('[', '-').replace(']','-')
        name = name.lower()

        return name

    name = clean_name(name)

    #Add hydrogens to the molecule and compute 2D coordinates
    mol = Chem.AddHs(mol)
    Chem.rdDepictor.Compute2DCoords(mol)

    #Export the molecule to a .mol file
    Chem.MolToMolFile(mol, f'{name}.mol')
    
    print(f'Exported {name}')

    return f'{name}.mol'

In [8]:
#Iterate over the dataframe and export the molecules to .mol files and add the file names to the dataframe

for index in unifi_library_combined.index:
    #Get the molecule from the RDKit object
    mol = unifi_library_combined.loc[index, 'ROMol']

    #Export the molecule to a .mol file
    mol_file = export_smiles(mol, unifi_library_combined.loc[index, 'Item Name'])

    #Add the file name to the dataframe
    unifi_library_combined.loc[index, 'Structure'] = mol_file

    print(f'Added {mol_file} to the dataframe \n {index+1} of {len(unifi_library_combined)}')

Exported indene
Added indene.mol to the dataframe 
 1 of 99
Exported alpha-tetralone
Added alpha-tetralone.mol to the dataframe 
 2 of 99
Exported 4-chlorophenyl_isocyanate
Added 4-chlorophenyl_isocyanate.mol to the dataframe 
 3 of 99
Exported 2-chloroacetophenone
Added 2-chloroacetophenone.mol to the dataframe 
 4 of 99
Exported cis-stilbene
Added cis-stilbene.mol to the dataframe 
 5 of 99
Exported triphenylborane
Added triphenylborane.mol to the dataframe 
 6 of 99
Exported p-chloranil
Added p-chloranil.mol to the dataframe 
 7 of 99
Exported thiourea
Added thiourea.mol to the dataframe 
 8 of 99
Exported 1-3-phenylenediamine
Added 1-3-phenylenediamine.mol to the dataframe 
 9 of 99
Exported 2-3-diaminotoluene
Added 2-3-diaminotoluene.mol to the dataframe 
 10 of 99
Exported n-n-dimethyl-p-toluidine
Added n-n-dimethyl-p-toluidine.mol to the dataframe 
 11 of 99
Exported benzhydrazide
Added benzhydrazide.mol to the dataframe 
 12 of 99
Exported n-n-dimethyl-p-phenylenediamine
Added 

  unifi_library_combined.loc[index, 'Structure'] = mol_file


In [9]:
#Reorder the columns to match the Unifi library format
unifi_library_combined_reformated = unifi_library_combined[['Item Name', 'Formula', 'Comment', 'Description', 'Adduct', 'Fragmentation', 'F1', 'F2', 'F3', 'RT', 'Structure']]

unifi_library_combined_reformated.head()

Unnamed: 0,Item Name,Formula,Comment,Description,Adduct,Fragmentation,F1,F2,F3,RT,Structure
0,Indene (mass),C9H8,Indene (mass),,,CID,116.0626,,,,indene.mol
1,alpha-Tetralone (volume),C10H10O,alpha-Tetralone (volume),,,CID,146.073165,,,,alpha-tetralone.mol
2,4-Chlorophenyl isocyanate,C7H4ClNO,4-Chlorophenyl isocyanate,,,CID,152.998141,,,,4-chlorophenyl_isocyanate.mol
3,2-Chloroacetophenone,C8H7ClO,2-Chloroacetophenone,,,CID,154.018543,,,,2-chloroacetophenone.mol
4,cis-Stilbene (mass),C14H12,cis-Stilbene (mass),,,CID,180.0939,,,,cis-stilbene.mol


In [10]:
#Get the names of the actives with halogens and non-halogens
actives_halogenated_names = actives[(actives.Chlorines > 2)|(actives.Bromines > 1)]['name'].tolist() #25 halogenated compounds
actives_nonhalogens_names = actives[(actives.Chlorines == 0)&(actives.Bromines == 0)]['name'].tolist() #74 non-halogenated compounds

#Create a library with the halogenated and non-halogenated compounds
halogenated_library = unifi_library_combined_reformated[unifi_library_combined_reformated['Item Name'].isin(actives_halogenated_names)].copy() #25 halogenated compounds
nonhalogenated_library = unifi_library_combined_reformated[unifi_library_combined_reformated['Item Name'].isin(actives_nonhalogens_names)].copy() #74 non-halogenated compounds

In [12]:
#Export to excel files
halogenated_library.to_excel('2025-04-29_UNIFI_library_above_2Cl_1Br.xlsx', index=False)
nonhalogenated_library.to_excel('2025-04-29_UNIFI_library_below_3Cl_2Br.xlsx', index=False)