# Automated Multiple Reaction Monitoring (MRM)-profiling and Ozone Electrospray Ionizaton (OzESI)-MRM Informatics Platform for High-throughput Lipidomics


In this jupyter notebook you will automate the data analysis of the lipidome. This is a challenging problem to perform manually due to the diverse nature of lipids and the many potential isomers. In this notebook you will analyze mzML files containing data from lipid MRMs, with ozone off and ozone on. The goal is to identify possible double-bond locations in a lipid, in this case a TAG (triacylglycerols).

In [1]:
from IPython.display import Image

![title](Figures/agilent_lcms.png)

The examples shown here were run on an Agilent 6495C Triple Quadrupole LC/MS (example shown above) that has been connected to an ozone line (not shown in picture) for ozoneolysis of lipids.

![title](Figures/TAG_example.png)
Here is an example of a TAG. Notice how many possibilities there are for locations of one double-bond there could be and how convoluted the analysis can become! This image is obtained from LipidMaps.org

Import all necessary libraries

In [4]:
#Import all the necessary libraries
import pymzml
import csv
import os
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import re
import plotly.express as px
from collections import defaultdict



MAKE CLASSES FOR EACH LIPID

In [35]:
#Function to read in MRM database
#Option to remove STDs from database##Not finished need option to use another database with no qualitative ACs
def read_mrm_list(filename,remove_std = True):
    mrm_list_new = pd.read_excel(filename, sheet_name=None)
    mrm_list_new = pd.concat(mrm_list_new, ignore_index=True)
    mrm_list_offical = mrm_list_new[['Compound Name', 'Parent Ion', 'Product Ion', 'Class']]
    # Add underscore to middle of columns names
    mrm_list_offical.columns = mrm_list_offical.columns.str.replace(' ', '_')
    # Round Parent Ion and Product Ion to 1 decimal place
    mrm_list_offical['Parent_Ion'] = np.round(mrm_list_offical['Parent_Ion'],1)
    mrm_list_offical['Product_Ion'] = np.round(mrm_list_offical['Product_Ion'],1)
    # Create transition column by combining Parent Ion and Product Ion with arrow between numbers
    mrm_list_offical['Transition'] = mrm_list_offical['Parent_Ion'].astype(str) + ' -> ' + mrm_list_offical['Product_Ion'].astype(str)
    # Change column compound name to lipid
    mrm_list_offical = mrm_list_offical.rename(columns={'Compound_Name': 'Lipid'})
    # Make a column called Class match lipid column to lipid types
    if remove_std == True:
        lipid_class = mrm_list_offical['Class'].unique()
        lipid_class_to_keep = ['PS','PG','CE','PC', 'DAG', 'PE', 'TAG', 'FA', 'Cer', 'CAR', 'PI','SM']
        mrm_list_offical = mrm_list_offical[mrm_list_offical['Class'].isin(lipid_class_to_keep)]
    return mrm_list_offical

mrm_database = read_mrm_list('lipid_database/Lipid_Database.xlsx')
mrm_database.tail()
##

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Parent_Ion'] = np.round(mrm_list_offical['Parent_Ion'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Product_Ion'] = np.round(mrm_list_offical['Product_Ion'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Transition'] = mrm_list_offical['

Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Class,Transition
3251,FA(38:6),551.5,551.5,FA,551.5 -> 551.5
3252,FA(38:5),553.5,553.5,FA,553.5 -> 553.5
3253,FA(38:4),555.5,555.5,FA,555.5 -> 555.5
3254,FA(40:6),579.5,579.5,FA,579.5 -> 579.5
3255,FA(42:6),607.5,607.5,FA,607.5 -> 607.5


In [36]:
list_of_lipid_classes = mrm_database['Class'].unique()
print(list_of_lipid_classes)

['PC' 'PE' 'SM' 'Cer' 'CAR' 'TAG' 'DAG' 'PS' 'PI' 'PG' 'CE' 'FA']


Load mzML file and convert to pandas dataframe and csv file. |
Columns = Q1, Q3, Intensity, Transition, Lipid, Class  |
Parsed data is also stored as csv file in data_csv

In [37]:

# data_folder = os.listdir('./data_mzml/liver_LD/sample/') #Path to the mzml files
# data_folder.sort()
# path_to_mzml_files = './data_mzml/liver_LD/sample/'
# #data_dict = {} #Empty dictionary to store all the data
# df = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Intensity','Transition','Class','Sample_ID'])
# #Create a similar for loop, except store all data in a single pandas dataframe
# #df_all = pd.DataFrame(columns=['Q1','Q3','Intensity','Transition','Lipid','Class']) #Create empty pandas dataframe to store the data



def mzml_parser(file_name='./data_mzml/liver_LD/sample/'):
    df = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Intensity','Transition','Class','Sample_ID'])
    data_folder = os.listdir(file_name) #Path to the mzml files
    data_folder.sort()
    path_to_mzml_files = file_name
    ##My edit
    for file in data_folder:
            if file.endswith('.mzML'):
                    print(file)
                    run = pymzml.run.Reader(path_to_mzml_files+file, skip_chromatogram=False) #Load the mzml file into the run object
                    print('Spectrum # = ',run.get_spectrum_count())
                    print('Chromatogram # =',run.get_chromatogram_count())


                    df_all = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Intensity','Transition','Class','Sample_ID']) #Create empty pandas dataframe to store the data

                    #create pandas dataframe to store the data with the columns Parent Ion, Product Ion, Intensity, Transition Lipid and Class
                    #df_sample = pd.DataFrame(columns=['Parent_Ion','Product_Ion','Intensity','Transition','Lipid','Class']) #Create empty pandas dataframe to store the data
                    #df_sample = pd.DataFrame(columns=['Q1','Q3','Intensity','Transition','Lipid','Class']) #Create empty pandas dataframe to store the data
                    q1_mz = 0 #Create empty variables to store the Q1 and Q3 m/z values
                    q3_mz = 0
                    count = 0 #Create a counter to keep track of the number of transitions
                    for spectrum in run:

                            ###
                            # if isinstance(spectrum,pymzml.spec.Chromatogram):
                            #         for time, intensity in spectrum.peaks():
                            #                 print(time, intensity)
                            #                 OzESI_time[time] = intensity
                            #         # OzESI_time.append(time_list)
                            ###

                            for element in spectrum.ID.split(' '):
                                    # print('element',element)
                                    intensity_store = np.array([])
                                    if 'Q1' in element:
                                            #print('Q1',element)
                                            q1 = element.split('=')
                                            #print('q1',q1[1])
                                            q1_mz= np.round((float(q1[1])),1)
                                            # print('q1',q1)

                                    if 'Q3' in element:
                                            #print('Q3',element)
                                            q3 = element.split('=')
                                            #print('q3',q3[1])
                                            q3_mz=np.round(float(q3[1]),1)
                                            # print('q3',q3)
                                            # df_sample.loc[count,'Q1'] = q1_mz
                                            # df_sample.loc[count,'Q3'] = q3_mz

                                            for mz,intensity in spectrum.peaks(): #Get the m/z and intensity values from the spectrum
                                                    intensity_store = np.append(intensity_store,intensity) #Store the intensity values in an array



                                    if 'Q3' in element:
                                            # print(intensity_sum)
                                            intensity_sum = np.sum(intensity_store) #Sum the intensity values
                                            df_all.loc[count,'Parent_Ion'] = q1_mz #Store the Q1 and Q3 m/z values in the pandas dataframe
                                            df_all.loc[count,'Product_Ion'] = q3_mz
                                            #round the Q1 and Q3 m/z values to 1 decimal places
                                            df_all.loc[count,'Parent_Ion'] = np.round(df_all.loc[count,'Parent_Ion'],1)
                                            df_all.loc[count,'Product_Ion'] = np.round(df_all.loc[count,'Product_Ion'],1)
                                            df_all.loc[count,'Intensity'] = intensity_sum #Store the intensity values in the pandas dataframe
                                            df_all.loc[count,'Transition'] = str(q1_mz)+ ' -> '+ str(q3_mz) #Store the transition values in the pandas dataframe
                                            #add file name to Sample_ID column without the mzmL extension
                                            df_all.loc[count,'Sample_ID'] = file[:-5]
                                            count+=1

            #append df_all to df_all2
            df = df.append(df_all, ignore_index=True)
    return df
df = mzml_parser(file_name='./data_mzml/liver_LD/sample/')
df.tail(5) 
len(df)

AC_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 87


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


AC_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 87
AC_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 87


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


AC_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 87
AC_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 87


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


AC_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 87
CER_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)


CER_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)


CER_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)


CER_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)


CER_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)


CER_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 167


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


CER_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
CE_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 38


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


CE_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 38
CE_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 38


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


CE_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 38
CE_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 38


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


CE_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 38
CE_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15


  df = df.append(df_all, ignore_index=True)


DG-160_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)


DG-160_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 109
DG-160_FAD131-5xFAD-M2liver_033123_r006.mzML


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


Spectrum # =  None
Chromatogram # = 109
DG-160_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-160_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 109
DG-160_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 109
DG-160_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-161_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 98
DG-161_FAD131-5xFAD-M1liver_033123_r009.mzML


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


Spectrum # =  None
Chromatogram # = 98
DG-161_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 98


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-161_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 98
DG-161_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 98
DG-161_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 98


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-161_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
DG-180_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)


DG-180_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)


DG-180_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-180_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 109
DG-180_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 109
DG-180_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 109


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-180_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
DG-181_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 98


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-181_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 98
DG-181_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 98
DG-181_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 98


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-181_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 98
DG-181_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 98
DG-181_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-182_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 88
DG-182_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 88


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-182_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 88


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-182_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 88
DG-182_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 88


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


DG-182_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 88
DG-182_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15


  df = df.append(df_all, ignore_index=True)


FFA_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)


FFA_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)


FFA_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)


FFA_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)


FFA_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)


FFA_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 182


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


FFA_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
PC_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)


PC_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)


PC_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)


PC_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)


PC_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)


PC_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 183


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


PC_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
PE_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)


PE_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)


PE_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)


PE_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)


PE_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)


PE_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 150


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


PE_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
PG_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)


PG_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)


PG_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)


PG_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)


PG_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)


PG_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 145


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


PG_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
PI_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)


PI_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)


PI_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)


PI_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)


PI_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)


PI_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 287


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


PI_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
PS_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


PS_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


PS_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


PS_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


PS_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


PS_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


PS_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG140_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG140_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG140_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG140_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG140_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG140_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG140_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG160_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG160_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG160_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG160_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG160_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG160_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG160_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG161_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG161_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG161_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG161_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG161_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG161_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG161_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG180_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG180_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG180_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG180_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG180_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)


TAG180_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 185


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG180_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG181_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG181_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG181_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG181_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG181_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)


TAG181_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 173


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG181_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG182_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)


TAG182_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)


TAG182_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)


TAG182_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)


TAG182_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)


TAG182_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 163


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG182_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG183_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)


TAG183_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)


TAG183_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)


TAG183_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)


TAG183_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)


TAG183_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 153


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG183_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG204_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


TAG204_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


TAG204_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


TAG204_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


TAG204_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)


TAG204_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 143


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG204_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG225_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)


TAG225_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)


TAG225_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)


TAG225_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)


TAG225_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)


TAG225_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 131


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


TAG225_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15
TAG226_DOD73-5xFAD-M2liver_033123_r008.mzML
Spectrum # =  None
Chromatogram # = 120


  df = df.append(df_all, ignore_index=True)


TAG226_FAD131-5xFAD-M1liver_033123_r009.mzML
Spectrum # =  None
Chromatogram # = 120


  df = df.append(df_all, ignore_index=True)


TAG226_FAD131-5xFAD-M2liver_033123_r006.mzML
Spectrum # =  None
Chromatogram # = 120


  df = df.append(df_all, ignore_index=True)


TAG226_FAD131-5xFAD-M4liver_033123_r007.mzML
Spectrum # =  None
Chromatogram # = 120


  df = df.append(df_all, ignore_index=True)


TAG226_FAD173-5xFAD-M1liver_033123_r005.mzML
Spectrum # =  None
Chromatogram # = 120


  df = df.append(df_all, ignore_index=True)


TAG226_blank_033123_r004.mzML
Spectrum # =  None
Chromatogram # = 120
TAG226_equisplash_033123_r001.mzML
Spectrum # =  None
Chromatogram # = 15


  df = df.append(df_all, ignore_index=True)
  df = df.append(df_all, ignore_index=True)


20814

Load MRM transitions from csv file to pandas dataframe. This list will be used to identify the possible lipids in our sample.

In [29]:
list1 = df['Sample_ID'].unique()
list1.sort()
list2 = list(df['Sample_ID'])
for i in list1:
    print(list2.count(i),i)

84 AC_DOD73-5xFAD-M2liver_033123_r008
84 AC_FAD131-5xFAD-M1liver_033123_r009
84 AC_FAD131-5xFAD-M2liver_033123_r006
84 AC_FAD131-5xFAD-M4liver_033123_r007
84 AC_FAD173-5xFAD-M1liver_033123_r005
84 AC_blank_033123_r004
164 CER_DOD73-5xFAD-M2liver_033123_r008
164 CER_FAD131-5xFAD-M1liver_033123_r009
164 CER_FAD131-5xFAD-M2liver_033123_r006
164 CER_FAD131-5xFAD-M4liver_033123_r007
164 CER_FAD173-5xFAD-M1liver_033123_r005
164 CER_blank_033123_r004
12 CER_equisplash_033123_r001
35 CE_DOD73-5xFAD-M2liver_033123_r008
35 CE_FAD131-5xFAD-M1liver_033123_r009
35 CE_FAD131-5xFAD-M2liver_033123_r006
35 CE_FAD131-5xFAD-M4liver_033123_r007
35 CE_FAD173-5xFAD-M1liver_033123_r005
35 CE_blank_033123_r004
12 CE_equisplash_033123_r001
106 DG-160_DOD73-5xFAD-M2liver_033123_r008
106 DG-160_FAD131-5xFAD-M1liver_033123_r009
106 DG-160_FAD131-5xFAD-M2liver_033123_r006
106 DG-160_FAD131-5xFAD-M4liver_033123_r007
106 DG-160_FAD173-5xFAD-M1liver_033123_r005
106 DG-160_blank_033123_r004
12 DG-160_equisplash_033123

In [38]:
# Function to create an ion dictionary from an MRM database DataFrame
def create_ion_dict(mrm_database):
    ion_dict = defaultdict(list)
    # Iterate through the rows of the MRM database DataFrame
    for index, row in mrm_database.iterrows():
        # Add a tuple with Lipid and Class to the ion dictionary using Parent_Ion and Product_Ion as the key
        ion_dict[(row['Parent_Ion'], row['Product_Ion'])].append((row['Lipid'], row['Class']))
    return ion_dict

# Function to check if the absolute difference between two values is within a given tolerance
def within_tolerance(a, b, tolerance=0.1):
    return abs(a - b) <= tolerance

# Function to match the ions in a DataFrame row with the ions in an ion dictionary
def match_ions(row, ion_dict, tolerance=0.1):
    ions = (row['Parent_Ion'], row['Product_Ion'])
    matched_lipids = []
    matched_classes = []

    # Iterate through the ion dictionary
    for key, value in ion_dict.items():
        # Check if both the Parent_Ion and Product_Ion values are within the specified tolerance
        if within_tolerance(ions[0], key[0], tolerance) and within_tolerance(ions[1], key[1], tolerance):
            # If within tolerance, extend the matched_lipids and matched_classes lists with the corresponding values
            matched_lipids.extend([match[0] for match in value])
            matched_classes.extend([match[1] for match in value])

    # If any matches were found, update the Lipid and Class columns in the row
    if matched_lipids and matched_classes:
        row['Lipid'] = ' | '.join(matched_lipids)
        row['Class'] = ' | '.join(matched_classes)

    return row

ion_dict = create_ion_dict(mrm_database)
# Assuming you have the df DataFrame to apply the match_ions function
df_matched = df.apply(lambda row: match_ions(row, ion_dict=ion_dict, tolerance=0.3), axis=1)
len(df_matched)

df_matched = df_matched.dropna()

print(len(df))
print(len(df_matched))
df_matched.head(None)
print(len(df))
print(len(df_matched))
df_matched.head(None)

20814
20405
20814
20405


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID
0,CAR_QUAL,162.2,60.1,3.450258e+04,162.2 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
1,CAR,162.2,85.1,4.369762e+04,162.2 -> 85.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
2,CAR(2:0)_QUAL,204.1,60.1,1.375392e+05,204.1 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
3,CAR(2:0),204.1,85.1,3.777689e+06,204.1 -> 85.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
4,CAR(3:1)_QUAL,216.1,60.1,6.410654e+04,216.1 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
...,...,...,...,...,...,...,...
20798,"[TG(66:9),TG(65:2)]_FA22:6",1059.0,713.7,1.457024e+04,1059.0 -> 713.7,TAG,TAG226_blank_033123_r004
20799,"[TG(66:8),TG(65:1)]_FA22:6",1061.0,715.7,1.295954e+04,1061.0 -> 715.7,TAG,TAG226_blank_033123_r004
20800,[TG(66:7)]_FA22:6,1062.9,717.6,1.432416e+04,1062.9 -> 717.6,TAG,TAG226_blank_033123_r004
20801,[TG(66:6)]_FA22:6,1065.0,719.7,1.238056e+04,1065.0 -> 719.7,TAG,TAG226_blank_033123_r004


In [16]:
df_matched.head(None)

Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID
0,CAR_QUAL,162.2,60.1,3.508602e+04,162.2 -> 60.1,CAR,AC_FAD131-5xFAD-M4liver_033123_r007
1,CAR,162.2,85.1,4.492750e+04,162.2 -> 85.1,CAR,AC_FAD131-5xFAD-M4liver_033123_r007
2,CAR(2:0)_QUAL,204.1,60.1,1.404356e+05,204.1 -> 60.1,CAR,AC_FAD131-5xFAD-M4liver_033123_r007
3,CAR(2:0),204.1,85.1,3.860603e+06,204.1 -> 85.1,CAR,AC_FAD131-5xFAD-M4liver_033123_r007
4,CAR(3:1)_QUAL,216.1,60.1,6.504970e+04,216.1 -> 60.1,CAR,AC_FAD131-5xFAD-M4liver_033123_r007
...,...,...,...,...,...,...,...
45632,PI(44:2),992.7,715.7,4.943260e+03,992.7 -> 715.7,PI,PI_FAD173-5xFAD-M1liver_033123_r005
45633,PI(44:1),994.7,717.7,5.440140e+03,994.7 -> 717.7,PI,PI_FAD173-5xFAD-M1liver_033123_r005
45634,PI(44:1),994.7,717.7,6.339521e+03,994.7 -> 717.7,PI,PI_FAD173-5xFAD-M1liver_033123_r005
45635,PI(44:0),996.7,719.7,5.170680e+03,996.7 -> 719.7,PI,PI_FAD173-5xFAD-M1liver_033123_r005


In [40]:
df_matched.to_csv("New_DF_3_tolerance.csv")
print(len(df))
print(len(df_matched))
df.head(None)

20814


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID
0,CAR_QUAL,162.2,60.1,34502.582428,162.2 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
1,CAR,162.2,85.1,43697.622925,162.2 -> 85.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
2,CAR(2:0)_QUAL,204.1,60.1,137539.249512,204.1 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
3,CAR(2:0),204.1,85.1,3777688.960938,204.1 -> 85.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
4,CAR(3:1)_QUAL,216.1,60.1,64106.544678,216.1 -> 60.1,CAR,AC_DOD73-5xFAD-M2liver_033123_r008
...,...,...,...,...,...,...,...
20809,,753.5,184.1,520997856.301666,753.5 -> 184.1,,TAG226_equisplash_033123_r001
20810,,755.5,570.4,142358266.970703,755.5 -> 570.4,,TAG226_equisplash_033123_r001
20811,,759.5,570.4,144748851.990967,759.5 -> 570.4,,TAG226_equisplash_033123_r001
20812,,829.8,570.4,363229447.51355,829.8 -> 570.4,,TAG226_equisplash_033123_r001


In [21]:
########
def save_dataframe(df, folder_name, file_name, max_attempts=5):
    folder_path = f'data_results/data/data_matching/{folder_name}'
    os.makedirs(folder_path, exist_ok=True)

    for i in range(max_attempts):
        file_path = f'{folder_path}/{file_name}_{i}.xlsx'
        if not os.path.isfile(file_path):
            df.to_excel(file_path, index=False)
            print(f"Saved DataFrame to {file_path}")
            break
    else:
        print(f"Failed to save DataFrame after {max_attempts} attempts.")
        return None

# Example usage:
folder_name = 'TEST_TEST'
file_name = 'TEST'
df_matching = pd.DataFrame() # Replace with your DataFrame

save_dataframe(df_matching, folder_name, file_name)


Saved DataFrame to data_results/data/data_matching/TEST_TEST/TEST_0.xlsx


In [22]:
#import visualization libraries
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
#Plotting functions

def plot_transition_vs_intensity(df):
    fig = px.bar(df, x="Transition", y="Intensity", color="Lipid", hover_data=['Lipid', 'Class'])
    fig.show()

def plot_class_vs_intensity_bar(df):
    fig = px.bar(df, x="Class", y="Intensity", color="Class", hover_data=['Lipid', 'Class'])
    fig.show()

def plot_class_vs_intensity_pie(df):
    fig = px.pie(df, values='Intensity', names='Class', title='Lipid Class')
    fig.show()

def plot_intensity_heatmap(df):
    fig = go.Figure(data=go.Heatmap(
        z=df['Intensity'],
        x=df['Lipid'],
        y=df['Class'],
        colorscale='Viridis'))
    fig.show()

# Example usage:
# Assuming you have the df_matching DataFrame
plot_transition_vs_intensity(df_matched)
plot_class_vs_intensity_bar(df_matched)
plot_class_vs_intensity_pie(df_matched)
plot_intensity_heatmap(df_matched)
